In [1]:
import pandas as pd
import json
gdelt_news = pd.read_csv('csvs/gdelt_news_updated.csv', sep='\t')

In [2]:
gdelt_news.fillna('', inplace=True)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
tfidf = TfidfVectorizer(stop_words='english')

In [5]:
dtm = tfidf.fit_transform(gdelt_news['News'])

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
LDA = LatentDirichletAllocation(n_components=10,random_state=101)

In [8]:
LDA.fit(dtm)

LatentDirichletAllocation(random_state=101)

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {
  'n_components': [5, 10, 15, 20, 25],
  'learning_decay': [.5, .7]
}

model = LatentDirichletAllocation()

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(dtm)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -2896.018811810439


In [9]:
topic_lst = []
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')
    topic_lst.append([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPIC #0
['semiconductor', 'kompas', 'banking', 'trend', 'perindustri', 'tiongkok', 'cibatu', 'kunjungi', 'garut', 'rights', 'jl', 'auto', '38', 'banjar', 'desa']


THE TOP 15 WORDS FOR TOPIC #1
['car', 'toshiba', '780g', 'dihasilkan', 'und', 'der', 'se', 'ps5', 'peranti', 'power', 'console', 'na', 'snapdragon', 'stocks', 'die']


THE TOP 15 WORDS FOR TOPIC #2
['pemprosesan', '1200', 'mengesahkan', 'dimensity', 'مبيعات', 'cu', 'producătorii', 'pentru', 'cipuri', 'mediatek', 'nord', 'din', 'în', 'şi', 'oneplus']


THE TOP 15 WORDS FOR TOPIC #3
['moody', 'companies', 'cnbc', 'like', 'demand', 'manufacturing', 'uy', 'chip', 'said', 'production', 'new', 'semiconductor', 'shortage', 'supply', 'chips']


THE TOP 15 WORDS FOR TOPIC #4
['kelangkaan', 'dari', 'pada', 'semikonduktor', 'wib', 'indonesia', '2021', 'industri', 'akan', 'untuk', 'ini', 'dan', 'chip', 'yang', 'di']


THE TOP 15 WORDS FOR TOPIC #5
['www', 'جديدة', 'insert_random_number_here', 'advm', 'org', 'php', 

In [10]:
with open('csvs/gdelt_topics.json','w+', encoding='utf-8') as f:
    f.writelines(json.dumps(topic_lst))

In [11]:
topic_results = LDA.transform(dtm)

In [12]:
gdelt_news['Topic'] = topic_results.argmax(axis=1)

In [13]:
gdelt_news.to_csv('csvs/gdelt_news_with_topic.csv', index=False)

In [14]:
all_news = list(gdelt_news['News'])

In [15]:
import spacy
from spacy import displacy
from collections import Counter

In [16]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [19]:
doc = nlp(all_news[10])
[(X.text, X.label_) for X in doc.ents]

[('webových stránkách', 'PERSON'),
 ('které mohou', 'ORG'),
 ('údaje', 'GPE'),
 ('např', 'ORG'),
 ('jak procházíte', 'PERSON'),
 ('partneři', 'ORG'),
 ('Vašem', 'PERSON'),
 ('Tyto', 'GPE'),
 ('Můžeme Vám', 'ORG'),
 ('Váš', 'ORG'),
 ('Svůj souhlas můžete', 'PERSON'),
 ('či jej odvoláte', 'PERSON'),
 ('Vám', 'GPE'),
 ('oprávněného zájmu', 'PERSON'),
 ('Vznést', 'ORG'),
 ('oprávněného zájmu', 'PERSON'),
 ('další nastavení soukromí můžete v sekci', 'ORG'),
 ('Podrobné nastavení', 'WORK_OF_ART'),
 ('či v odkazech tam uvedených', 'WORK_OF_ART'),
 ('Vámi', 'PERSON'),
 ('partneři zpracováváme údaje', 'ORG'),
 ('následujícím způsobem', 'PERSON'),
 ('Personalizovaná', 'PERSON'),
 ('Ukládání', 'GPE'),
 ('Zpracování', 'PERSON'),
 ('Zpracování', 'PERSON'),
 ('Daimleru Nedostatek', 'PERSON'),
 ('pocítí', 'GPE'),
 ('ještě v roce', 'PERSON'),
 ('2023', 'DATE'),
 ('Daimleru', 'GPE')]

In [20]:
organization = []
for news in all_news:
    doc = nlp(news)
    organization.append([i.text for i in doc.ents if (i.label_ == 'ORG' and i.text != "LLC")])

In [21]:
flat_org = [item for sublist in organization for item in sublist]

In [22]:
flat_org_unq = list(set(flat_org))

In [23]:
keywords = [
    "house",
    "university",
    "report",
    "information",
    "communication",
    "time",
    "visa",
    "master",
    "social",
    "sale",
    "invest",
    "street",
    "network",
    "terminal",
    "linked",
    "facebook",
    "yahoo",
    'youtube',
    "daily",
    "week",
    "business",
    "trust",
    "foundation",
    ".com",
    "stud",
    "history",
    "school",
    "stock",
    "world",
    "cnn",
    "science",
    "england &",
    "fund",
    "wordpress",
    "trump",
    "electric",
    "twitter",
    "musk",
    "elon",
    "berkley",
    "usb",
    "l.",
    "modi",
    "postal"
    "app",
    "semiconductor",
    "Li",
    "ST",
    "on",
    "Lattice",
    "micro",
    "state",
    "ip",
    "ev",
    "ml",
    "next",
    "live",
    "led",
    "silicon",
    "covid",
    "news",
    "journal",
    "automobile",
    "car",
    "council"
]

In [24]:
filtered_flat_org_unq = []

for org in flat_org_unq:
    flag = True
    for k in keywords:
        if k.strip().lower() in org.strip().lower():
            flag = False
            break
    if flag:
        filtered_flat_org_unq.append(org)

In [25]:
filtered_flat_org_unq

['Toyoty',
 'mengambil langkah',
 'jaguar',
 'mobil',
 'Serang Diabetes hilang selamanya & pankreas',
 'что',
 'Иван Чернов',
 'Benchmark Company',
 'AS',
 'Copyright ©',
 'използвани в центрове за данни',
 'kata Karyanto',
 'MIT',
 'The Supply Chain is Broken Again for Chicken',
 'США',
 'Ford',
 'Murdaningsih Pasokan',
 'kedua perusahaan',
 'Sampai Banyak Orang Rela',
 'أكبر مصنع تابع لها',
 'India Motor',
 'Toyota',
 'kolejne iteracje',
 'търсенето',
 'muka oleh beberapa perusahaan',
 'приемането',
 'Yardeni Research',
 'JLR',
 'بين',
 'Android Authority',
 'AlterNet',
 'dengan',
 'său',
 'meningkatkan',
 'Artificial Intelegence (AI',
 'the Detroit Free Press',
 'M.2',
 'berdampak pada',
 'CCTV',
 'Baidu',
 'w Ameryce Północnej',
 'Pasokan',
 'GIIAS Batal',
 'Pembangunan Jalan',
 'WIB',
 'Batik Kediri Berpromosi',
 'Armata',
 'Kuartal III',
 'harga pasar',
 'Polymetal',
 'كما ستمدد شركة أودي',
 'The Energy & Oil Sector Trade',
 'КХЛ 1 октября',
 'لإقامة مشروعات',
 '2221 Коментари',


In [26]:
temp_all_news = " ".join(all_news)

In [27]:
freq_dict = dict()
for org in filtered_flat_org_unq:
    if len(org) > 2:
        count = temp_all_news.lower().count(org.lower())
        freq_dict[org] = count

In [28]:
sorted_freq_dict = {k: v for k, v in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)}

In [47]:
sorted_freq_dict

{'Pada': 150,
 'WIB': 115,
 'dengan': 108,
 'Pasokan': 101,
 'Ford': 58,
 'nic': 58,
 'Samsung': 54,
 'Baca': 50,
 'Toyota': 48,
 'MIT': 47,
 'ANTARA': 42,
 'إلى': 37,
 'EED': 36,
 'Asan': 35,
 'Salah': 33,
 'عالم': 30,
 'Pasar': 29,
 'ZEN': 28,
 'Tesla': 27,
 'Hyundai': 26,
 'недостиг': 26,
 'Intel': 25,
 'CNBC': 25,
 'Penjualan': 25,
 'BMW': 23,
 'العالم': 23,
 'Tapi': 23,
 'Apple': 22,
 'عام': 20,
 'salah satu': 19,
 "Moody's": 19,
 'PS5': 18,
 'что': 17,
 'Google': 17,
 'Jeep': 17,
 'Volkswagen': 17,
 'Mercedes-Benz': 16,
 'بين': 14,
 'Toshiba': 14,
 'Sejak': 14,
 'Seluruh': 14,
 'seluruh': 14,
 'IBD': 13,
 'Reuters': 13,
 'الياباني': 13,
 'Olahraga': 13,
 'AMD': 12,
 'للسيارات': 12,
 'Copyright ©': 11,
 'العام': 11,
 'COPYRIGHT ©': 11,
 'CELE': 11,
 'General Motors': 11,
 'Wi-Fi': 11,
 'Mei': 11,
 'Mercedes Benz': 11,
 'masalah': 11,
 'meningkatkan': 10,
 'tetapi': 10,
 'TSMC': 10,
 'OTT': 10,
 'Tetapi': 10,
 'Qualcomm': 10,
 'Xiaomi': 10,
 'търсенето': 9,
 'WHATS NEW 02/10/2021':

In [52]:
with open('csvs/updated_freq_company_names_gdlet.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(sorted_freq_dict))

In [53]:
import itertools

thirty_sorted_freq_dict = dict(itertools.islice(sorted_freq_dict.items(), 30))

with open('csvs/updated_thirty_freq_company_names_gdlet.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(thirty_sorted_freq_dict))

In [54]:
thirty_sorted_freq_dict

{'Pada': 150,
 'WIB': 115,
 'dengan': 108,
 'Pasokan': 101,
 'Ford': 58,
 'nic': 58,
 'Samsung': 54,
 'Baca': 50,
 'Toyota': 48,
 'MIT': 47,
 'ANTARA': 42,
 'إلى': 37,
 'EED': 36,
 'Asan': 35,
 'Salah': 33,
 'عالم': 30,
 'Pasar': 29,
 'ZEN': 28,
 'Tesla': 27,
 'Hyundai': 26,
 'недостиг': 26,
 'Intel': 25,
 'CNBC': 25,
 'Penjualan': 25,
 'BMW': 23,
 'العالم': 23,
 'Tapi': 23,
 'Apple': 22,
 'عام': 20,
 'salah satu': 19}