In [1]:
import pandas as pd
import json
gdelt_news = pd.read_csv('csvs/gdelt_news_updated.csv', sep='\t')

In [2]:
gdelt_news[gdelt_news.isnull().any(axis=1)]

Unnamed: 0,News
59,


In [3]:
gdelt_news.dropna(inplace=True)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
tfidf = TfidfVectorizer(stop_words='english')

In [6]:
dtm = tfidf.fit_transform(gdelt_news['News'])

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
LDA = LatentDirichletAllocation(n_components=10,random_state=101)

In [9]:
LDA.fit(dtm)

LatentDirichletAllocation(random_state=101)

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {
  'n_components': [5, 10, 15, 20, 25],
  'learning_decay': [.5, .7]
}

model = LatentDirichletAllocation()

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(dtm)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -2896.018811810439


In [10]:
topic_lst = []
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')
    topic_lst.append([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPIC #0
['se', 'peluang', 'ujian', 'menyebutkan', 'video', 'na', 'network', 'investasi', 'suzuki', 'cnn', 'seks', 'october', 'tekanan', 'agus', 'menperin']


THE TOP 15 WORDS FOR TOPIC #1
['fsr', '게시글', 'года', 'дефицита', 'liga', '업데이트', 'чипов', 'что', 'за', 'из', '뉴스', '이벤트', '케이벤치', 'amd', 'на']


THE TOP 15 WORDS FOR TOPIC #2
['footsteps', 'rival', 'bring', 'نيكاي', 'شراء', 'day', 'list', 'insights', 'following', 'upside', 'house', 'daily', 'google', 'advisor', 'stock']


THE TOP 15 WORDS FOR TOPIC #3
['780g', 'syarikat', 'taiwan', 'bekalan', 'networking', 'malaysia', 'فى', 'peranti', 'kilang', 'wi', 'fi', 'snapdragon', 'cip', 'stocks', 'korsel']


THE TOP 15 WORDS FOR TOPIC #4
['إلكترونية', 'ونقلت', 'الأزمة', 'رقائق', 'نتيجة', 'عام', 'العالمي', 'التأمين', 'الدراسة', 'الدفع', 'و6', 'يتراوح', 'بما', 'ارتفاع', 'تقرير']


THE TOP 15 WORDS FOR TOPIC #5
['industry', 'chip', 'companies', 'uy', 'manufacturing', 'demand', 'en', 'said', 'production', 'la', 'new', 'sem

In [11]:
with open('csvs/gdelt_topics.json','w+', encoding='utf-8') as f:
    f.writelines(json.dumps(topic_lst))

In [12]:
topic_results = LDA.transform(dtm)

In [13]:
gdelt_news['Topic'] = topic_results.argmax(axis=1)

In [14]:
gdelt_news.to_csv('csvs/gdelt_news_with_topic.csv', sep='\t', index=False)

In [15]:
gdelt_news

Unnamed: 0,News,Topic
0,Menperin Targetkan RI Produksi Chip untuk Mobi...,6
1,Kekurangan Pasokan Chip PS5 Bakal Diprediksi h...,6
2,Na těchto webových stránkách se používají soub...,9
3,REPUBLIKA.ID REPUBLIKA TV GERAI IHRAM REPJABAR...,6
4,ANTARANEWS.COM TERKINI TOP NEWS TERPOPULER Thu...,6
...,...,...
68,"Krisis Chip, Produksi Suzuki India Anjlok Sept...",6
69,ANTARANEWS.COM TERKINI TOP NEWS TERPOPULER Thu...,6
70,"miércoles, octubre 6, 2021 Canal de televisión...",5
71,AKTUÁLNĚ: Koronavirus Situace v Afghánistánu D...,0


In [None]:
# ================== Company Info GDELT =====================

In [16]:
all_news = list(gdelt_news['News'])

In [17]:
import spacy
from spacy import displacy
from collections import Counter

In [18]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [19]:
doc = nlp(all_news[10])
[(X.text, X.label_) for X in doc.ents]

[('Thursday', 'DATE'),
 ('1', 'CARDINAL'),
 ('October 2021', 'DATE'),
 ('Oto Tek', 'PERSON'),
 ('2023', 'DATE'),
 ('12', 'CARDINAL'),
 ('Penurunan', 'GPE'),
 ('ilustrasi mobil listrik menggunakan chip', 'ORG'),
 ('Penurunan', 'GPE'),
 ('JAKARTA', 'ORG'),
 ('Kelangkaan dan kurangnya', 'PERSON'),
 ('produk mobil', 'ORG'),
 ('secara global', 'PERSON'),
 ('2023', 'DATE'),
 ('dari berbagai pihak', 'PERSON'),
 ('mobil', 'ORG'),
 ('Jerman Mercedez-Benz yang', 'PERSON'),
 ('kuartal ketiga', 'ORG'),
 ('2021 mengalami', 'QUANTITY'),
 ('dan penjualan secara signifikan karena', 'PERSON'),
 ('menyebutkan kondisi', 'ORG'),
 ('akan', 'NORP'),
 ('terus', 'PRODUCT'),
 ('2022', 'CARDINAL'),
 ('dan kemudian', 'PERSON'),
 ('Artinya kelangkaan bisa', 'PERSON'),
 ('2023', 'DATE'),
 ('Mercedes-Benz Kallenius seperti', 'ORG'),
 ('BBC', 'ORG'),
 ('12/9', 'CARDINAL'),
 ('5', 'CARDINAL'),
 ('Makanan Kaya', 'PERSON'),
 ('Turunkan Berat', 'PERSON'),
 ('Isolasi Covid Saat PON XX', 'PERSON'),
 ('Papua Masjid Sunshin

In [20]:
organization = []
for news in all_news:
    doc = nlp(news)
    organization.append([i.text for i in doc.ents if (i.label_ == 'ORG' and i.text != "LLC")])

In [21]:
flat_org = [item for sublist in organization for item in sublist]

In [22]:
flat_org_unq = list(set(flat_org))

In [32]:
keywords = [
    "house",
    "university",
    "report",
    "information",
    "communication",
    "time",
    "visa",
    "master",
    "social",
    "sale",
    "invest",
    "street",
    "network",
    "terminal",
    "linked",
    "facebook",
    "yahoo",
    'youtube',
    "daily",
    "week",
    "business",
    "trust",
    "foundation",
    ".com",
    "stud",
    "history",
    "school",
    "stock",
    "world",
    "cnn",
    "science",
    "england &",
    "fund",
    "wordpress",
    "trump",
    "electric",
    "twitter",
    "musk",
    "elon",
    "berkley",
    "usb",
    "l.",
    "modi",
    "postal"
    "app",
    "semiconductor",
    "Li",
    "ST",
    "on",
    "Lattice",
    "micro",
    "state",
    "ip",
    "ev",
    "ml",
    "next",
    "live",
    "led",
    "silicon",
    "covid",
    "news",
    "journal",
    "council",
    "SaaS",
    'Global',
    'car',
    'app',
    'Government',
    'VAT',
    'Air',
    'OPEN',
    'Group',
    'WSJ',
    'Shop',
    'cnbc',
    'bank',
    "air",
    "fact",
    "mobil"
]

In [33]:
filtered_flat_org_unq = []

for org in flat_org_unq:
    flag = True
    for k in keywords:
        if k.strip().lower() in org.strip().lower():
            flag = False
            break
    if flag:
        filtered_flat_org_unq.append(org)

In [34]:
filtered_flat_org_unq

['skirtų plataus vartojimo prekių',
 "Moody's Analytics",
 'PWL',
 'RPP Złoty',
 'eBay',
 'Niska',
 'iOS',
 'dan webcam',
 'INTC',
 'ČTK',
 'ŞTIRI PE',
 'serta',
 'Hollywood Video',
 'General Motors Co',
 'CKD',
 'Gunakan Layanan DriveThru Supaya Cepat',
 'KOSPI',
 'WIB Berikut',
 'De',
 'Die&quot',
 'Digital',
 'ستايل محطة',
 'ECU',
 'Video',
 'FIT 55',
 'САЩ General Motors',
 'mengatakan bahwa',
 'إلى يوم',
 'HMC',
 'E-Sport Football Sulut',
 'OPINIA',
 'Към',
 'HP',
 'Artificial Intelegence (AI',
 'mengurangi disrupsi pada',
 'segera memperoleh',
 'Toyota Motor Corp',
 'Ordinul',
 'OLEH NAZRIN',
 'aveau un impact asupra transmiterii',
 'tetapi sektor',
 'из всех американских',
 'Saksikan Video di Bawah Ini',
 'Impresi City Hatchback',
 'nieco w czasie zagładę ludzkości',
 'Menurut NY Magazine',
 'Pfizer',
 'fintech de soluţii complete de plată şi',
 'Katarak Bisa Anda',
 'Criza',
 'حوالي',
 'BMW M4',
 'się',
 'Polaków',
 'Zymergen',
 'Ada',
 'dan perusahaan',
 'Disney',
 'FOTO',
 'm

In [35]:
temp_all_news = " ".join(all_news)

In [36]:
freq_dict = dict()
for org in filtered_flat_org_unq:
    if len(org) > 2:
        count = temp_all_news.lower().count(org.lower())
        freq_dict[org] = count

In [37]:
sorted_freq_dict = {k: v for k, v in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)}

In [38]:
sorted_freq_dict

{'Ada': 319,
 'NGK': 267,
 'Par': 183,
 'Pada': 160,
 'aut': 145,
 'dari': 125,
 'dengan': 123,
 'iti': 109,
 'Pasokan': 102,
 'NIK': 98,
 'WIB': 98,
 'CIA': 95,
 'nic': 92,
 'pak': 85,
 'السيارات': 73,
 'iki': 61,
 'Toyota': 60,
 'się': 54,
 'Smart': 54,
 'Baru': 53,
 'Baca': 51,
 'MIT': 50,
 'Mit': 50,
 'Video': 48,
 'Samsung': 47,
 'ANTARA': 46,
 'Hyundai': 44,
 'JAKARTA': 44,
 'Penjualan': 43,
 'ZEN': 40,
 'Asan': 39,
 'Mercedes': 38,
 'Intel': 38,
 'Ford': 37,
 'إلى': 36,
 'ECU': 35,
 'Tesla': 35,
 'عالم': 33,
 'Salah': 30,
 'Persen': 30,
 'العالم': 29,
 'iOS': 27,
 'Kia': 26,
 'Nissan': 26,
 'Tapi': 25,
 'BMW': 25,
 'عام': 25,
 'Kamis': 24,
 'Mercedes-Benz': 23,
 "Moody's": 18,
 'Ceny': 18,
 'Sementara': 18,
 'CELE': 18,
 'Volkswagen': 18,
 'General Motors': 17,
 'بين': 17,
 'Islam': 17,
 'Usia': 17,
 'salah satu': 17,
 'Sejak': 16,
 'العام': 16,
 'PS5': 16,
 'Digital': 15,
 'Reuters': 15,
 'Google': 15,
 'BSI': 15,
 'AHM': 15,
 'Jeep': 15,
 'Toshiba': 15,
 'FOTO': 14,
 'AMD': 14

In [41]:
with open('csvs/updated_freq_company_names_gdlet.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(sorted_freq_dict))

In [42]:
import itertools

thirty_sorted_freq_dict = dict(itertools.islice(sorted_freq_dict.items(), 30))

with open('csvs/updated_thirty_freq_company_names_gdlet.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(thirty_sorted_freq_dict))

In [43]:
thirty_sorted_freq_dict

{'Ada': 319,
 'NGK': 267,
 'Pada': 160,
 'aut': 145,
 'dari': 125,
 'dengan': 123,
 'iti': 109,
 'Pasokan': 102,
 'NIK': 98,
 'WIB': 98,
 'CIA': 95,
 'nic': 92,
 'pak': 85,
 'السيارات': 73,
 'iki': 61,
 'Toyota': 60,
 'się': 54,
 'Smart': 54,
 'Baru': 53,
 'Baca': 51,
 'MIT': 50,
 'Video': 48,
 'Samsung': 47,
 'ANTARA': 46,
 'Hyundai': 44,
 'JAKARTA': 44,
 'Penjualan': 43,
 'ZEN': 40,
 'Asan': 39,
 'Mercedes': 38}