In [1]:
import pandas as pd
import json
gdelt_news = pd.read_csv('csvs/gdelt_news_updated.csv', sep='\t')

In [2]:
gdelt_news.fillna('', inplace=True)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
tfidf = TfidfVectorizer(stop_words='english')

In [5]:
dtm = tfidf.fit_transform(gdelt_news['News'])

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
LDA = LatentDirichletAllocation(n_components=10,random_state=101)

In [8]:
LDA.fit(dtm)

LatentDirichletAllocation(random_state=101)

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {
  'n_components': [5, 10, 15, 20, 25],
  'learning_decay': [.5, .7]
}

model = LatentDirichletAllocation()

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(dtm)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -2896.018811810439


In [9]:
topic_lst = []
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')
    topic_lst.append([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPIC #0
['disadvantage', '1996', 'unsubscribe', 'stabilize', 'emails', 'cells', 'markit', 'vw', 'sanctions', 'audi', 'monthly', 'ihs', 'duesmann', 'rivian', 'reuss']


THE TOP 15 WORDS FOR TOPIC #1
['mile', 'selecting', '219', 'lowering', 'simple', 'sacrificing', 'cheapest', 'exports', 'petrol', 'dependence', 'ways', 'vulnerable', 'freight', 'recession', 'hurting']


THE TOP 15 WORDS FOR TOPIC #2
['detected', 'javascript', 'unusual', 'message', 'hoarding', 'window', 'ya', 'nan', 'circular', 'report', 'refurbished', 'abf', 'gmt', 'carbide', 'substrate']


THE TOP 15 WORDS FOR TOPIC #3
['news', 'gm', 'production', 'chain', '2021', 'new', 'tsmc', 'industry', 'chips', 'semiconductor', 'year', 'shortage', 'said', 'supply', 'chip']


THE TOP 15 WORDS FOR TOPIC #4
['uids', 'heating', 'biontech', 'easy', 'robin', 'cent', 'electropages', 'ids', 'unique', 'ticker', 'original', 'ft', 'barra', 'infineon', 'india']


THE TOP 15 WORDS FOR TOPIC #5
['photoresists', 'quad', '31st

In [10]:
with open('csvs/gdelt_topics.json','w+', encoding='utf-8') as f:
    f.writelines(json.dumps(topic_lst))

In [11]:
topic_results = LDA.transform(dtm)

In [12]:
gdelt_news['Topic'] = topic_results.argmax(axis=1)

In [13]:
gdelt_news.to_csv('csvs/gdelt_news_with_topic.csv', index=False)

In [14]:
all_news = list(gdelt_news['News'])

In [15]:
import spacy
from spacy import displacy
from collections import Counter

In [16]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [17]:
doc = nlp(all_news[10])
[(X.text, X.label_) for X in doc.ents]

[('Kay Georgi', 'PERSON'),
 ('John Gurtunca', 'PERSON'),
 ('David Hanke', 'PERSON'),
 ('Marwa Hassoun', 'PERSON'),
 ('late September', 'DATE'),
 ('the Biden Administration', 'ORG'),
 ('White House', 'ORG'),
 ('Congress', 'ORG'),
 ('Commerce Department', 'ORG'),
 ('White House', 'ORG'),
 ('Congress', 'ORG'),
 ('America Act', 'ORG'),
 ('Commerce Continues Engagement', 'ORG'),
 ('September 23, 2021', 'DATE'),
 ('Commerce', 'ORG'),
 ('Gina Raimondo', 'PERSON'),
 ('National Economic Council', 'ORG'),
 ('Brian Deese', 'PERSON'),
 ('White House', 'FAC'),
 ('third', 'ORDINAL'),
 ('the Biden Administration', 'ORG'),
 ('the U.S. Department of Commerce', 'ORG'),
 ('September 24, 2021', 'DATE'),
 ('the Commerce Department', 'ORG'),
 ('Bureau of Industry and Security', 'ORG'),
 ('BIS', 'ORG'),
 ('U.S.', 'GPE'),
 ('non-U.S.', 'GPE'),
 ('BIS', 'ORG'),
 ('RFI', 'ORG'),
 ('RFI', 'ORG'),
 ('two', 'CARDINAL'),
 ('first', 'ORDINAL'),
 ('years 2019 and', 'DATE'),
 ('2020', 'DATE'),
 ('2021', 'DATE'),
 ('th

In [18]:
organization = []
for news in all_news:
    doc = nlp(news)
    organization.append([i.text for i in doc.ents if (i.label_ == 'ORG' and i.text != "LLC")])

In [19]:
flat_org = [item for sublist in organization for item in sublist]

In [20]:
flat_org_unq = list(set(flat_org))

In [24]:
keywords = [
    "house",
    "university",
    "report",
    "information",
    "communication",
    "time",
    "visa",
    "master",
    "social",
    "sale",
    "invest",
    "street",
    "network",
    "terminal",
    "linked",
    "facebook",
    "yahoo",
    'youtube',
    "daily",
    "week",
    "business",
    "trust",
    "foundation",
    ".com",
    "stud",
    "history",
    "school",
    "stock",
    "world",
    "cnn",
    "science",
    "england &",
    "fund",
    "wordpress",
    "trump",
    "electric",
    "twitter",
    "musk",
    "elon",
    "berkley",
    "usb",
    "l.",
    "modi",
    "postal"
    "app",
    "semiconductor",
    "Li",
    "ST",
    "on",
    "Lattice",
    "micro",
    "state",
    "ip",
    "ev",
    "ml",
    "next",
    "live",
    "led",
    "silicon",
    "covid",
    "news",
    "journal",
    "council",
    "SaaS",
    'Global',
    'car',
    'app',
    'Government',
    'VAT',
    'Air',
    'OPEN',
    'Group',
    'WSJ',
    'Shop',
    'cnbc',
    'Bank'
]

In [25]:
filtered_flat_org_unq = []

for org in flat_org_unq:
    flag = True
    for k in keywords:
        if k.strip().lower() in org.strip().lower():
            flag = False
            break
    if flag:
        filtered_flat_org_unq.append(org)

In [47]:
filtered_flat_org_unq

['Mellanox Technologies',
 'US Treasury',
 'BoE',
 'General Motors Co.',
 'Native',
 'Mehrotra',
 'Marvell',
 'MacRumors',
 'ET Intel Corp.',
 'IHS MARKIT - OCTOBER 2021',
 'Toyota Motors',
 'Nikkei Asia',
 'the General Assembly',
 'Ultrasound Companies Seek Priority',
 'PSA Group',
 'Qualcomm',
 'Toshiba',
 'GKN',
 'PE',
 'Nintendo Switch',
 'RAM',
 'Apple Arcade',
 'Shop',
 'GT Advanced Technologies',
 'GOP',
 'AMD',
 'VW',
 'Nan Ya Printed Circuit Board Corp.',
 'Campbell Soup',
 'CMX Denim',
 'Xcode',
 'PFE',
 'Alexa',
 'The Home Office',
 'LMC Automotive',
 'ResMed',
 'EUV',
 'Ford',
 'DigitalEurope',
 'Wolfspeed',
 'the U.S. Government',
 'Harrell',
 'Punit PARANJPE',
 'MSU',
 'IHS MARKIT Automotive Research and Analysis Analysis',
 'S&P',
 'pan-EU',
 'Getty Images CHENG TING-FANG',
 'Null Pointer',
 'the Park Slope Food Co-op',
 'RIN 0694-XC084',
 'Insulate Britain',
 'USPS Press',
 'IHS',
 'Mazda',
 'Culture Committee',
 'General Motors',
 'Oki Engineering',
 'Hanbury of Bain',

In [27]:
temp_all_news = " ".join(all_news)

In [28]:
freq_dict = dict()
for org in filtered_flat_org_unq:
    if len(org) > 2:
        count = temp_all_news.lower().count(org.lower())
        freq_dict[org] = count

In [29]:
sorted_freq_dict = {k: v for k, v in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)}

In [50]:
sorted_freq_dict

{'REE': 270,
 'TSMC': 147,
 'MIT': 120,
 'UBS': 111,
 'NSO': 111,
 'Intel': 86,
 'Apple': 82,
 'Reuters': 80,
 'RAM': 75,
 'NIO': 73,
 'General Motors': 47,
 'AMD': 43,
 'FRI': 35,
 'Ford': 31,
 'Nvidia': 30,
 'Commerce': 29,
 'Fed': 28,
 'Toyota': 27,
 'Digital': 27,
 'PRA': 25,
 'GlobalFoundries': 24,
 'Nissan': 23,
 'Barra': 23,
 'MSU': 22,
 'Target': 22,
 'BIS': 19,
 'Samsung': 19,
 'Audi': 18,
 'GETTY': 18,
 'RHA': 18,
 'Segars': 17,
 'Native': 16,
 'Park': 16,
 'Getty Images': 15,
 'Market Data': 15,
 'The Prime Effect': 15,
 'AWS': 14,
 'ABF': 14,
 'OPEC': 14,
 'Murphy': 13,
 'Twitch': 13,
 'POPULAR': 13,
 'IHS': 12,
 'CSCRP': 12,
 'SOURCING': 12,
 'Duesmann': 12,
 'Marvell': 11,
 'S&P': 11,
 'Brexit': 11,
 'BMW': 11,
 'Goldman Sachs': 11,
 'MarketWatch': 11,
 'Qualcomm': 10,
 'Wolfspeed': 10,
 'Gavekal': 10,
 'Marketplace': 10,
 'Volkswagen': 10,
 'Wang': 10,
 'RFI': 10,
 'Mehrotra': 9,
 'the U.S. Government': 9,
 'IDC': 9,
 'IMF': 9,
 'Asda': 9,
 'Macy': 9,
 'Snap': 8,
 'Runwa

In [51]:
with open('csvs/updated_freq_company_names_gdlet.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(sorted_freq_dict))

In [52]:
import itertools

thirty_sorted_freq_dict = dict(itertools.islice(sorted_freq_dict.items(), 30))

with open('csvs/updated_thirty_freq_company_names_gdlet.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(thirty_sorted_freq_dict))

In [53]:
thirty_sorted_freq_dict

{'REE': 270,
 'TSMC': 147,
 'MIT': 120,
 'UBS': 111,
 'NSO': 111,
 'Intel': 86,
 'Apple': 82,
 'Reuters': 80,
 'RAM': 75,
 'NIO': 73,
 'General Motors': 47,
 'AMD': 43,
 'FRI': 35,
 'Ford': 31,
 'Nvidia': 30,
 'Commerce': 29,
 'Fed': 28,
 'Toyota': 27,
 'Digital': 27,
 'PRA': 25,
 'GlobalFoundries': 24,
 'Nissan': 23,
 'Barra': 23,
 'MSU': 22,
 'Target': 22,
 'BIS': 19,
 'Samsung': 19,
 'Audi': 18,
 'GETTY': 18,
 'RHA': 18}