In [1]:
import pandas as pd

In [2]:
google_news = pd.read_csv('csvs/google_news_updated.csv', sep='\t')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
tfidf = TfidfVectorizer(stop_words='english')

In [5]:
google_news[google_news.isnull().any(axis=1)]

Unnamed: 0,News
4,
61,


In [6]:
google_news.dropna(inplace=True)

In [7]:
dtm = tfidf.fit_transform(google_news['News'])

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
LDA = LatentDirichletAllocation(n_components=10,random_state=101)

In [10]:
LDA.fit(dtm)

LatentDirichletAllocation(random_state=101)

In [11]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {
  'n_components': [5, 10, 15, 20, 25],
  'learning_decay': [.5, .7]
}

model = LatentDirichletAllocation()

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(dtm)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -3454.3441540666377


In [11]:
topic_lst = []
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')
    
    topic_lst.append([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPIC #0
['effect', 'electropages', 'ids', 'robin', 'heaven', 'prime', 'minerals', 'unique', 'ft', 'rare', 'amazon', 'point47', 'chakrabarti', 'sanctions', 'infineon']


THE TOP 15 WORDS FOR TOPIC #1
['bid', 'discount', 'regulators', 'rtx', 'stockpiling', 'lake', 'alder', 'vw', 'hours', 'graphics', 'gaming', 'audi', 'segars', 'arm', 'duesmann']


THE TOP 15 WORDS FOR TOPIC #2
['joint', 'javascript', 'detected', 'unusual', 'inquiries', 'activity', 'sure', 'availability', 'gaming', 'marvell', 'review', 'healthcare', 'contact', 'murphy', 'ministry']


THE TOP 15 WORDS FOR TOPIC #3
['car', 'supply', 'tesla', 'semiconductor', 'global', 'news', 'production', '2021', 'new', 'chips', 'gm', 'year', 'said', 'shortage', 'chip']


THE TOP 15 WORDS FOR TOPIC #4
['wang', 'mubadala', 'noah', 'bel', 'trevor', 'msrp', 'picks', 'hoarding', 'established', 'quarterly', 'smith', 'bolling', 'mansion', 'reuss', 'globalfoundries']


THE TOP 15 WORDS FOR TOPIC #5
['126', 'gallons', 'settle

In [12]:
import json

In [13]:
with open('csvs/google_topics.json','w+', encoding='utf-8') as f:
    f.writelines(json.dumps(topic_lst))

In [14]:
topic_results = LDA.transform(dtm)

In [15]:
topic_results.shape

(83, 10)

In [16]:
google_news['Topic'] = topic_results.argmax(axis=1)

In [17]:
google_news

Unnamed: 0,News,Topic
0,"Last Updated 7 days ago U.S., EU agree to work...",7
1,Live Now: President Biden Hosts a Meeting with...,7
2,Skip to main content The Struggle to Define Lo...,6
3,"Search quotes, news & videos AMD CEO Lisa Su s...",7
5,"Search quotes, news & videos Abu Dhabi-control...",3
...,...,...
80,WORKING FROM HOME CLOUD INNOVATION CXO MORE Hu...,4
81,"If you're a crypto novice, you might be wonder...",3
82,Last Updated 40 minutes ago Auto output dives ...,7
83,"Thursday, Oct 7th 2021 12AM 26°C 'It's easier ...",7


In [19]:
google_news.to_csv('csvs/google_news_with_topic.csv',sep='\t', index=False)

In [23]:
# ===================== COMPANY INFO Google News============================

In [18]:
all_news = list(google_news['News'])

In [4]:
import spacy
from spacy import displacy
from collections import Counter

In [5]:
import en_core_web_sm

In [6]:
nlp = en_core_web_sm.load()

In [22]:
doc = nlp(all_news[10])
[(X.text, X.label_) for X in doc.ents]

[('Elon Musk', 'PERSON'),
 ('FRI', 'ORG'),
 ('20217:41 AM', 'TIME'),
 ('25 20212:50 AM EDT Share Article', 'TIME'),
 ('Facebook Share Article', 'WORK_OF_ART'),
 ('Twitter Share Article', 'PRODUCT'),
 ('LinkedIn Share Article', 'ORG'),
 ('next year', 'DATE'),
 ('Italian', 'NORP'),
 ('Friday', 'DATE'),
 ('Intel', 'ORG'),
 ('TSMC', 'ORG'),
 ('U.S.', 'GPE'),
 ('several years', 'DATE'),
 ('Tesla Motors', 'ORG'),
 ('Elon Musk', 'PERSON'),
 ('Model S.', 'PRODUCT'),
 ('Elon Musk', 'ORG'),
 ('Friday', 'DATE'),
 ('next year', 'DATE'),
 ('next year', 'DATE'),
 ('Italian', 'NORP'),
 ('Friday', 'DATE'),
 ('Intel', 'ORG'),
 ('TSMC', 'ORG'),
 ('U.S.', 'GPE'),
 ('several years', 'DATE'),
 ('Glenn O’Donnell', 'PERSON'),
 ('Forrester', 'PERSON'),
 ('2023', 'DATE'),
 ('2022', 'DATE'),
 ('2023', 'DATE'),
 ('April', 'DATE'),
 ('Ford', 'ORG'),
 ('Volkswagen', 'ORG'),
 ('Daimler', 'ORG'),
 ('first-quarter', 'DATE'),
 ('Musk', 'ORG'),
 ('Tesla', 'ORDINAL'),
 ('This quarter', 'DATE'),
 ('Tesla', 'ORDINAL'),
 (

In [23]:
organization = []
for news in all_news:
    doc = nlp(news)
    organization.append([i.text for i in doc.ents if (i.label_ == 'ORG' and i.text != "LLC")])

In [24]:
flat_org = [item for sublist in organization for item in sublist]

In [25]:
flat_org_unq = list(set(flat_org))

In [33]:
keywords = [
    "house",
    "university",
    "report",
    "information",
    "communication",
    "time",
    "visa",
    "master",
    "social",
    "sale",
    "invest",
    "street",
    "network",
    "terminal",
    "linked",
    "facebook",
    "yahoo",
    'youtube',
    "daily",
    "week",
    "business",
    "trust",
    "foundation",
    ".com",
    "stud",
    "history",
    "school",
    "stock",
    "world",
    "cnn",
    "science",
    "england &",
    "fund",
    "wordpress",
    "trump",
    "electric",
    "twitter",
    "musk",
    "elon",
    "berkley",
    "usb",
    "l.",
    "modi",
    "postal"
    "app",
    "semiconductor",
    "Li",
    "ST",
    "on",
    "Lattice",
    "micro",
    "state",
    "ip",
    "ev",
    "ml",
    "next",
    "live",
    "led",
    "silicon",
    "covid",
    "news",
    "journal",
    "council",
    'Global',
    'car',
    'app',
    'Government',
    'VAT',
    'Air',
    'OPEN',
    'Group',
    'WSJ',
    'Shop',
    'cnbc',
    'bank',
    "air",
    "fact",
    "mobil"
]

In [34]:
filtered_flat_org_unq = []

for org in flat_org_unq:
    flag = True
    for k in keywords:
        if k.strip().lower() in org.strip().lower():
            flag = False
            break
    if flag:
        filtered_flat_org_unq.append(org)

In [35]:
temp_all_news = " ".join(all_news)

In [36]:
freq_dict = dict()
for org in filtered_flat_org_unq:
    if len(org) > 2:
        count = temp_all_news.lower().count(org.lower())
        freq_dict[org] = count

In [37]:
freq_dict

{'Rewards': 4,
 'Barcroft Media/Getty': 1,
 "Pandora Papers '": 2,
 'EPM': 1,
 'Nissan Motor': 3,
 'Jeep': 2,
 'Marvell': 11,
 'USPS View': 1,
 'FAA': 1,
 'Churchill': 5,
 'Tesla General Motors Co': 5,
 'Mellanox Technologies': 1,
 'SUMCO': 1,
 'Commerce Department': 7,
 "The Democratic Party's": 1,
 'JetBlue': 2,
 'Lunchclub': 1,
 'Part VII Surveillance': 1,
 'Bain &': 2,
 'Fiat Chrysler': 3,
 'FOUP': 1,
 'Nissan Motor Co.,': 2,
 'IHS Markit': 8,
 'Sainsbury': 1,
 'Guardian': 2,
 'Dodge': 2,
 'Wang': 10,
 'Ultra Cruise VW': 2,
 'CSCRP': 12,
 'SPD': 1,
 'TSMC data': 1,
 'Mojix': 2,
 'Saks Fifth Avenue': 4,
 'Pfizer': 4,
 'Oak Furnitureland': 1,
 'The Commerce Department': 4,
 'US Treasury': 3,
 'Ford Motor Co.’s': 1,
 'Disney': 4,
 'Hanbury': 4,
 'Ajinomoto Co.': 1,
 'CMA': 1,
 'Fiat': 4,
 'REE': 270,
 'Market Data': 15,
 'Terran Orbital': 1,
 'HNA': 8,
 'Volkswagen': 10,
 'GamesRadar': 3,
 'A16': 1,
 'Verdict': 4,
 'Intel INTC': 1,
 'the US Department of Interior': 1,
 'Null Pointer':

In [38]:
sorted_freq_dict = {k: v for k, v in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)}

In [39]:
sorted_freq_dict

{'REE': 270,
 'TSMC': 147,
 'MIT': 120,
 'NSO': 111,
 'UBS': 111,
 'Intel': 86,
 'Reuters': 80,
 'RAM': 75,
 'NIO': 73,
 'General Motors': 47,
 'AMD': 43,
 'FRI': 35,
 'Ford': 31,
 'Nvidia': 30,
 'Commerce': 29,
 'Fed': 28,
 'Toyota': 27,
 'Digital': 27,
 'PRA': 25,
 'Nissan': 23,
 'Barra': 23,
 'Target': 22,
 'MSU': 22,
 'BIS': 19,
 'Samsung': 19,
 'GETTY': 18,
 'RHA': 18,
 'Audi': 18,
 'Segars': 17,
 'Native': 16,
 'Park': 16,
 'Market Data': 15,
 'Getty Images': 15,
 'The Prime Effect': 15,
 'OPEC': 14,
 'ABF': 14,
 'AWS': 14,
 'Murphy': 13,
 'POPULAR': 13,
 'Twitch': 13,
 'CSCRP': 12,
 'Duesmann': 12,
 'IHS': 12,
 'SOURCING': 12,
 'Marvell': 11,
 'Goldman Sachs': 11,
 'S&P': 11,
 'Brexit': 11,
 'BMW': 11,
 'MarketWatch': 11,
 'Wang': 10,
 'Volkswagen': 10,
 'RFI': 10,
 'Gavekal': 10,
 'Marketplace': 10,
 'Qualcomm': 10,
 'Wolfspeed': 10,
 'Mehrotra': 9,
 'IMF': 9,
 'Macy': 9,
 'Asda': 9,
 'IDC': 9,
 'IHS Markit': 8,
 'HNA': 8,
 'Brent': 8,
 'Universal Credit': 8,
 'Nasdaq': 8,
 'Ru

In [40]:
import json

In [41]:
with open('csvs/updated_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(sorted_freq_dict))

In [42]:
import itertools

thirty_sorted_freq_dict = dict(itertools.islice(sorted_freq_dict.items(), 30))

In [43]:
with open('csvs/updated_thirty_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(thirty_sorted_freq_dict))

In [44]:
keys = list(sorted_freq_dict.keys())

In [45]:
lower_sorted_freq_dict = {k.lower():sorted_freq_dict[k] for k in keys}

In [211]:
with open('csvs/updated_sorted_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(lower_sorted_freq_dict))

In [212]:
import itertools

lower_thirty_sorted_freq_dict = dict(itertools.islice(lower_sorted_freq_dict.items(), 30))

In [213]:
with open('csvs/lower_updated_thirty_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(lower_thirty_sorted_freq_dict))