In [1]:
import pandas as pd

In [2]:
google_news = pd.read_csv('csvs/google_news_updated.csv', sep='\t')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
tfidf = TfidfVectorizer(stop_words='english')

In [5]:
dtm = tfidf.fit_transform(google_news['News'])

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
LDA = LatentDirichletAllocation(n_components=10,random_state=101)

In [8]:
LDA.fit(dtm)

LatentDirichletAllocation(random_state=101)

In [70]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

search_params = {
  'n_components': [5, 10, 15, 20, 25],
  'learning_decay': [.5, .7]
}

model = LatentDirichletAllocation()

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(dtm)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -3039.524846940912


In [74]:
lda_model = gridsearch.best_estimator_

In [76]:
single_topic = lda_model.components_[0]
top_word_indices = single_topic.argsort()[-10:]
for index in top_word_indices:
    print(tfidf.get_feature_names()[index])

companies
2021
new
said
chips
supply
stocks
industry
chip
semiconductor




In [9]:
topic_lst = []
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')
    
    topic_lst.append([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPIC #0
['market', 'manufacturing', 'semiconductors', 'year', 'stock', 'companies', '2021', 'new', 'said', 'chips', 'supply', 'stocks', 'industry', 'chip', 'semiconductor']


THE TOP 15 WORDS FOR TOPIC #1
['indices', 'ads', 'opinion', 'span', 'hydrogen', 'usa', 'water', 'standard', 'splitting', 'carbon', 'purdue', 'dow', 'li', 'jones', 'milwaukee']


THE TOP 15 WORDS FOR TOPIC #2
['inhibits', 'sens', 'crapo', 'mosi2n4', 'siltron', 'ang', 'contacts', 'boi', 'chest', 'window', 'makris', 'dallas', 'ut', '2d', 'michigan']


THE TOP 15 WORDS FOR TOPIC #3
['lamps', 'victoria', 'cascade', 'commercialization', 'tillmann', 'deductions', 'code', 'lighting', 'casfet', 'steven', 'woo', 'santo', 'kubis', 'osram', 'purdue']


THE TOP 15 WORDS FOR TOPIC #4
['footprint', 'ideas', 'senesky', 'son', 'carbon', 'clean', 'climate', 'fab', 'lewis', 'performer', 'wong', 'softbank', 'liu', 'theme', 'stanford']


THE TOP 15 WORDS FOR TOPIC #5
['expandable', 'computers', '1941', 'bcg', 'go

In [10]:
import json

In [11]:
with open('csvs/google_topics.json','w+', encoding='utf-8') as f:
    f.writelines(json.dumps(topic_lst))

In [12]:
topic_results = LDA.transform(dtm)

In [13]:
topic_results.shape

(70, 10)

In [14]:
google_news['Topic'] = topic_results.argmax(axis=1)

In [15]:
google_news

Unnamed: 0,News,Titles,Topic
0,TOP BUSINESS NEWS (15 VIDEOS) GlobalFoundries ...,GlobalFoundries CEO on demand for semiconducto...,1
1,Tom's Hardware is supported by its audience. W...,China's Mandated Power Cuts Hit Local Semicond...,0
2,"Sign up as a reader Academic rigour, journalis...",Europe is running out of semiconductors – here...,0
3,Analyst Remains Bullish On Micron And These 2...,Analyst Remains Bullish On Micron And These 2 ...,0
4,New semiconductor device possibilities using b...,New semiconductor device possibilities using b...,0
...,...,...,...
65,Skip to main content The computer chip industr...,The computer chip industry has a dirty climate...,0
66,Ford CEO on F-150 Lightning: ‘we want to lea...,Ford CEO: Semiconductor shortage is improving ...,0
67,Guidance for the Brookings community and the p...,Podcast: The view from industry on U.S. semico...,9
68,"OCTOBER 2, 2021 | MORE EVIDENCE THAT VITAMIN D...",Newly Discovered Family of 2D Semiconductors E...,0


In [16]:
google_news.to_csv('csvs/google_news_with_topic.csv', index=False)

In [None]:
# ===================== COMPANY INFO ============================

In [6]:
all_news = list(google_news['News'])

In [27]:
import spacy
from spacy import displacy
from collections import Counter

In [32]:
import en_core_web_sm

In [34]:
nlp = en_core_web_sm.load()

In [43]:
doc = nlp(all_news[10])
[(X.text, X.label_) for X in doc.ents]

[('FDA', 'ORG'),
 ('Facebook', 'ORG'),
 ('CTO', 'PRODUCT'),
 ('White House', 'ORG'),
 ("Yahoo Finance's", 'ORG'),
 ('Julie Hyman', 'PERSON'),
 ('Thursday', 'DATE'),
 ('White House', 'ORG'),
 ('September 23', 'DATE'),
 ('2021·2', 'DATE'),
 ('WASHINGTON', 'GPE'),
 ('Sept 23', 'DATE'),
 ('Reuters', 'ORG'),
 ('The White House', 'ORG'),
 ('Thursday', 'DATE'),
 ('Commerce', 'ORG'),
 ('Gina Raimondo', 'PERSON'),
 ('White House National Economic Council', 'ORG'),
 ('Detroit', 'GPE'),
 ('General Motors', 'ORG'),
 ('Ford Motor', 'ORG'),
 ('Stellantis', 'ORG'),
 ('Apple', 'ORG'),
 ('Daimler AG', 'ORG'),
 ('GlobalFoundries', 'ORG'),
 ('Micron', 'ORG'),
 ('Microsoft', 'ORG'),
 ('Samsung', 'ORG'),
 ('TSMC', 'ORG'),
 ('Intel Corp', 'ORG'),
 ('Pat Gelsinger', 'PERSON'),
 ('The White House', 'ORG'),
 ('this week', 'DATE'),
 ('45 days', 'DATE'),
 ('Biden', 'GPE'),
 ('Wednesday', 'DATE'),
 ('first', 'ORDINAL'),
 ('Biden', 'PERSON'),
 ('U.S.', 'GPE'),
 ('Vietnam', 'GPE'),
 ('Malaysia', 'GPE'),
 ('Last mon

In [60]:
organization = []
for news in all_news:
    doc = nlp(news)
    organization.append([i.text for i in doc.ents if (i.label_ == 'ORG' and i.text != "LLC")])

In [61]:
flat_org = [item for sublist in organization for item in sublist]

In [62]:
flat_org_unq = list(set(flat_org))

In [119]:
keywords = [
    "house",
    "university",
    "report",
    "information",
    "communication",
    "time",
    "visa",
    "master",
    "social",
    "sale",
    "invest",
    "street",
    "network",
    "terminal",
    "linked",
    "facebook",
    "yahoo",
    'youtube',
    "daily",
    "week",
    "business",
    "trust",
    "foundation",
    ".com",
    "stud",
    "history",
    "school",
    "stock",
    "world",
    "cnn",
    "science",
    "england &",
    "fund",
    "wordpress",
    "trump",
    "electric",
    "twitter",
    "musk",
    "elon",
    "berkley",
    "usb",
    "l.",
    "modi",
    "postal"
    "app",
    "semiconductor",
    "Li",
    "ST",
    "on",
    "Lattice",
    "micro",
    "state",
    "ip",
    "ev",
    "ml",
    "next",
    "live",
    "led",
    "silicon",
    "covid",
    "news",
    "journal",
    "automobile",
    "car",
    "council"
]

In [120]:
filtered_flat_org_unq = []

for org in flat_org_unq:
    flag = True
    for k in keywords:
        if k.strip().lower() in org.strip().lower():
            flag = False
            break
    if flag:
        filtered_flat_org_unq.append(org)

In [163]:
temp_all_news = " ".join(all_news)

In [164]:
freq_dict = dict()
for org in filtered_flat_org_unq:
    if len(org) > 2:
        count = temp_all_news.lower().count(org.lower())
        freq_dict[org] = count

In [165]:
freq_dict

{'FPGA': 79,
 'KS ASEPTIC PROCESS ENGINEER - PFIZER - ROCKY MOUNT': 1,
 'North Pole Express': 1,
 'Qualcomm Europe': 1,
 'Synopsys': 5,
 'Ford': 66,
 'WeWork, Uber, Grab, Compass, DoorDash': 1,
 'MPW': 4,
 'Nio, BYD Co.': 1,
 'Plug Power': 2,
 'MTW': 2,
 'USICA': 1,
 'Lülsdorf': 2,
 'HPCwire': 4,
 'Price Bank of America': 1,
 'Agenda & PR ©': 1,
 'Nikkei Asia': 3,
 'Bain & Co.': 1,
 'the Intel Parallel Computing Center': 1,
 'Unique': 11,
 'Barney and Associates': 1,
 'Si-N': 1,
 'BMW': 3,
 'Dimitra': 1,
 'JPMorgan': 1,
 'OECD': 1,
 'How Modern Health': 1,
 'Cerner': 2,
 'the U.S. Patent and Trademark Office': 1,
 'QLC': 1,
 'Kubis': 11,
 'ET Intel Corp.': 1,
 'MOSFET': 4,
 'Tridge': 2,
 'IndieBio': 2,
 'Removing Tax Barriers—': 1,
 'RPI': 5,
 'TAM': 17,
 'IUCRC': 1,
 'Digital Interfaces': 1,
 'Inez Kerr Bell Professors': 1,
 'DFT': 3,
 'ETF Winners and Losers': 1,
 'Ford Motor': 4,
 'Pfizer Inc': 2,
 'NAISE': 1,
 'VLSI Technology': 1,
 'Surge': 14,
 'Movers': 2,
 'Umicore AG & Co.': 1

In [166]:
sorted_freq_dict = {k: v for k, v in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)}

In [215]:
sorted_freq_dict

{'ARK': 319,
 'ASE': 290,
 'Ning': 241,
 'SEC': 190,
 'CED': 175,
 'MIT': 162,
 'Intel': 150,
 'NDA': 121,
 'EMS': 92,
 'ADI': 88,
 'Arm': 85,
 'FPGA': 79,
 'Mac': 74,
 'TSMC': 71,
 'Nio': 69,
 'Ford': 66,
 'SIA': 65,
 'NGL': 63,
 'NASDAQ': 61,
 'Quantum': 54,
 'ETF': 51,
 'DOE': 51,
 'Apple': 50,
 'FULL': 49,
 'NAM': 49,
 'Samsung': 49,
 'SPAC': 48,
 'COO': 41,
 'Phys': 41,
 'AMD': 40,
 'Reuters': 33,
 'Nvidia': 33,
 'DOI': 33,
 'Mental': 31,
 'CFIUS': 30,
 'NYSE': 29,
 'Fed': 28,
 'FREE': 28,
 'Merck': 23,
 'Telecom': 22,
 'Google': 20,
 'NSW': 19,
 'S&P': 18,
 'Factories': 18,
 'QMC': 18,
 'TAM': 17,
 'SoftBank': 17,
 'Commerce': 17,
 'Goal': 17,
 'LSI': 17,
 'ACES': 16,
 'HPC': 16,
 'Pfizer': 16,
 'Senate': 16,
 'RAMP': 15,
 'Surge': 14,
 'ISM': 13,
 'Committee': 13,
 'MIA': 13,
 'Makris': 12,
 'NXP': 12,
 'Unique': 11,
 'Kubis': 11,
 'QMCPACK': 11,
 'Toyota': 11,
 'GPU': 11,
 'Rambus': 11,
 'Qualcomm': 11,
 'CDC': 11,
 'MarketWatch': 10,
 'Berkeley': 10,
 'OEM': 9,
 'Insider': 9,


In [179]:
import json

In [216]:
with open('csvs/updated_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(sorted_freq_dict))

In [217]:
import itertools

thirty_sorted_freq_dict = dict(itertools.islice(sorted_freq_dict.items(), 30))

In [218]:
with open('csvs/updated_thirty_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(thirty_sorted_freq_dict))

In [196]:
keys = list(sorted_freq_dict.keys())

In [209]:
lower_sorted_freq_dict = {k.lower():sorted_freq_dict[k] for k in keys}

In [210]:
lower_sorted_freq_dict.pop('ark')

{'ark': 319,
 'ase': 290,
 'ning': 241,
 'sec': 190,
 'ced': 175,
 'mit': 162,
 'intel': 150,
 'nda': 121,
 'ems': 92,
 'adi': 88,
 'arm': 85,
 'fpga': 79,
 'mac': 74,
 'tsmc': 71,
 'nio': 69,
 'ford': 66,
 'sia': 65,
 'trade': 63,
 'ngl': 63,
 'nasdaq': 61,
 'quantum': 54,
 'etf': 51,
 'doe': 51,
 'apple': 50,
 'full': 49,
 'nam': 49,
 'samsung': 49,
 'spac': 48,
 'coo': 41,
 'phys': 41,
 'amd': 40,
 'reuters': 33,
 'nvidia': 33,
 'doi': 33,
 'mental': 31,
 'cfius': 30,
 'nyse': 29,
 'fed': 28,
 'free': 28,
 'merck': 23,
 'telecom': 22,
 'google': 20,
 'nsw': 19,
 's&p': 18,
 'factories': 18,
 'qmc': 18,
 'tam': 17,
 'softbank': 17,
 'commerce': 17,
 'goal': 17,
 'lsi': 17,
 'aces': 16,
 'hpc': 16,
 'pfizer': 16,
 'senate': 16,
 'ramp': 15,
 'surge': 14,
 'ism': 13,
 'committee': 13,
 'mia': 13,
 'makris': 12,
 'nxp': 12,
 'unique': 11,
 'kubis': 11,
 'qmcpack': 11,
 'toyota': 11,
 'gpu': 11,
 'rambus': 11,
 'qualcomm': 11,
 'cdc': 11,
 'marketwatch': 10,
 'berkeley': 10,
 'oem': 9,
 

In [211]:
with open('csvs/updated_sorted_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(lower_sorted_freq_dict))

In [212]:
import itertools

lower_thirty_sorted_freq_dict = dict(itertools.islice(lower_sorted_freq_dict.items(), 30))

In [213]:
with open('csvs/lower_updated_thirty_freq_company_names.json', 'w+', encoding='utf-8') as f:
    f.writelines(json.dumps(lower_thirty_sorted_freq_dict))