In [3]:
import pandas as pd
from ast import literal_eval
import gc
from sklearn.feature_extraction.text import CountVectorizer

path = "../clean_data/lemmatized_tokenized_data.csv"

col = "tokenized_text"

df = pd.read_csv(path, usecols=[col])
    
df.head() 

Unnamed: 0,tokenized_text
0,"['last', 'federal', 'reserve', 'board', 'issue..."
1,"['test', 'door', 'service', 'service', 'board'..."
2,"['sanction', 'chinese', 'contract']"
3,"['lead', 'frazier', '4', '496', 'langer', 'mar..."
4,"['chicago', 'april', '30', '300', 'suspicious'..."


In [None]:

df[col] = df[col].apply(lambda x: " ".join(literal_eval(x)))
df.head()

Unnamed: 0,tokenized_text
0,last federal reserve board issued rule organiz...
1,test door service service board examine r safe...
2,sanction chinese contract
3,lead frazier 4 496 langer margin fall north da...
4,chicago april 30 300 suspicious character incl...


In [5]:
vectorizer = CountVectorizer()

data = vectorizer.fit_transform(df[col])

In [29]:
import pickle
with open("count_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [8]:
data.shape

(11027532, 1328101)

In [39]:
from scipy import sparse
from scipy.sparse import csr_matrix, save_npz, load_npz
import numpy as np

save_npz('dtm.npz', data, compressed=True)

In [40]:
ndata = load_npz('dtm.npz')
type(ndata)

scipy.sparse._csr.csr_matrix

In [54]:
from sklearn.decomposition import LatentDirichletAllocation
import gc
N_TOPICS = 20
chunk_size = 100000

lda = LatentDirichletAllocation(
    n_components=N_TOPICS,
    learning_method='online',  # Allows partial_fit
    random_state=42,
    batch_size=chunk_size,
    max_iter=2,                  # One pass per chunk,
    n_jobs=-1
)

fitted = lda.fit(data)


In [55]:
def print_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic #{topic_idx+1}: {' | '.join(top_features)}")

print_top_words(lda, vectorizer.get_feature_names_out(), 100)

Topic #1: said | state | would | united | year | government | new | american | nation | today | country | world | president | tax | official | could | economic | last | billion | trade | may | bank | administration | week | one | say | international | deal | month | plan | time | two | economy | budget | foreign | europe | next | problem | political | policy | business | financial | lead | first | agreement | talk | change | european | washington | many | crisis | money | video | cut | major | since | security | 1988 | national | war | britain | long | effort | expected | still | much | even | people | way | make | global | power | minister | term | news | end | federal | issue | america | move | british | interest | meeting | debt | whether | made | prime | might | spending | public | question | dollar | military | capital | future | page | france | credit | hope | step
Topic #2: new | music | theater | art | show | night | york | dance | performance | hall | play | review | concert |

In [56]:
import pickle

with open(f'lda_model-n-topics-{N_TOPICS}.pk', 'wb') as f:
    pickle.dump(lda, f)


# Check reloading the model

In [57]:
l_lda = None
with open(f'lda_model-n-topics-5.pk', 'rb') as f:
    l_lda = pickle.load(f)

l_lda

0,1,2
,n_components,5
,doc_topic_prior,
,topic_word_prior,
,learning_method,'online'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,1
,batch_size,100000
,evaluate_every,-1
,total_samples,1000000.0


In [58]:
l_v = None
with open(f'count_vectorizer.pkl', 'rb') as f:
    l_v = pickle.load(f)
l_v

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [59]:
def print_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic #{topic_idx+1}: {' | '.join(top_features)}")

print_top_words(l_lda, l_v.get_feature_names_out(), 100)

Topic #1: say | said | american | world | new | united | state | war | government | country | official | would | president | nation | people | today | china | day | talk | force | leader | military | time | news | may | one | soviet | foreign | could | iraq | economic | attack | year | two | many | week | trade | political | south | europe | minister | last | article | plan | international | israel | prime | british | air | market | japan | power | policy | problem | end | effort | nuclear | european | way | israeli | still | lead | security | briefing | since | see | make | chinese | even | party | photo | month | group | peace | move | french | russia | economy | bush | west | army | crisis | britain | need | america | north | question | russian | much | issue | first | iran | france | take | washington | change | back | recent | administration | german
Topic #2: new | mr | york | year | one | photo | art | time | death | life | book | family | john | child | street | old | like | sh