In [7]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import pickle as pkl

In [8]:
# Load the cleaned, lemmatised tweets. Forward slashes are used because they
# are accepted as path separators on Windows as well as on POSIX systems,
# whereas a backslash-separated path only resolves on Windows.
df = pd.read_pickle("../datasets/tweets_cleaned_lemma_stopwords.pkl.gz")

In [9]:
RANDOM_SEED = 42
# Seed the global NumPy RNG for any later code that draws from np.random.
np.random.seed(RANDOM_SEED)

# Shuffle the tweet texts in place with a dedicated, seeded stdlib RNG.
tweets = df.tweet.to_list()
random.Random(RANDOM_SEED).shuffle(tweets)

# Pass the seed explicitly so the 80/20 split does not depend on how much of
# the global NumPy random state has been consumed before this line runs
# (e.g. when cells are re-executed or run out of order).
train_tweets, test_tweets = train_test_split(
    tweets, train_size=0.8, random_state=RANDOM_SEED
)

# Build the vocabulary on the training tweets only; the held-out tweets are
# encoded with that same vocabulary.
vectorizer = CountVectorizer(min_df=2, max_df=0.95, max_features=4000)
train_counts = vectorizer.fit_transform(train_tweets)
test_counts = vectorizer.transform(test_tweets)


In [14]:
import plotly.express as px

# Total number of occurrences of each vocabulary term in the training matrix.
counts = np.array(train_counts.sum(axis=0)).squeeze()

# Order the per-term totals from largest to smallest and accumulate them.
counts_sorted = np.flip(np.sort(counts))
values = counts_sorted.cumsum()

# Share of all counted occurrences that is NOT covered by the i most frequent
# terms (starts just below 1 and falls to 0 at the full vocabulary).
total_count = counts_sorted.sum()
loss = -(values - total_count) / total_count

# Render the curve in an external browser tab.
fig = px.line(loss)
fig.show(renderer='browser')

In [11]:
# Persist the fitted vectorizer next to the notebook.
# NOTE: despite the ".gz" suffix the file is written through a plain binary
# handle, i.e. it is an uncompressed pickle.
with open("vectorizer_4000.pkl.gz", "wb") as vectorizer_file:
    pkl.dump(vectorizer, vectorizer_file, protocol=pkl.HIGHEST_PROTOCOL)

In [12]:
# Fit one LDA model per candidate topic count, record train/test perplexity
# as a semicolon-separated row, and pickle every fitted model.
with open("lda_results.csv", "wt") as results_file:
    results_file.write("components;train;test\n")

    for k_components in (10, 15, 20, 25, 30):
        print(f"Fitting {k_components}")
        lda = LatentDirichletAllocation(
            n_components=k_components,
            max_iter=100,
            evaluate_every=10,
            verbose=True,
            n_jobs=15,
            random_state=42,
        )
        lda.fit(train_counts)

        train_perplexity = lda.perplexity(train_counts)
        test_perplexity = lda.perplexity(test_counts)
        result_row = f"{k_components};{train_perplexity};{test_perplexity}\n"
        results_file.write(result_row)
        # Flush after every model so finished rows are on disk even if a
        # later fit is interrupted.
        results_file.flush()

        # Stored as a plain (uncompressed) pickle despite the ".gz" suffix.
        with open(f"lda_{k_components}.pkl.gz", "wb") as model_file:
            pkl.dump(lda, model_file, protocol=pkl.HIGHEST_PROTOCOL)

    

Fitting 10
iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 1827.5571
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100, perplexity: 1761.9150
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100, perplexity: 1747.1276

iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100, perplexity: 2282.7258
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100, perplexity: 2286.4650
iteration: 31 of max_iter: 100
iteration: 32 of max_iter: 100
iteration: 33 of max_iter: 100
iteration: 34 of max_iter: 100
iteration: 35 of max_iter: 100
iteration: 36 of max_iter: 100
iteration: 37 of max_iter: 100
iteration: 38 of max_iter: 100
iteration: 39 of max_iter: 100
iteration: 40 of max_iter: 100, perplexity: 2286.3272
iteration: 41 of max_iter: 100
iteration: 42 of max_iter: 100
iteration: 43 of max_iter: 100
iteration: 44 of max_iter: 100
iteration: 45 of max_iter: 100
iteration: 46 of max_iter: 100
iteration: 47 of max_iter: 100
i

In [23]:
# Reload the previously saved vectorizer and the 15-topic model.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook or another trusted source.
with open("vectorizer_4000.pkl.gz", "rb") as vectorizer_file:
    vectorizer = pkl.load(vectorizer_file)

with open("lda_15.pkl.gz", "rb") as model_file:
    lda = pkl.load(model_file)


In [24]:
import numpy as np

# For every row of lda.components_, the column indices ordered from the
# largest weight to the smallest: argsort is ascending, so each row is
# reversed afterwards.
ascending_indices = np.argsort(lda.components_, axis=-1)
top_words_indices = ascending_indices[:, ::-1]

In [25]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new method
# and fall back to the old one so the cell runs on either side of the change.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map every ranked column index to its vocabulary term; the result has the
# same shape as top_words_indices.
top_words = np.asarray(feature_names)[top_words_indices]


In [26]:
# Lift pandas' display limits so every column and full cell text is shown.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

# Show the 30 highest-ranked words of each row (one row per topic).
pd.DataFrame(top_words[:, :30])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,sejm,posiedzieć,ustrój,sprawa,komenda,pis,słońce,prom,zmiana,radna,wnioskodawca,marsz,góra,post,senat,uzdrowićpolskę,rt,preferencja,jutro,parlamentaryzm,gwiazda,poprzeć,senator,stanowisko,praktyka,pech,opowieść,niepokój,więzienie,marka
1,knf,sądnajwyższy,fabryka,słabość,sprawa,opieka,preferencja,zgoda,rodak,promować,politechnika,timmermansa,żołnierz,radom,dedykacja,wyróżnienie,bohaterka,podsumowanie,sens,juncker,król,ręka,jajko,ziobro,wojtek,artur,cześć,promocja,legislacja,tryb
2,dziesiątka,spotkanie,dożynki,mieszkaniec,linia,kolumna,kp,samorządowiec,sld,koalicjaobywatelska,budować,klimatnazmianę,glapiński,powietrze,regulamin,zespół,wstrzymać,ranking,wojsko,gwiazda,wrzesień,praktyka,gust,internauta,klasyk,inauguracja,limuzyna,unia,okrzyk,radość
3,reakcja,minister,pretekst,rządzący,pis,nagranie,tychy,morderca,teczka,szefowa,miesiąc,machina,wiedza,sprawa,wielkanoc,spółka,plan,poznaniak,afera,monika,wola,konsultacja,samopoczucie,newsweek,tvp3,wiceprezes,obietnica,echo,samoobrona,ministerstwo
4,partiaobciachu,człowiek,kuchciński,kwintesencja,wyraz,wybory2015,kuźmiuk,ignorancja,część,pogrzeb,cenzura,terytorium,pomorzezachodnie,ulotka,realia,wolnemediawsejmie,instytut,lotos,turystyka,dotrzymujemysłowa,level,manifa,liga,nazywać,cba,posterunek,spokój,metoda,stypendium,hotel
5,pobić,debatape,stan,tvn,święto,turysta,radio,pozdrowienie,imperium,publ,magdalena,aresztowanie,wiatr,odwaga,trudność,fm,mucha,wirus,aparat,tożsamość,zamachlipcowy,spór,pbs,retoryka,domena,poszanowanie,grudziądz,związek,wincenty,ruchnarodowy
6,wybory2019,miejsce,świat,rozpad,wypoczynek,zdarzenie,niekompetencja,puszka,sondaż,myśliwy,szef,wroclaw,pech,minuta,konieczność,bełkot,mistrzostwo,fin,zaprzysiężenie,wieczor,wesele,teresa,nienawiść,miał,pole,zakażenie,komuna,filip,magdalena,azyl
7,prezydentura,justyna,kondolencja,zdobyć,szlak,henryk,kalendarz,pis,medium,rzecznik,janina,długopis,naukowiec,wiarygodność,festiwal,europejczyk,konto,bój,przyjmować,ziemia,wysłuchanie,dobragospodyni,uczucie,ojciec,projekt,zwycięstwo,przekroczyć,tysiąc,gosiewska,robota
8,czas,prolife,strona,udzielić,import,gospodarka,gmach,dolar,platforma,połowa,wydawnictwo,prowadzić,powstanie,materia,senior,program,ekonomia,moda,oglądalność,tusk,salon24,klip,wielkość,telewizor,gospodarz,telefon,anta,at,komórka,węgier
9,pieniądz,pomysł,bulwar,polskajestjedna,rządzący,środek,rpo,filozofia,praktyka,podatnik,gowin,gol,centrala,państwopis,kosiniak,ugrupowanie,prawakobiet,pl,rywal,system,przedstawić,frazes,etat,celebryta,rozwiązywać,wódka,cameron,rondo,elita,karetka


In [13]:
def predict(lda, vectorizer, texts):
    """Return the index of the highest-scoring topic for every text.

    Args:
        lda: Fitted topic model; its ``transform`` is applied to the
            vectorised texts.
        vectorizer: Fitted vectorizer; its ``transform`` turns ``texts``
            into the representation passed to ``lda``.
        texts: Iterable of documents to label.

    Returns:
        Array holding, for each text, the position of the maximum along
        axis 1 of the ``lda.transform`` output.
    """
    doc_topic_scores = lda.transform(vectorizer.transform(texts))
    return np.argmax(doc_topic_scores, axis=1)

def show_random_k_tweets_from_each_class(lda, vectorizer, texts, k=10):
    """Print and return up to ``k`` random texts for every topic.

    Each text is assigned to a topic with ``predict``. For every topic a
    random sample (without replacement) of the texts assigned to it is
    printed and collected.

    Args:
        lda: Fitted topic model; the number of topics is taken from the
            first dimension of ``lda.components_``.
        vectorizer: Fitted vectorizer forwarded to ``predict``.
        texts: Sequence of documents to sample from.
        k: Maximum number of texts to show per topic. Topics with fewer
            than ``k`` assigned texts show all of them instead of raising.

    Returns:
        Dict mapping each topic index to the list of sampled texts
        (empty list when no text was assigned to that topic).
    """
    # Convert once; boolean-mask indexing below needs an array.
    texts_array = np.asarray(texts)
    predictions = predict(lda, vectorizer, texts)
    n_topics = np.shape(lda.components_)[0]

    samples = {}
    for label in range(n_topics):
        topic_texts = texts_array[predictions == label]
        # Sampling without replacement cannot draw more items than exist.
        sample_size = min(k, len(topic_texts))
        if sample_size > 0:
            k_texts = np.random.choice(
                topic_texts, sample_size, replace=False
            ).tolist()
        else:
            k_texts = []
        samples[label] = k_texts

        print(f"Topic {label} ({len(topic_texts)} texts)")
        for text in k_texts:
            print(f"  {text}")
    return samples

In [16]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import pickle as pkl

# Retrain on the complete corpus (no train/test split) with 15 topics.
# Forward slashes are used because they are accepted as path separators on
# Windows as well as on POSIX systems.
df = pd.read_pickle("../datasets/tweets_cleaned_lemma_stopwords.pkl.gz")

tweets = df.tweet.to_list()

# Vocabulary is now built from every tweet rather than a training subset.
vectorizer = CountVectorizer(min_df=2, max_df=0.95, max_features=4000)
counts = vectorizer.fit_transform(tweets)

# Stored as a plain (uncompressed) pickle despite the ".gz" suffix.
with open("vectorizer_4000_all.pkl.gz", "wb") as vec_file:
    pkl.dump(vectorizer, vec_file, protocol=pkl.HIGHEST_PROTOCOL)

lda = LatentDirichletAllocation(
    n_components=15,
    random_state=42,
    verbose=True,
    evaluate_every=10,
    n_jobs=15,
    max_iter=100,
)
lda.fit(counts)


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 1874.7515
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100, perplexity: 1832.7209
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100, perplexity: 1826.0008
iteration:

ValueError: I/O operation on closed file.

In [17]:



# Persist the 15-topic model fitted on the full corpus.
# Stored as a plain (uncompressed) pickle despite the ".gz" suffix.
with open("lda_15_all.pkl.gz", "wb") as full_model_file:
    pkl.dump(lda, full_model_file, protocol=pkl.HIGHEST_PROTOCOL)