In [59]:
# packages to store and manipulate data
import numpy as np
import pandas as pd

# model building package
import sklearn
import re
import unidecode
import demoji
import spacy
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from string import punctuation

nlp = spacy.load("pt")
nlp.Defaults.stop_words |= {"gt", "to","uol","mi","budddhetg","the", "ne", "vou", "ta", "via","ex", "pq", "vc","aa","pra","to", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"}
palavras_irrelevantes = set(stopwords.words('portuguese') + list(punctuation))

In [60]:
# df = pd.read_csv('data/Article.csv')
data = pd.read_csv("data/tweeterLeo/dataTweeter.csv", sep=";", encoding='utf8')

def cleanText(tweet):
    if "|" in tweet:
        tweet = tweet.split("|")[1]
        
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([0-9])", " ", tweet).split())
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    tweet = ' '.join(re.sub("[\_\|\.\,\"\'\!\?\:\;\$\-\(\)\=]", " ", tweet).split())
    tweet = tweet.lower()
    
    le = list(demoji.findall(tweet))
    for i in le:
        tweet = tweet.replace(i, "")
    
    if tweet.startswith('rt '):
        tweet = tweet.replace("rt ", "")
    
    lNewTweet = []
    for i in tweet.split(" "):
        if i not in palavras_irrelevantes and i not in STOP_WORDS:
            lNewTweet.append(i)
    
    newTweet = " ".join(lNewTweet)
        
    return unidecode.unidecode(newTweet.replace(" rt ", ""))

data["tweet_text_clean"] = data.text.apply(cleanText)

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
cv = CountVectorizer(max_df=0.95, min_df=2,)

In [62]:
# dtm = cv.fit_transform(df['Article'])
dtm = cv.fit_transform(data["tweet_text_clean"])

In [63]:
dtm

<2869x4016 sparse matrix of type '<class 'numpy.int64'>'
	with 23496 stored elements in Compressed Sparse Row format>

In [75]:
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=5,random_state=42)

In [76]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [81]:
# print(len(LDA.components_),type(LDA.components_))
print(LDA.components_)

[[ 0.20000352  0.20000473  2.19756117 ...  1.20013228  0.2000115
   0.22831849]
 [ 0.20000316  0.20039495  0.20000802 ...  1.19984747  1.20046813
   2.15525342]
 [14.19946457  0.2000041   2.20108195 ...  0.20000711  0.72361353
   0.21639297]
 [ 0.20012738  2.19959188  0.20000613 ...  0.20000595  0.20075823
   0.20001545]
 [ 0.20040138  0.20000434  0.20134273 ...  0.20000719  2.67514861
   0.20001967]]


In [82]:
single_topic = LDA.components_[0]
top_10_words = single_topic.argsort()[-10:]

In [83]:
for index in top_10_words:
    print(cv.get_feature_names()[index])

maguito
eua
milhoes
mega
homem
medico
mulher
rio
covid
gt


In [84]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')
    

THE TOP 15 WORDS FOR TOPIC #0
['anos', 'vivo', 'sena', 'vilela', 'paulo', 'maguito', 'eua', 'milhoes', 'mega', 'homem', 'medico', 'mulher', 'rio', 'covid', 'gt']


THE TOP 15 WORDS FOR TOPIC #1
['maia', 'senado', 'casa', 'trump', 'pandemia', 'eua', 'vacinacao', 'ouca', 'video', 'sp', 'aprova', 'gt', 'covid', 'bolsonaro', 'camara']


THE TOP 15 WORDS FOR TOPIC #2
['pessoas', 'suspeito', 'caso', 'preso', 'prisao', 'reveillon', 'justica', 'covid', 'ano', 'governo', 'policia', 'sp', 'gt', 'rio', 'anos']


THE TOP 15 WORDS FOR TOPIC #3
['alta', 'reino', 'unido', 'dias', 'interior', 'mulher', 'coronavac', 'anos', 'mortes', 'vacina', 'brasil', 'morre', 'gt', 'sp', 'covid']


THE TOP 15 WORDS FOR TOPIC #4
['biden', 'paises', 'natal', 'bolsonaro', 'pandemia', 'governo', 'brasil', 'milhoes', 'pfizer', 'coronavirus', 'eua', 'ano', 'gt', 'vacina', 'covid']




In [85]:
topic_results = LDA.transform(dtm)
print(topic_results.shape)
print(topic_results[0])


(2869, 5)
[0.02002639 0.02007929 0.91975869 0.02002374 0.02011189]


In [71]:
topic_results[0].round(2)

array([0.01, 0.01, 0.91, 0.01, 0.01, 0.01, 0.01])

In [73]:
data['Topic'] = topic_results.argmax(axis=1)
# topic_results

In [74]:
# data["Topic" == 1][""]
data[data['Topic'] == 4]["tweet_text_clean"].sample(20)
# data.head()

2497    russia vacinou pessoas covid pretende chegar m...
15      comissario uniao europeia critica imagens verg...
2375             revista time descreve pior ano ke yvbjgd
1        paises america latina previsao crescimento qrehg
1821    governo zera cobranca iof operacoes credito fe...
826     biden violacao dados governo representa grave ...
727     governo sp recebe lote milhoes doses vacina co...
699     estudo inedito usp identifica proteina organis...
2579    video mostra radiotelescopio desaba porto rico...
975     vaticano uso vacinas covid etico moralmente ac...
1133    agencia reguladora eua aprova uso emergencial ...
2344    marilia mendonca lidera paradas clipes lives s...
1409    reino unido vacinado quase pessoas covid irmto...
1330    reta enem hora relaxar pesado estudos revisar ...
2159    pazuello governo comprara vacina registro anvi...
1195    pedidos refugio brasil despenca pandemia wgocs...
1756    liberacao vacina eua garante aprovacao automat...
1213     mike 