In [2]:
import pandas as pd
import numpy as np

from helpers import *
from pre_processing import *


%load_ext autoreload
%autoreload 2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
import pyLDAvis.gensim_models

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Load the tweets 
df = pd.read_csv('data/to_be_clustered.csv.gz', compression="gzip")
country = 'France'
clean_tweets = df[df.whcs == country].clean.copy()

## Params 

In [7]:
params = {
    'vec_repr': {
        'max_df' : 0.95, 
        'min_df' : 1, 
                },
    'LDA' : {
        'nb_topics' : 9,
        'passes': 10, 
        'random_state': 50
            }
     }
params['LDA']['alpha'] = 1/params['LDA']['nb_topics'],
params['LDA']['beta'] = 1/params['LDA']['nb_topics'],

more_stop_words = ['u', 'amp', 'get', 'one', 'go']

### LDA with `gensim.LdaMulticore`

In [8]:
def add_bi_tri_grams(data_words):
    bigram = gensim.models.Phrases(data_words, min_count=3, threshold=1) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=1)  
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_bi_tri = [bigram_mod[doc] for doc in data_words]
    data_bi_tri = [trigram_mod[bigram_mod[doc]] for doc in data_words]
    return data_bi_tri
def show_topics(model, nb_word_per_model = 8):
    for topic in model.show_topics(num_words=nb_word_per_model):
        print(topic)

def tune_params(params,corpus,nb_topics = [params['LDA']['nb_topics']], alphas = [params['LDA']['alpha']], betas = [params['LDA']['beta']]):
    base_beta = params['LDA']['beta']
    base_alpha = params['LDA']['alpha']
    base_k = params['LDA']['nb_topics']
    for k in nb_topics:
        params['LDA']['nb_topics'] = k
        for alpha in alphas :
            params['LDA']['alpha'] = alpha
            for beta in betas:
                params['LDA']['beta'] = beta
                model = get_model(params,corpus)
                print("alpha = %3.4f, beta = %3.4f, nb_topics = %d"%(alpha, beta, k))
                show_topics(model)
                print()
    params['LDA']['beta'] = base_beta
    params['LDA']['alpha'] = base_alpha
    params['LDA']['nb_topics'] = base_k
    
def get_model(params, corpus):
    return LdaMulticore(corpus=corpus, num_topics=params['LDA']['nb_topics'], alpha = params['LDA']['alpha'], eta = params['LDA']['beta'],
                        id2word=dictionary, workers=6, passes=params2['passes'], random_state=params2['random_state'])

## Construct the models

In [10]:
# models
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary

data_words = clean_tweets.apply(lambda x : remove_stop_words(str(x), more_stop_words).split(' ')).tolist()
data_words = add_bi_tri_grams(data_words)
dictionary = Dictionary(data_words)
dictionary.filter_extremes(no_below=params['vec_repr']['min_df'], no_above=params['vec_repr']['min_df'])
corpus = [dictionary.doc2bow(doc) for doc in data_words]

## Find best parameters for alpha, betas and number of topics
### For the alphas 
The alpha parameter represents the document-topic relation. If alpha is big, every documents will have a lot of topics whereas if it is small each documents would be assigned to only one topic.

In our case, because the documents which are tweets are small, we will make alpha small, so that each tweet is assign to one or two topics at most.

Otherwise we can see that when alpha grows, we have in one topics both communism and black people which we would prefer not to have and have differenciating categories.

### For the betas
The beta parameter repesents the topics-word relation. If beta is big, all the topics will have the same distribution over word whereas when it is small the distibution are different and more words in the corpus are taken into account. 

For our purpose, because we want to explain the whole data and not only the main topic we have to set beta low i.e. < 0.1

### For the number of topics 
From the exploratory data analysis, we already saw that there were multiple topics like racist, antisemtism, american elections, covid

In [84]:
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]   
betas  = [0.0001, 0.001, 0.01, 0.1, 1, 10]
alphas = [0.0001, 0.001, 0.01, 0.1]   
betas  = [0.0001, 0.001, 0.01, 0.1]
tune_params(params,corpus, alphas = alphas, betas = betas)

alpha = 0.0001, beta = 0.0001
(0, '0.010*"white" + 0.010*"account" + 0.009*"play" + 0.008*"know" + 0.007*"today" + 0.006*"yet" + 0.006*"follower" + 0.006*"allow"')
(1, '0.012*"stop" + 0.009*"white" + 0.008*"muslim" + 0.007*"never" + 0.006*"like" + 0.006*"racist" + 0.006*"must" + 0.005*"video"')
(2, '0.012*"black" + 0.011*"like" + 0.009*"let" + 0.007*"communist" + 0.007*"try" + 0.007*"trump" + 0.006*"want" + 0.006*"never"')
(3, '0.012*"america" + 0.008*"time" + 0.007*"attack" + 0.007*"country" + 0.006*"left" + 0.006*"power" + 0.006*"write" + 0.006*"state"')
(4, '0.015*"say" + 0.009*"call" + 0.008*"world" + 0.008*"live" + 0.008*"german" + 0.008*"covid" + 0.008*"christian" + 0.008*"sign"')
(5, '0.032*"white" + 0.012*"like" + 0.007*"muslim" + 0.006*"via" + 0.006*"woman" + 0.006*"jew" + 0.006*"make" + 0.005*"arrest"')
(6, '0.015*"many" + 0.013*"white" + 0.012*"know" + 0.011*"trump" + 0.008*"mr" + 0.007*"take" + 0.006*"end" + 0.006*"time"')
(7, '0.010*"keep" + 0.008*"really" + 0.008*"darker"

alpha = 0.0010, beta = 0.1000
(0, '0.008*"white" + 0.007*"play" + 0.006*"account" + 0.006*"know" + 0.005*"today" + 0.005*"yet" + 0.004*"allow" + 0.004*"new"')
(1, '0.009*"stop" + 0.007*"white" + 0.006*"muslim" + 0.005*"never" + 0.005*"video" + 0.005*"like" + 0.004*"racist" + 0.004*"must"')
(2, '0.009*"black" + 0.008*"like" + 0.006*"let" + 0.005*"communist" + 0.005*"try" + 0.005*"trump" + 0.005*"never" + 0.005*"want"')
(3, '0.008*"america" + 0.006*"time" + 0.005*"attack" + 0.005*"country" + 0.004*"left" + 0.004*"write" + 0.004*"power" + 0.004*"state"')
(4, '0.009*"say" + 0.005*"live" + 0.005*"world" + 0.005*"covid" + 0.005*"christian" + 0.005*"sign" + 0.005*"expose" + 0.005*"german"')
(5, '0.025*"white" + 0.009*"like" + 0.006*"want" + 0.005*"muslim" + 0.005*"call" + 0.004*"via" + 0.004*"jew" + 0.004*"woman"')
(6, '0.011*"many" + 0.010*"white" + 0.009*"know" + 0.007*"trump" + 0.006*"mr" + 0.005*"take" + 0.005*"end" + 0.005*"time"')
(7, '0.007*"keep" + 0.006*"really" + 0.006*"darker" + 0.

alpha = 0.1000, beta = 0.0100
(0, '0.012*"white" + 0.007*"account" + 0.007*"play" + 0.007*"know" + 0.006*"like" + 0.006*"attack" + 0.006*"america" + 0.006*"fuck"')
(1, '0.011*"stop" + 0.009*"white" + 0.007*"video" + 0.006*"never" + 0.006*"muslim" + 0.006*"racist" + 0.006*"must" + 0.006*"like"')
(2, '0.010*"let" + 0.010*"try" + 0.009*"black" + 0.007*"like" + 0.007*"trump" + 0.007*"communist" + 0.007*"want" + 0.006*"never"')
(3, '0.009*"america" + 0.008*"country" + 0.008*"time" + 0.007*"power" + 0.005*"first" + 0.005*"break" + 0.005*"good" + 0.005*"left"')
(4, '0.014*"say" + 0.013*"world" + 0.011*"live" + 0.008*"call" + 0.008*"german" + 0.007*"day" + 0.007*"christian" + 0.006*"germany"')
(5, '0.031*"white" + 0.014*"like" + 0.007*"muslim" + 0.006*"want" + 0.006*"jew" + 0.005*"woman" + 0.005*"attack" + 0.005*"guy"')
(6, '0.016*"white" + 0.015*"many" + 0.013*"know" + 0.010*"trump" + 0.010*"take" + 0.009*"mr" + 0.006*"time" + 0.006*"new"')
(7, '0.009*"call" + 0.009*"keep" + 0.009*"really" + 

In [91]:
nb_topics = range(3,12,2)
params['LDA']['beta'] = 0.001
params['LDA']['alpha'] = 0.001
tune_params(params,corpus, nb_topics=nb_topics)

alpha = 0.0100, beta = 0.0100, nb_topics = 3
(0, '0.011*"white" + 0.008*"know" + 0.005*"islamic" + 0.004*"black" + 0.004*"play" + 0.004*"america" + 0.004*"day" + 0.004*"biden"')
(1, '0.014*"white" + 0.007*"woman" + 0.005*"say" + 0.005*"attack" + 0.004*"stop" + 0.004*"racist" + 0.004*"video" + 0.004*"like"')
(2, '0.009*"black" + 0.007*"like" + 0.007*"america" + 0.006*"right" + 0.006*"trump" + 0.006*"many" + 0.005*"see" + 0.005*"say"')

alpha = 0.0100, beta = 0.0100, nb_topics = 5
(0, '0.015*"white" + 0.011*"know" + 0.008*"black" + 0.007*"today" + 0.006*"play" + 0.006*"new" + 0.005*"america" + 0.005*"twitter"')
(1, '0.012*"white" + 0.007*"woman" + 0.007*"video" + 0.007*"like" + 0.007*"stop" + 0.006*"say" + 0.005*"racist" + 0.005*"never"')
(2, '0.008*"trump" + 0.008*"black" + 0.006*"right" + 0.006*"like" + 0.006*"let" + 0.005*"year" + 0.005*"white" + 0.005*"president"')
(3, '0.010*"many" + 0.008*"america" + 0.006*"biden" + 0.006*"white" + 0.006*"attack" + 0.006*"muslim" + 0.005*"time" + 0

### Topics 

In [12]:
params['LDA']['beta'] = 0.001
params['LDA']['alpha'] = 0.001
params['LDA']['nb_topics'] = 9
model = get_model(params, corpus)
show_topics(model)

NameError: name 'params2' is not defined

In [1]:
# plot topics
data =  pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

NameError: name 'pyLDAvis' is not defined

## Analysis 