In [14]:
import pandas as pd
import numpy as np
import nltk 

from lda_helpers import * 
import sys
sys.path.append('../helpers_python')
from pre_processing import *
from helpers import *
%load_ext autoreload
%autoreload 2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Choose the country we want to work with
country = 'France'

# Load data
cleaned_tweets = load_data_lda(country).clean

## Params 

In [3]:
# Set the parameters for LDA
params = {
    'vec_repr': {
        'max_df' : 0.95,  # remove all words that appear in more than 95% of the documents
        'min_df' : 1,     # remove all words that appear only once
                },
    'LDA' : {
        'nb_topics' : 9,  # the number of clusters/topics
        'passes': 15,     # number of passes during LDA
        'random_state': 50# seed used 
            }
     }
params['LDA']['alpha'] = 1/params['LDA']['nb_topics'], # the distribution of topics per document paramaters
params['LDA']['beta'] = 1/params['LDA']['nb_topics'],  # the distribution of words per topics parameters

more_stop_words = ['u', 'amp', 'get', 'one', 'go'] # some words we want to remove because they are not interesting

### LDA with `gensim.LdaMulticore`

## Construct the models

In [4]:
dictionary = get_dictionary(cleaned_tweets, params, more_stop_words)
corpus = get_corpus_in_bow(cleaned_tweets,dictionary, more_stop_words)

## Find best parameters for alpha, betas and number of topics
### For the alphas 
The alpha parameter represents the document-topic relation. If alpha is big, every documents will have a lot of topics whereas if it is small each documents would be assigned to only one topic.

In our case, because the documents which are tweets are small, we will make alpha small, so that each tweet is assign to one or two topics at most.

Otherwise we can see that when alpha grows, we have in one topics both communism and black people which we would prefer not to have and have differenciating categories.

### For the betas
The beta parameter repesents the topics-word relation. If beta is big, all the topics will have the same distribution over word whereas when it is small the distibution are different and more words in the corpus are taken into account. 

For our purpose, because we want to explain the whole data and not only the main topic we have to set beta low i.e. < 0.1

### For the number of topics 
From the exploratory data analysis, we already saw that there were multiple topics like racist, antisemtism, american elections, covid so we try multiple values expecting that our best topic repartition is around 8.

In [5]:
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]   
betas  = [0.0001, 0.001, 0.01, 0.1, 1, 10]
tune_params(params,corpus, dictionary, alphas = alphas, betas = betas)

alpha = 0.0001, beta = 0.0001, nb_topics = 9, u_mass -16.04700
(0, '0.013*"black" + 0.010*"diversity" + 0.009*"time" + 0.007*"let" + 0.007*"war" + 0.007*"strength" + 0.006*"see" + 0.006*"conservative"')
(1, '0.018*"want" + 0.010*"people" + 0.008*"like" + 0.008*"black" + 0.008*"woman" + 0.008*"president" + 0.007*"white_people" + 0.007*"trump"')
(2, '0.025*"people" + 0.010*"know" + 0.008*"white" + 0.007*"never" + 0.006*"warn" + 0.006*"woman" + 0.006*"anti" + 0.006*"well"')
(3, '0.014*"many" + 0.012*"racist" + 0.012*"people" + 0.009*"mr" + 0.009*"know" + 0.009*"way" + 0.007*"let" + 0.007*"fuck"')
(4, '0.016*"white" + 0.014*"say" + 0.010*"attack" + 0.010*"white_people" + 0.009*"call" + 0.008*"time" + 0.007*"black" + 0.006*"people"')
(5, '0.015*"people" + 0.012*"twitter" + 0.008*"america" + 0.008*"today" + 0.008*"like" + 0.007*"day" + 0.007*"say" + 0.007*"face"')
(6, '0.008*"muslim" + 0.007*"know" + 0.007*"medium" + 0.007*"people" + 0.007*"yes" + 0.007*"believe" + 0.007*"lie" + 0.006*"part"

alpha = 0.0010, beta = 0.0010, nb_topics = 9, u_mass -16.04152
(0, '0.013*"black" + 0.010*"diversity" + 0.009*"time" + 0.007*"let" + 0.007*"war" + 0.007*"strength" + 0.006*"see" + 0.006*"alain_soral"')
(1, '0.018*"want" + 0.010*"people" + 0.008*"like" + 0.008*"black" + 0.008*"woman" + 0.008*"president" + 0.007*"white_people" + 0.007*"trump"')
(2, '0.025*"people" + 0.010*"know" + 0.008*"white" + 0.007*"woman" + 0.006*"never" + 0.006*"warn" + 0.006*"anti" + 0.006*"well"')
(3, '0.014*"many" + 0.012*"racist" + 0.012*"people" + 0.009*"mr" + 0.009*"know" + 0.009*"way" + 0.007*"let" + 0.007*"fuck"')
(4, '0.016*"white" + 0.014*"say" + 0.010*"attack" + 0.010*"white_people" + 0.009*"call" + 0.008*"time" + 0.007*"black" + 0.006*"people"')
(5, '0.015*"people" + 0.012*"twitter" + 0.008*"america" + 0.008*"today" + 0.008*"like" + 0.007*"day" + 0.007*"say" + 0.007*"face"')
(6, '0.008*"muslim" + 0.007*"know" + 0.007*"medium" + 0.007*"people" + 0.007*"yes" + 0.007*"believe" + 0.007*"lie" + 0.006*"part"'

alpha = 0.0100, beta = 0.0100, nb_topics = 9, u_mass -16.03100
(0, '0.012*"black" + 0.010*"diversity" + 0.007*"time" + 0.007*"let" + 0.007*"war" + 0.007*"strength" + 0.006*"see" + 0.006*"conservative"')
(1, '0.017*"want" + 0.009*"people" + 0.008*"woman" + 0.008*"like" + 0.008*"black" + 0.008*"president" + 0.007*"white_people" + 0.007*"trump"')
(2, '0.024*"people" + 0.010*"know" + 0.007*"white" + 0.006*"never" + 0.006*"warn" + 0.006*"woman" + 0.006*"anti" + 0.006*"well"')
(3, '0.012*"racist" + 0.011*"people" + 0.011*"many" + 0.008*"mr" + 0.008*"know" + 0.008*"way" + 0.007*"let" + 0.007*"fuck"')
(4, '0.015*"white" + 0.014*"say" + 0.010*"attack" + 0.009*"white_people" + 0.009*"call" + 0.008*"time" + 0.007*"black" + 0.006*"people"')
(5, '0.014*"people" + 0.012*"twitter" + 0.008*"america" + 0.008*"like" + 0.008*"today" + 0.007*"day" + 0.007*"say" + 0.007*"face"')
(6, '0.008*"muslim" + 0.007*"know" + 0.007*"medium" + 0.007*"people" + 0.007*"yes" + 0.007*"believe" + 0.006*"lie" + 0.005*"india

alpha = 0.1000, beta = 0.1000, nb_topics = 9, u_mass -16.43939
(0, '0.008*"diversity" + 0.006*"war" + 0.005*"black" + 0.005*"time" + 0.005*"white" + 0.005*"government" + 0.005*"strength" + 0.004*"let"')
(1, '0.015*"want" + 0.007*"president" + 0.007*"every" + 0.007*"day" + 0.007*"people" + 0.006*"like" + 0.006*"black" + 0.006*"woman"')
(2, '0.017*"people" + 0.008*"know" + 0.006*"woman" + 0.006*"white" + 0.006*"well" + 0.005*"black" + 0.005*"never" + 0.005*"white_people"')
(3, '0.010*"racist" + 0.009*"people" + 0.008*"many" + 0.007*"mr" + 0.007*"always" + 0.007*"let" + 0.006*"know" + 0.006*"stop"')
(4, '0.015*"white" + 0.013*"say" + 0.010*"white_people" + 0.007*"attack" + 0.007*"black" + 0.007*"call" + 0.006*"time" + 0.006*"try"')
(5, '0.011*"people" + 0.008*"twitter" + 0.006*"like" + 0.006*"today" + 0.005*"america" + 0.005*"face" + 0.004*"day" + 0.004*"friend"')
(6, '0.007*"hate" + 0.006*"muslim" + 0.005*"know" + 0.005*"medium" + 0.005*"people" + 0.004*"yes" + 0.004*"india" + 0.004*"lie

alpha = 1.0000, beta = 1.0000, nb_topics = 9, u_mass -14.21554
(0, '0.001*"people" + 0.001*"let" + 0.001*"time" + 0.001*"america" + 0.001*"democrat" + 0.001*"government" + 0.001*"trump" + 0.001*"play"')
(1, '0.006*"want" + 0.004*"people" + 0.003*"every" + 0.003*"like" + 0.002*"know" + 0.002*"never" + 0.002*"even" + 0.002*"trump"')
(2, '0.005*"people" + 0.002*"woman" + 0.002*"know" + 0.002*"germany" + 0.002*"warn" + 0.002*"power" + 0.001*"make" + 0.001*"gov"')
(3, '0.002*"racist" + 0.002*"people" + 0.002*"mr" + 0.001*"many" + 0.001*"america" + 0.001*"know" + 0.001*"need" + 0.001*"trump"')
(4, '0.008*"white" + 0.007*"black" + 0.006*"white_people" + 0.006*"attack" + 0.005*"say" + 0.004*"call" + 0.003*"police" + 0.002*"people"')
(5, '0.004*"twitter" + 0.002*"new" + 0.002*"account" + 0.002*"people" + 0.002*"day" + 0.002*"follower" + 0.002*"like" + 0.002*"year"')
(6, '0.001*"people" + 0.001*"like" + 0.001*"stop" + 0.001*"know" + 0.001*"play" + 0.001*"america" + 0.001*"woman" + 0.001*"governm

alpha = 10.0000, beta = 10.0000, nb_topics = 9, u_mass -12.31918
(0, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"say" + 0.000*"america" + 0.000*"know"')
(1, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"say" + 0.000*"america" + 0.000*"know"')
(2, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"say" + 0.000*"america" + 0.000*"know"')
(3, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"say" + 0.000*"america" + 0.000*"know"')
(4, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"say" + 0.000*"america" + 0.000*"know"')
(5, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"say" + 0.000*"america" + 0.000*"know"')
(6, '0.001*"people" + 0.000*"white" + 0.000*"white_people" + 0.000*"like" + 0.000*"black" + 0.000*"sa

In [6]:
nb_topics = range(3,12,2)
params['LDA']['beta'] = 0.1
params['LDA']['alpha'] = 0.1
tune_params(params,corpus, dictionary, nb_topics=nb_topics)

alpha = 0.1000, beta = 0.1000, nb_topics = 3, u_mass -13.47830
(0, '0.006*"say" + 0.005*"people" + 0.005*"back" + 0.005*"white" + 0.005*"time" + 0.004*"muslim" + 0.004*"war" + 0.004*"america"')
(1, '0.009*"people" + 0.007*"want" + 0.007*"like" + 0.006*"white" + 0.005*"black" + 0.005*"day" + 0.004*"every" + 0.004*"know"')
(2, '0.012*"people" + 0.008*"white_people" + 0.006*"know" + 0.005*"biden" + 0.005*"try" + 0.005*"woman" + 0.005*"make" + 0.005*"america"')

alpha = 0.1000, beta = 0.1000, nb_topics = 5, u_mass -14.77933
(0, '0.007*"democrat" + 0.006*"white" + 0.005*"people" + 0.005*"new" + 0.005*"twitter" + 0.005*"left" + 0.004*"biden" + 0.004*"let"')
(1, '0.008*"like" + 0.008*"want" + 0.007*"people" + 0.006*"day" + 0.005*"every" + 0.005*"trump" + 0.005*"never" + 0.005*"woman"')
(2, '0.013*"people" + 0.008*"know" + 0.006*"make" + 0.005*"lie" + 0.005*"try" + 0.005*"biden" + 0.005*"like" + 0.005*"woman"')
(3, '0.010*"people" + 0.010*"many" + 0.008*"racist" + 0.007*"know" + 0.006*"stop" +

### Topics 
We can give categories we used for labelling to these clusters
1. 'tweeter acccounts', 'communism - against bankers , against state  prder '
1. 'homophobia'
1. 'american elections'
1. 'farright', 'Alain Soral', 'communism - against bankers , against state  prder '
1. 'white/black - racism', 'women'
1. 'jew - antisemtism'
1. 'islam', 'terrorism'
1. 'immigration'
1. Other

Don't have 
- 'India - Pakistan'
- 'coronavirus - vaccination'
- 'fantasy sex play'
- 'homophobia'

In [7]:
params['LDA']['beta'] = 0.1
params['LDA']['alpha'] = 0.1
params['LDA']['nb_topics'] = 9
model = get_model(params, corpus, dictionary)
show_topics(model)

(0, '0.008*"diversity" + 0.006*"war" + 0.005*"black" + 0.005*"time" + 0.005*"white" + 0.005*"government" + 0.005*"strength" + 0.004*"let"')
(1, '0.015*"want" + 0.007*"president" + 0.007*"every" + 0.007*"day" + 0.007*"people" + 0.006*"like" + 0.006*"black" + 0.006*"woman"')
(2, '0.017*"people" + 0.008*"know" + 0.006*"woman" + 0.006*"white" + 0.006*"well" + 0.005*"black" + 0.005*"never" + 0.005*"white_people"')
(3, '0.010*"racist" + 0.009*"people" + 0.008*"many" + 0.007*"mr" + 0.007*"always" + 0.007*"let" + 0.006*"know" + 0.006*"stop"')
(4, '0.015*"white" + 0.013*"say" + 0.010*"white_people" + 0.007*"attack" + 0.007*"black" + 0.007*"call" + 0.006*"time" + 0.006*"try"')
(5, '0.011*"people" + 0.008*"twitter" + 0.006*"like" + 0.006*"today" + 0.005*"america" + 0.005*"face" + 0.004*"day" + 0.004*"friend"')
(6, '0.007*"hate" + 0.006*"muslim" + 0.005*"know" + 0.005*"medium" + 0.005*"people" + 0.004*"yes" + 0.004*"india" + 0.004*"lie"')
(7, '0.009*"people" + 0.008*"say" + 0.006*"right" + 0.005*"

In [8]:
topics = [
    ['tweeter acccounts', 'communism - against bankers , against state prder'],
    ['homophobia'],
    ['american elections'],
    ['farright', 'Alain Soral', 'communism - against bankers , against state prder'],
    ['white/black - racism', 'women'],
    ['jew - antisemtism'],
    ['islam', 'terrorism'],
    ['immigration'],
    []
]

In [9]:
# plot topics
data =  pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

  default_term_info = default_term_info.sort_values(


## Analysis 

In [26]:
# Get the df with the tweets labelled by hand
df_labels = get_labelled_tweets(country)
# Get the tweets that have been labelled
labeled_tweets = pd.DataFrame(cleaned_tweets[df_labels.index])
# Get the bow representation for these tweets
labeled_tweets['bow'] = get_corpus_in_bow(labeled_tweets['clean'],dictionary)
# Join the two dfs 
df_labels = labeled_tweets.join(df_labels).drop('translated', axis = 1)
df_labels.head(5)

Unnamed: 0_level_0,clean,bow,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
262,write wall white race slat genocide hyperbole ...,"[(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), ...",[white/black - racism]
443,one good thing french police never hesitate fi...,"[(8, 1), (52, 1), (53, 1), (54, 1), (55, 1), (...","[islam, farright]"
470,muslim ever respect religious teacher religion...,"[(8, 1), (79, 1), (80, 1), (81, 1), (82, 2), (...",[islam]
731,warn communist weaponize fake asian hate crime...,"[(66, 1), (110, 1), (111, 1), (112, 1), (114, ...",[jew - antisemtism]
1065,panic kitchen rebel restaurateur deprive aid b...,"[(184, 1), (185, 1), (186, 1), (187, 1), (188,...",[other]


In [27]:
# Get the most likelly topic 
df_labels['topics'] = df_labels.bow\
            .apply(lambda bow : model.get_document_topics(bow, minimum_probability = 0.3))\
            .apply(lambda tpcs : [topics[t[0]] for t in tpcs][0])
# Compare the topics given by LDA with the ones given by us by giving 1 if the topics are the same
df_labels['score'] = df_labels.apply(lambda row : sum([1 for topic in row.topics if topic in row.labels]),axis = 1)
# Compute the accuracy 
final_score = df_labels['score'].agg(sum)/len(df_labels)
print("LDA has an accuracy of %.4f to label the tweets as we did."%(final_score))

LDA has an accuracy of 0.0960 to label the tweets as we did.


### Coherence

In [31]:
documents = pd.DataFrame({"Document": cleaned_tweets,
                              "ID": range(len(cleaned_tweets))})

documents['bow'] = get_corpus_in_bow(documents['Document'],dictionary)
documents['Topic'] = documents.bow.apply(lambda bow : get_topic_from_bow(model, bow))
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
documents_per_topic['Document'] = documents_per_topic['Document'].apply(lambda s : s.split(' '))

In [33]:
from collections import Counter
texts = documents_per_topic.Document.tolist()
texts = [[t[0] for t in Counter(texts[i]).most_common(10)]for i in range(1,10)]

In [41]:
docs = load_data_lda(country).clean.apply(lambda x: str(x).split()).to_numpy()

In [44]:
coherence_c_v = CoherenceModel(texts = docs, topics=texts, corpus=corpus, dictionary = dictionary,  coherence='c_v').get_coherence()
coherence_c_v

KeyError: 'white'

In [38]:
coherence_u_mass = CoherenceModel(model=model, corpus=corpus, coherence='u_mass').get_coherence()
coherence_u_mass

-16.439386513609932