In [1]:
import pandas as pd
import numpy as np
import nltk 

from lda_helpers import * 
import sys
sys.path.append('../helpers_python')
from pre_processing import *
from helpers import *
%load_ext autoreload
%autoreload 2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Choose the country we want to work with
country = 'France'

# Load data
cleaned_tweets = load_data_lda(country).clean

## Params 

In [3]:
# Set the parameters for LDA
params = {
    'vec_repr': {
        'max_df' : 0.95,  # remove all words that appear in more than 95% of the documents
        'min_df' : 1,     # remove all words that appear only once
                },
    'LDA' : {
        'nb_topics' : 9,  # the number of clusters/topics
        'passes': 15,     # number of passes during LDA
        'random_state': 50# seed used 
            }
     }
params['LDA']['alpha'] = 1/params['LDA']['nb_topics'], # the distribution of topics per document paramaters
params['LDA']['beta'] = 1/params['LDA']['nb_topics'],  # the distribution of words per topics parameters

more_stop_words = ['u', 'amp', 'get', 'one', 'go'] # some words we want to remove because they are not interesting

### LDA with `gensim.LdaMulticore`

## Construct the models

In [4]:
dictionary = get_dictionary(cleaned_tweets, params, more_stop_words)
corpus = get_corpus_in_bow(cleaned_tweets, dictionary, more_stop_words)

## Find best parameters for alpha, betas and number of topics
### For the alphas 
The alpha parameter represents the document-topic relation. If alpha is big, every documents will have a lot of topics whereas if it is small each documents would be assigned to only one topic.

In our case, because the documents which are tweets are small, we will make alpha small, so that each tweet is assign to one or two topics at most.

Otherwise we can see that when alpha grows, we have in one topics both communism and black people which we would prefer not to have and have differenciating categories.

### For the betas
The beta parameter repesents the topics-word relation. If beta is big, all the topics will have the same distribution over word whereas when it is small the distibution are different and more words in the corpus are taken into account. 

For our purpose, because we want to explain the whole data and not only the main topic we have to set beta low i.e. < 0.1

### For the number of topics 
From the exploratory data analysis, we already saw that there were multiple topics like racist, antisemtism, american elections, covid so we try multiple values expecting that our best topic repartition is around 8.

In [None]:
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]   
betas  = [0.0001, 0.001, 0.01, 0.1, 1, 10]
tune_params(params,corpus, dictionary, alphas = alphas, betas = betas)

In [None]:
nb_topics = range(3,12,2)
params['LDA']['beta'] = 0.1
params['LDA']['alpha'] = 0.1
tune_params(params,corpus, dictionary, nb_topics=nb_topics)

### Topics 
We can give categories we used for labelling to these clusters
1. 'tweeter acccounts', 'communism - against bankers , against state  prder '
1. 'homophobia'
1. 'american elections'
1. 'farright', 'Alain Soral', 'communism - against bankers , against state  prder '
1. 'white/black - racism', 'women'
1. 'jew - antisemtism'
1. 'islam', 'terrorism'
1. 'immigration'
1. Other

Don't have 
- 'India - Pakistan'
- 'coronavirus - vaccination'
- 'fantasy sex play'
- 'homophobia'

In [6]:
params['LDA']['beta'] = 0.1
params['LDA']['alpha'] = 0.1
params['LDA']['nb_topics'] = 9
model = get_model(params, corpus, dictionary)
show_topics(model)

(0, '0.008*"diversity" + 0.006*"war" + 0.005*"black" + 0.005*"time" + 0.005*"white" + 0.005*"government" + 0.005*"strength" + 0.004*"let"')
(1, '0.015*"want" + 0.007*"president" + 0.007*"every" + 0.007*"day" + 0.007*"people" + 0.006*"like" + 0.006*"black" + 0.006*"woman"')
(2, '0.017*"people" + 0.008*"know" + 0.006*"woman" + 0.006*"white" + 0.006*"well" + 0.005*"black" + 0.005*"never" + 0.005*"white_people"')
(3, '0.010*"racist" + 0.009*"people" + 0.008*"many" + 0.007*"mr" + 0.007*"always" + 0.007*"let" + 0.006*"know" + 0.006*"stop"')
(4, '0.015*"white" + 0.013*"say" + 0.010*"white_people" + 0.007*"attack" + 0.007*"black" + 0.007*"call" + 0.006*"time" + 0.006*"try"')
(5, '0.011*"people" + 0.008*"twitter" + 0.006*"like" + 0.006*"today" + 0.005*"america" + 0.005*"face" + 0.004*"day" + 0.004*"friend"')
(6, '0.007*"hate" + 0.006*"muslim" + 0.005*"know" + 0.005*"medium" + 0.005*"people" + 0.004*"yes" + 0.004*"india" + 0.004*"lie"')
(7, '0.009*"people" + 0.008*"say" + 0.006*"right" + 0.005*"

In [5]:
topics = [
    ['tweeter acccounts', 'communism - against bankers , against state prder'],
    ['homophobia'],
    ['american elections'],
    ['farright', 'Alain Soral', 'communism - against bankers , against state prder'],
    ['white/black - racism', 'women'],
    ['jew - antisemtism'],
    ['islam', 'terrorism'],
    ['immigration'],
    []
]

In [None]:
# plot topics
data =  pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

## Analysis 

In [None]:
# Get the df with the tweets labelled by hand
df_labels = get_labeled_tweets(country)
# Get the tweets that have been labelled
labeled_tweets = pd.DataFrame(cleaned_tweets[df_labels.index])
# Get the bow representation for these tweets
labeled_tweets['bow'] = get_corpus_in_bow(labeled_tweets['clean'],dictionary)
# Join the two dfs 
df_labels = labeled_tweets.join(df_labels).drop('translated', axis = 1)
df_labels.head(5)

In [None]:
# Get the most likelly topic 
df_labels['topics'] = df_labels.bow\
            .apply(lambda bow : model.get_document_topics(bow, minimum_probability = 0.3))\
            .apply(lambda tpcs : [topics[t[0]] for t in tpcs][0])
# Compare the topics given by LDA with the ones given by us by giving 1 if the topics are the same
final_score = get_score(df_labels)
print("LDA has an accuracy of %.4f to label the tweets as we did."%(final_score))

### Coherence

In [7]:
documents = pd.DataFrame({"Document": cleaned_tweets,
                              "ID": range(len(cleaned_tweets))})

documents['bow'] = get_corpus_in_bow(documents['Document'],dictionary)
documents['Topic'] = documents.bow.apply(lambda bow : get_topic_from_bow(model, bow))
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
documents_per_topic['Document'] = documents_per_topic['Document'].apply(lambda s : s.split(' '))

In [8]:
from collections import Counter
texts = documents_per_topic.Document.tolist()
texts = [[t[0] for t in Counter(texts[i]).most_common(10)]for i in range(1,10)]

In [9]:
cleaned_tweets

139      muslim commit multiple terrorist attack across...
232      toxic upbringing environment anti white hatred...
256         nothing allow white white problem need address
262      write wall white race slat genocide hyperbole ...
265      ficed rothschild return face face reality hyst...
                               ...                        
22749    hesitate worn bottom people see inside let go ...
22751                 india innovative thrust jai swadeshi
22767    around world white people get dangerously fed ...
22778    fuck petty go year old man weak pathetic creature
22785    accord news source suspect detain release name...
Name: clean, Length: 900, dtype: object

In [10]:
docs = cleaned_tweets.apply(lambda x: str(x).split()).to_numpy()

In [11]:
docs

array([list(['muslim', 'commit', 'multiple', 'terrorist', 'attack', 'across', 'europe', 'gt', 'muslim', 'look', 'back', 'anger', 'random']),
       list(['toxic', 'upbringing', 'environment', 'anti', 'white', 'hatred', 'racism', 'cultivate', 'hollywood', 'medium', 'school', 'un']),
       list(['nothing', 'allow', 'white', 'white', 'problem', 'need', 'address']),
       list(['write', 'wall', 'white', 'race', 'slat', 'genocide', 'hyperbole', 'exaggeration']),
       list(['ficed', 'rothschild', 'return', 'face', 'face', 'reality', 'hysterical', 'assa', 'traor']),
       list(['refer', 'white', 'people', 'disease', 'probably', 'great', 'way', 'convince', 'u', 'racist', 'go', 'ahead']),
       list(['one', 'good', 'thing', 'french', 'police', 'never', 'hesitate', 'fire', 'weapon', 'knife', 'wield', 'muslim']),
       list(['yes', 'af', 'slam', 'merkel', 'import', 'culture', 'hate', 'woman', 'turn', 'berlin', 'baghdad', 'migrant', 'willi']),
       list(['apart', 'weapon', 'way', 'mamma',

In [12]:
coherence_c_v = CoherenceModel(texts = docs, topics=texts, corpus=corpus, dictionary = dictionary,  coherence='c_v').get_coherence()
coherence_c_v

KeyError: 'white'

In [None]:
coherence_u_mass = CoherenceModel(model=model, corpus=corpus, coherence='u_mass').get_coherence()
coherence_u_mass