In [31]:
import pandas as pd
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

from pre_processing import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# Load data
df = pd.read_csv('data/to_be_clustered.csv.gz', compression="gzip")

# array that can save the models
saved_models = []

In [33]:
import pickle
def save_model(model, params, message = ""):
    saved_models.append((model, params, message))
    
def write_good_params():
    pickle.dump(saved_models, open( "data/saved_models/save_models_for_"+country+".pkl", "wb" ))
    
def get_model(params, additional_stop_words =  []) :
    embedding_model = SentenceTransformer("all-mpnet-base-v2") #'digio/Twitter4SSE'
    s = list(stopwords.words('english'))
    vectorizer_model = CountVectorizer(stop_words=s)
    
    umap_model = UMAP(n_neighbors = params['UMAP']['n_neighbors'], 
                  n_components = params['UMAP']['n_components'], 
                  min_dist = params['UMAP']['min_dist'], 
                  metric = params['UMAP']['metric'], 
                  low_memory = params['UMAP']['low_memory'], 
                  random_state = params['UMAP']['random_state'])

    hdbscan_model = HDBSCAN(min_cluster_size = params['HDBSCAN']['min_cluster_size'],
                       min_samples = params['HDBSCAN']['min_samples'],
                       cluster_selection_epsilon = params['HDBSCAN']['cluster_selection_epsilon'],
                       metric = params['HDBSCAN']['metric'],                      
                       cluster_selection_method = params['HDBSCAN']['cluster_selection_method'],
                       prediction_data = params['HDBSCAN']['prediction_data'])

    model = BERTopic(
        umap_model = umap_model,
        vectorizer_model=vectorizer_model,
        hdbscan_model = hdbscan_model,
        embedding_model=embedding_model,
        language='english', calculate_probabilities=False,
        verbose=True
    )
    return model

## Chose the country

In [34]:
country = 'France'

## Set parameters

In [35]:
## Best params until now but cannot catch coronavirus
params = {
    'UMAP' : {
        'metric' : 'cosine', 
        'n_neighbors' : 15, 
        'n_components' : 20, 
        'min_dist' : 0.3, 
        'low_memory' : False,
        'random_state': 8
    },
    'HDBSCAN': {
        'min_cluster_size':15,
        'min_samples': 1,
        'cluster_selection_epsilon': 0.6,
        'metric': 'euclidean',                      
        'cluster_selection_method': 'eom',
        'prediction_data': True}
}

## Load Model

In [36]:
df = df[df.whcs == country]
df.drop(df[df.clean.isna()].index,inplace =True)
tweets = df.clean.to_list()
model = get_model(params, additional_stop_words= ['lol','true', 'amp'])

In [37]:
topics, probs = model.fit_transform(tweets)

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

2022-12-13 20:42:08,229 - BERTopic - Transformed documents to Embeddings
2022-12-13 20:42:13,335 - BERTopic - Reduced dimensionality
2022-12-13 20:42:13,498 - BERTopic - Clustered reduced embeddings


In [38]:
df['topics'] = topics
my_topcis = ['white/black - racism', 'islam - terrorism', 'coronavirus - vaccination', 
          'american elections', 'India - Pakistan', 'jew - antisemtism', 'farright - anti communism'
          'tweeter acccounts', 'fantasy sex play', 'Alain Soral', 'immigration', 'homophobia']

In [39]:
model.visualize_heatmap()

In [40]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,179,-1_people_get_want_mr
1,0,189,0_white_people_black_racist
2,1,165,1_muslim_biden_migrant_amp
3,2,101,2_nigga_lil_twitter_fuck
4,3,94,3_trump_cpac_power_communism
5,4,46,4_india_hindu_pakistan_modi
6,5,31,5_jew_anti_say_semitism
7,6,22,6_play_sit_fantasy_day
8,7,22,7_soral_alain_answer_episode
9,8,18,8_berry_jeane_manson_assange


In [41]:
model.get_topics()

{-1: [('people', 0.025229362252351013),
  ('get', 0.023595742716328572),
  ('want', 0.019696810594882617),
  ('mr', 0.019376256926973284),
  ('gun', 0.019376256926973284),
  ('america', 0.019132921995857796),
  ('amp', 0.01904753534984963),
  ('government', 0.018202150463911597),
  ('left', 0.017727129535394355),
  ('world', 0.01769474468343855)],
 0: [('white', 0.14023304184882154),
  ('people', 0.07314173581047005),
  ('black', 0.06148606504322554),
  ('racist', 0.028609679929422836),
  ('racism', 0.026848362660787595),
  ('diversity', 0.02468452847922403),
  ('war', 0.024261857489089413),
  ('supremacist', 0.024139906987789277),
  ('get', 0.023833679440132347),
  ('race', 0.023187324456103147)],
 1: [('muslim', 0.04763870485892607),
  ('biden', 0.039749697452471276),
  ('migrant', 0.03168321153429926),
  ('amp', 0.03072305701816145),
  ('islamic', 0.026065087685097042),
  ('attack', 0.0251655885114386),
  ('germany', 0.025009372566648858),
  ('break', 0.02460004398918951),
  ('refug

In [42]:
model.find_topics("terrorism")

([1, 5, 3, -1, 0],
 [0.5957646661869934,
  0.5302741385820826,
  0.5291504266860951,
  0.5257780479770615,
  0.4612791744735536])

In [43]:
model.visualize_documents(tweets)

In [44]:
save_model(model.get_topics(), params, "12 topics, 225 unmatchde, not bad with trump appeering twice but for different topics")

In [45]:
write_good_params()

### Topics 
We can give categories we used for labelling to these clusters
1. 'white/black - racism'
1. 'immigration', 'islam'
1. 'american elections', 'farright'
1. 'India - Pakistan'
1. 'tweeter acccounts'
1. 'jew - antisemtism'
1. 'fantasy sex play'
1. 'Alain Soral'
1. 'coronavirus - vaccination'
1. 'communism - against bankers, against state order', 'farright

categories = 

Don't have :
['terrorism', 'homophobia', 'women', 'other', "Don't know"]

In [46]:
my_topics = [['white/black - racism'],
    ['immigration', 'islam'],
    ['american elections', 'farright'],
    ['India - Pakistan'],
    ['tweeter acccounts'],
    ['jew - antisemtism'],
    ['fantasy sex play'],
    ['Alain Soral', 'farright'],
    ['coronavirus - vaccination'],
    ['communism - against bankers, against state order', 'farright'],
    ['other']]

In [47]:
df_labeled = get_labeled_tweets(country = 'France')

In [48]:
labels_idx = df_labeled.index
labeled_tweets = df_labeled.join(df.loc[labels_idx][['topics','clean']])

In [49]:
labeled_tweets

Unnamed: 0_level_0,translated,labels,topics,clean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
262,The writing is on the wall. The White race is ...,[white/black - racism],0,write wall white race slat genocide hyperbole ...
443,"One good thing about French police, they never...","[islam, farright]",1,one good thing french police never hesitate fi...
470,No Muslim ever respects a religious teacher of...,[islam],1,muslim ever respect religious teacher religion...
731,WARNING: Communists Weaponize Fake Asian Hate ...,[jew - antisemtism],9,warn communist weaponize fake asian hate crime...
1065,Panic in the kitchen: rebel restaurateurs depr...,[other],7,panic kitchen rebel restaurateur deprive aid b...
...,...,...,...,...
21752,I didn't think it was possible for a man to pr...,[women],2,think possible man produce much estrogen
21823,This is how the new regime is going to force t...,"[communism - against bankers , against state ...",3,new regime go force mass obey send people disa...
22079,Appalling • homeless queuing for food last n...,[other],-1,appal homeless queue food last night soup kitc...
22390,You can have diversity or you can have strengt...,[homophobia],0,diversity strength army choose diversity


In [50]:
labeled_tweets['topics'] = labeled_tweets['topics'].apply(lambda x: my_topics[x])

In [51]:
labeled_tweets

Unnamed: 0_level_0,translated,labels,topics,clean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
262,The writing is on the wall. The White race is ...,[white/black - racism],[white/black - racism],write wall white race slat genocide hyperbole ...
443,"One good thing about French police, they never...","[islam, farright]","[immigration, islam]",one good thing french police never hesitate fi...
470,No Muslim ever respects a religious teacher of...,[islam],"[immigration, islam]",muslim ever respect religious teacher religion...
731,WARNING: Communists Weaponize Fake Asian Hate ...,[jew - antisemtism],"[communism - against bankers, against state or...",warn communist weaponize fake asian hate crime...
1065,Panic in the kitchen: rebel restaurateurs depr...,[other],"[Alain Soral, farright]",panic kitchen rebel restaurateur deprive aid b...
...,...,...,...,...
21752,I didn't think it was possible for a man to pr...,[women],"[american elections, farright]",think possible man produce much estrogen
21823,This is how the new regime is going to force t...,"[communism - against bankers , against state ...",[India - Pakistan],new regime go force mass obey send people disa...
22079,Appalling • homeless queuing for food last n...,[other],[other],appal homeless queue food last night soup kitc...
22390,You can have diversity or you can have strengt...,[homophobia],[white/black - racism],diversity strength army choose diversity


In [52]:
get_score(labeled_tweets)

0.456

## Metric

In [53]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [71]:
# Preprocess Documents
documents = pd.DataFrame({"Document": tweets,
                          "ID": range(len(tweets)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

In [59]:
# Extract vectorizer and analyzer from BERTopic
vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()

In [61]:
# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

In [64]:
# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='u_mass')
coherence = coherence_model.get_coherence()

In [65]:
coherence

-0.43320278277544444