In [85]:
import pandas as pd
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

from pre_processing import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load data
df = pd.read_csv('data/to_be_clustered.csv.gz', compression="gzip")

# array that can save the models
saved_models = []

In [3]:
import pickle
def save_model(model, params, message = ""):
    saved_models.append((model, params, message))
    
def write_good_params():
    pickle.dump(saved_models, open( "data/saved_models/save_models_for_"+country+".pkl", "wb" ))
    
def get_model(params, additional_stop_words =  []) :
    embedding_model = SentenceTransformer("all-mpnet-base-v2") #'digio/Twitter4SSE'
    s = list(stopwords.words('english'))
    vectorizer_model = CountVectorizer(stop_words=s)
    
    umap_model = UMAP(n_neighbors = params['UMAP']['n_neighbors'], 
                  n_components = params['UMAP']['n_components'], 
                  min_dist = params['UMAP']['min_dist'], 
                  metric = params['UMAP']['metric'], 
                  low_memory = params['UMAP']['low_memory'], 
                  random_state = params['UMAP']['random_state'])

    hdbscan_model = HDBSCAN(min_cluster_size = params['HDBSCAN']['min_cluster_size'],
                       min_samples = params['HDBSCAN']['min_samples'],
                       cluster_selection_epsilon = params['HDBSCAN']['cluster_selection_epsilon'],
                       metric = params['HDBSCAN']['metric'],                      
                       cluster_selection_method = params['HDBSCAN']['cluster_selection_method'],
                       prediction_data = params['HDBSCAN']['prediction_data'])

    model = BERTopic(
        umap_model = umap_model,
        vectorizer_model=vectorizer_model,
        hdbscan_model = hdbscan_model,
        embedding_model=embedding_model,
        language='english', calculate_probabilities=False,
        verbose=True
    )
    return model

## Chose the country

In [4]:
country = 'France'

## Set parameters

In [151]:
## Best params until now but cannot catch coronavirus
params = {
    'UMAP' : {
        'metric' : 'cosine', 
        'n_neighbors' : 15, 
        'n_components' : 20, 
        'min_dist' : 0.3, 
        'low_memory' : False,
        'random_state': 8
    },
    'HDBSCAN': {
        'min_cluster_size':15,
        'min_samples': 1,
        'cluster_selection_epsilon': 0.6,
        'metric': 'euclidean',                      
        'cluster_selection_method': 'eom',
        'prediction_data': True}
}

## Load Model

In [152]:
df = df[df.whcs == country]
df.drop(df[df.clean.isna()].index,inplace =True)
tweets = df.clean.to_list()
model = get_model(params, additional_stop_words= ['lol','true', 'amp'])

In [153]:
topics, probs = model.fit_transform(tweets)

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

2022-12-15 14:57:08,409 - BERTopic - Transformed documents to Embeddings
2022-12-15 14:57:22,298 - BERTopic - Reduced dimensionality
2022-12-15 14:57:22,598 - BERTopic - Clustered reduced embeddings


In [142]:
df['topics'] = topics
my_topcis = ['white/black - racism', 'islam - terrorism', 'coronavirus - vaccination', 
          'american elections', 'India - Pakistan', 'jew - antisemtism', 'farright - anti communism'
          'tweeter acccounts', 'fantasy sex play', 'Alain Soral', 'immigration', 'homophobia']

In [94]:
model.visualize_heatmap()

In [53]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,207,-1_people_want_america_amp
1,0,292,0_white_people_black_nigga
2,1,166,1_muslim_biden_migrant_amp
3,2,61,2_trump_cpac_democrat_president
4,3,42,3_india_hindu_pakistan_modi
5,4,31,4_jew_anti_say_semitism
6,5,24,5_twitter_follower_many_account
7,6,22,6_play_sit_fantasy_day
8,7,22,7_soral_alain_answer_episode
9,8,17,8_reset_great_cleaning_lockdown


In [95]:
model.get_topics()

{-1: [('people', 0.0257345512272812),
  ('want', 0.02268441495277331),
  ('america', 0.020353168646884186),
  ('amp', 0.018462523593228673),
  ('government', 0.018085438793691132),
  ('go', 0.018014771074271855),
  ('get', 0.01772035416938935),
  ('attack', 0.01728125529290298),
  ('mr', 0.01704868360727458),
  ('gun', 0.01704868360727458)],
 0: [('white', 0.10759500818477581),
  ('people', 0.0608939013094943),
  ('black', 0.045279520246625744),
  ('nigga', 0.02265284131853321),
  ('really', 0.02217414391669522),
  ('racist', 0.021522313686534057),
  ('like', 0.021062578982979217),
  ('literally', 0.02019067271439289),
  ('racism', 0.02019067271439289),
  ('say', 0.020124231220722554)],
 1: [('muslim', 0.04873762386510598),
  ('biden', 0.040642116458368856),
  ('migrant', 0.03408951517120118),
  ('amp', 0.03147826659974266),
  ('islamic', 0.026575855012353437),
  ('attack', 0.025714233297801214),
  ('border', 0.025446362967878725),
  ('germany', 0.025446362967878725),
  ('break', 0.025

In [54]:
model.find_topics("terrorism")

([9, 1, -1, 4, 2],
 [0.6151157263833544,
  0.5927522283217049,
  0.5463597892060382,
  0.5299337499163588,
  0.4659209593942061])

In [23]:
model.visualize_documents(tweets)

In [None]:
save_model(model.get_topics(), params, "12 topics, 225 unmatchde, not bad with trump appeering twice but for different topics")

In [None]:
write_good_params()

### Topics 
We can give categories we used for labelling to these clusters
1. 'white/black - racism'
1. 'immigration', 'islam'
1. 'american elections', 'farright'
1. 'India - Pakistan'
1. 'tweeter acccounts'
1. 'jew - antisemtism'
1. 'fantasy sex play'
1. 'Alain Soral'
1. 'coronavirus - vaccination'
1. 'communism - against bankers, against state order', 'farright

categories = 

Don't have :
['terrorism', 'homophobia', 'women', 'other', "Don't know"]

In [154]:
my_topics = [['white/black - racism'],
    ['immigration', 'islam'],
    ['american elections', 'farright'],
    ['India - Pakistan'],
    ['tweeter acccounts'],
    ['jew - antisemtism'],
    ['fantasy sex play'],
    ['Alain Soral', 'farright'],
    ['coronavirus - vaccination'],
    ['communism - against bankers, against state order', 'farright'],
    ['terrorism', 'homophobia', 'women', 'other', "Don't know"]]

In [155]:
df_labeled = get_labeled_tweets(country = 'France')
labeled_tweets = df_labeled.join(df.loc[df_labeled.index][['topics','clean']])
labeled_tweets['topics'] = labeled_tweets['topics'].apply(lambda x: my_topics[x])

In [156]:
get_score(labeled_tweets)

0.576