In [2]:
import pandas as pd
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

from pre_processing import *
from helpers import *

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Load the gzip-compressed CSV holding the data to be clustered.
df = pd.read_csv('data/to_be_clustered.csv.gz', compression="gzip")

# Module-level list that save_model() appends (model, params, message) tuples to
# and that write_good_params() pickles. Re-running this cell rebinds it to an
# empty list, discarding anything collected so far.
saved_models = []

## Choose the country

In [17]:
# Value compared against df.whcs to select the tweets to cluster; it is also
# embedded in the pickle filename written by write_good_params().
country = 'France'

## Set parameters

In [76]:
# Hyper-parameters consumed by get_model(): the 'UMAP' entry is forwarded to the
# UMAP constructor and the 'HDBSCAN' entry to the HDBSCAN constructor.
params = dict(
    UMAP=dict(
        metric='cosine',
        n_neighbors=15,
        n_components=20,
        min_dist=0.3,
        low_memory=False,
        random_state=8,
    ),
    HDBSCAN=dict(
        min_cluster_size=20,
        min_samples=1,
        cluster_selection_epsilon=0.6,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    ),
)

## Load Model

In [83]:
# Cleaned tweet texts for the rows whose `whcs` value equals the chosen country.
tweets = df.loc[df.whcs == country, 'clean'].to_list()

# Sentence embedding backbone; 'digio/Twitter4SSE' is an alternative model name
# the author left here for reference.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# English stop words from NLTK, extended with two extra tokens to ignore.
s = [*stopwords.words('english'), 'lol', 'true']
vectorizer_model = CountVectorizer(stop_words=s)

# get_model() reads `embedding_model` and `vectorizer_model` from the globals
# above, so they must be bound before this call.
model = get_model(params)

In [84]:
# Fit the BERTopic model on the selected tweets and keep both returned values.
# NOTE(review): get_model() passes calculate_probabilities=False, so `probs` is
# presumably not a full document-by-topic matrix — confirm against BERTopic docs.
topics, probs = model.fit_transform(tweets)

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

2022-12-08 13:59:24,927 - BERTopic - Transformed documents to Embeddings
2022-12-08 13:59:30,129 - BERTopic - Reduced dimensionality
2022-12-08 13:59:30,221 - BERTopic - Clustered reduced embeddings


In [None]:
# Hand-written labels for the discovered topics.
# A comma was missing after 'farright - anti communism'; Python's implicit
# string-literal concatenation silently merged it with the following label,
# yielding 10 entries instead of the intended 11.
# The (misspelled) variable name is kept on purpose: `topics` is already bound
# to the assignments returned by model.fit_transform().
topcis = ['white/black - racism', 'islam - terrorism', 'coronavirus - vaccination',
          'american elections', 'India - Pakistan', 'jew - antisemtism', 'farright - anti communism',
          'tweeter acccounts', 'fantasy sex play', 'Alain Soral', 'immigration']

In [85]:
# Display BERTopic's heatmap visualization for the fitted model.
model.visualize_heatmap()

In [71]:
# Summary table of the fitted topics; the output lists Topic, Count and Name
# columns (topic -1 is presumably the unassigned/outlier bucket — confirm).
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,362,-1_people_year_want_new
1,0,128,0_white_people_black_racist
2,1,67,1_muslim_sweden_islamic_sharia
3,2,55,2_germany_german_coronavirus_warn
4,3,50,3_trump_cpac_democrat_party
5,4,46,4_woman_afrin_fight_cry
6,5,40,5_india_hindu_pakistan_banker
7,6,38,6_jew_anti_say_semitism
8,7,29,7_nigga_lil_fuck_know
9,8,23,8_twitter_follower_account_many


In [86]:
# Dict mapping each topic id to its list of (term, score) pairs.
model.get_topics()

{-1: [('people', 0.024474605863375003),
  ('year', 0.023269604670209477),
  ('great', 0.022056617126275616),
  ('like', 0.018427810000464718),
  ('amp', 0.018293013940034453),
  ('left', 0.0175561013846154),
  ('america', 0.017064376758153615),
  ('see', 0.016819633198227213),
  ('reset', 0.016047192711665097),
  ('child', 0.01595091805524648)],
 0: [('muslim', 0.04723991215950561),
  ('biden', 0.03787839965276231),
  ('migrant', 0.03304375428148503),
  ('amp', 0.03189709230438733),
  ('terrorist', 0.027477851458930627),
  ('border', 0.02656383287670483),
  ('islamic', 0.025760485742747464),
  ('attack', 0.024924494061609882),
  ('germany', 0.024666416242654483),
  ('sweden', 0.024035730301690768)],
 1: [('white', 0.16090744258682363),
  ('people', 0.07806751537835034),
  ('black', 0.06819606931793223),
  ('racist', 0.03241798986657192),
  ('want', 0.03151476720342683),
  ('racism', 0.030412355104225592),
  ('diversity', 0.02790933574516534),
  ('war', 0.027585590039714163),
  ('suprem

In [87]:
# Display BERTopic's document-level visualization for the clustered tweets.
model.visualize_documents(tweets)

In [66]:
# Record this run: what gets stored is the topic dictionary returned by
# get_topics() (not the model object), the current params and a free-text note.
save_model(model.get_topics(), params, "12 topics, 225 unmatchde, not bad with trump appeering twice but for different topics")

In [100]:
# Pickle everything collected in saved_models to the per-country file.
write_good_params()

## Functions 

In [99]:
import pickle
def save_model(model, params, message = ""):
    """Record one experiment in the module-level ``saved_models`` list.

    Args:
        model: Object to remember for this run (stored as given).
        params: Parameter dict used for the run (stored by reference, not copied).
        message: Optional free-text note describing the run.

    Returns:
        None. The tuple ``(model, params, message)`` is appended to ``saved_models``.
    """
    entry = (model, params, message)
    saved_models.append(entry)
    
def write_good_params():
    """Pickle the module-level ``saved_models`` list to a per-country file.

    The target is ``data/saved_models/save_models_for_<country>.pkl``, where
    ``country`` is the module-level variable; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing (for example when
            the ``data/saved_models`` directory does not exist).
    """
    target_path = "data/saved_models/save_models_for_"+country+".pkl"
    # Use a context manager so the handle is flushed and closed even if
    # pickling fails; the original left the file object open.
    with open(target_path, "wb") as out_file:
        pickle.dump(saved_models, out_file)
    
def get_model(params) :
    """Assemble an unfitted BERTopic model from a nested parameter dict.

    Args:
        params: Dict with a ``'UMAP'`` and a ``'HDBSCAN'`` entry. Each entry is
            itself a dict that must contain the keys listed in the tuples
            below; any additional keys are ignored.

    Returns:
        A ``BERTopic`` instance wired with the configured UMAP and HDBSCAN
        models plus the module-level ``vectorizer_model`` and
        ``embedding_model``, which must be defined before this is called.

    Raises:
        KeyError: If a required section or key is missing from ``params``.
    """
    # Dimensionality reduction stage.
    umap_cfg = params['UMAP']
    umap_keys = ('n_neighbors', 'n_components', 'min_dist',
                 'metric', 'low_memory', 'random_state')
    reducer = UMAP(**{key: umap_cfg[key] for key in umap_keys})

    # Clustering stage.
    hdbscan_cfg = params['HDBSCAN']
    hdbscan_keys = ('min_cluster_size', 'min_samples', 'cluster_selection_epsilon',
                    'metric', 'cluster_selection_method', 'prediction_data')
    clusterer = HDBSCAN(**{key: hdbscan_cfg[key] for key in hdbscan_keys})

    return BERTopic(
        umap_model=reducer,
        vectorizer_model=vectorizer_model,
        hdbscan_model=clusterer,
        embedding_model=embedding_model,
        language='english',
        calculate_probabilities=False,
        verbose=True,
    )