In [12]:
import pandas as pd
import random 
from bertopic import BERTopic

from bert_helpers import * 
import sys
sys.path.append('../helpers_python')
from pre_processing import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Chose the country

In [8]:
country = 'Turkey'

# Load data
df = load_data_bert(country)

# array that can save the models
saved_models = []

## Set parameters

In [27]:
## Best params until now but cannot catch coronavirus
params_france = {
    'UMAP' : {
        'metric' : 'cosine', 
        'n_neighbors' : 15, 
        'n_components' : 20, 
        'min_dist' : 0.3, 
        'low_memory' : False,
        'random_state': 8
    },
    'HDBSCAN': {
        'min_cluster_size':15,
        'min_samples': 1,
        'cluster_selection_epsilon': 0.6,
        'metric': 'euclidean',                      
        'cluster_selection_method': 'eom',
        'prediction_data': True}
}

params= {
    'UMAP' : {
        'metric' : 'cosine', 
        'n_neighbors' : 15, 
        'n_components' : 10, 
        'min_dist' : 0.2, 
        'low_memory' : False,
        'random_state': 8
    },
    'HDBSCAN': {
        'min_cluster_size':15,
        'min_samples': 1,
        'cluster_selection_epsilon': 0.6,
        'metric': 'euclidean',                      
        'cluster_selection_method': 'eom',
        'prediction_data': True}
}

## Train model with french parameters

In [22]:
tweets = df.clean.to_list()
tweets_sub = random.sample(tweets,k=int(len(tweets)*0.1))


In [36]:
model_fr = get_model(params_france, additional_stop_words= [])
topics_fr, probs_fr = model_fr.fit_transform(tweets_sub)
model_fr.get_topic_info()

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

2022-12-16 17:23:51,160 - BERTopic - Transformed documents to Embeddings
2022-12-16 17:23:58,237 - BERTopic - Reduced dimensionality
2022-12-16 17:23:58,341 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,60,-1_pkk_time_go_link
1,0,231,0_turkey_turkish_state_kurdish
2,1,122,1_woman_right_say_allah
3,2,89,2_prison_mother_student_take
4,3,26,3_martyr_struggle_people_guerrilla


In [37]:
model_fr.get_topics()

{-1: [('pkk', 0.05568987137156217),
  ('time', 0.047734175461339),
  ('go', 0.03977847955111583),
  ('link', 0.03630596445269925),
  ('alternative', 0.03630596445269925),
  ('calan', 0.035336787996869026),
  ('month', 0.035336787996869026),
  ('year', 0.03355959993365784),
  ('climate', 0.031854529195505174),
  ('memory', 0.031854529195505174)],
 0: [('turkey', 0.05024801599855099),
  ('turkish', 0.04701590561904063),
  ('state', 0.0389868641185469),
  ('kurdish', 0.034889777997744884),
  ('people', 0.02935507047910019),
  ('lu', 0.027229784087240458),
  ('gergerlio', 0.02619986845486328),
  ('akp', 0.024511678343846162),
  ('hdp', 0.023037175730554492),
  ('erdogan', 0.023037175730554492)],
 1: [('woman', 0.046707385839345895),
  ('right', 0.046195122085384455),
  ('say', 0.040263524309493204),
  ('allah', 0.039690396374306686),
  ('people', 0.03794185327558143),
  ('day', 0.037250079914060236),
  ('great', 0.033889761501128025),
  ('follow', 0.033139697950369064),
  ('god', 0.0318436

In [38]:
model_fr.visualize_heatmap()

In [None]:
model_fr.visualize_documents(tweets_sub)

## Find parameters that are more adapted to Turkey 

In [None]:
model = get_model(params, additional_stop_words= [])
topics, probs = model.fit_transform(tweets_sub)
model.get_topic_info()

In [None]:
model.get_topics()

In [19]:
model.find_topics("armenia")

([0, 5, 2, -1, 6],
 [0.6815587825466685,
  0.4726839886601701,
  0.4124459038638667,
  0.3467415785867757,
  0.3430034015491334])

In [31]:
model.visualize_documents(tweets_sub)

## Metric

In [175]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [176]:
# Preprocess Documents
documents = pd.DataFrame({"Document": tweets,
                          "ID": range(len(tweets)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

In [177]:
# Extract vectorizer and analyzer from BERTopic
vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()

In [178]:
# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

In [179]:
# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

In [180]:
coherence

0.7936895693528356