#**Install**

In [1]:
!pip install bertopic
!pip install bertopic[visualization]
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.12.0-py2.py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 10.2 MB/s 
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 6.7 MB/s 
[?25hCollecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 52.8 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml<6.0
  Downloading PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
[K     |████████████████████████████████| 662 kB 72.3 MB/s 
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 10.2 MB/s 
Collecting tr

#**Import**

In [2]:
# Data processing
import re
import numpy as np
import pandas as pd
from datetime import datetime

# Tweet preprocessor
import preprocessor as p

# Count vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Dimension reduction
from umap import UMAP
from sklearn.decomposition import PCA

# Clustering
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

# BERTopic
from bertopic import BERTopic

# Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#**Data**

###**Import dataset**

In [3]:
migr_twit = pd.read_csv('/content/drive/MyDrive/MA Computational Linguistics/UK-R-migr-RA-twit-all.csv')

###**Data exploratory**

In [4]:
migr_twit.shape

(6472, 6)

In [5]:
pd.set_option('display.max_colwidth', 100)
migr_twit.head()

Unnamed: 0,id,tweet,date,username,party,type
0,153810782696640512,Of course we'll miss our immigration targets: we have a government which doesn't want control of...,2012-01-02 12:12:02,@Nigel_Farage,far-right,tweet
1,153811659780141057,"@liontornado Subordinate to the EU immigration laws, of course.",2012-01-02 12:15:32,@Nigel_Farage,far-right,replied_to
2,156709735557644288,Unlimited immigration is costing British people jobs. Controlling non-EU immigration is merely c...,2012-01-10 12:11:27,@UKIP,far-right,tweet
3,156742504908210176,Patriotic Labour voters who have had enough of Miliband's spiel and want a Party that will contr...,2012-01-10 14:21:39,@UKIP,far-right,tweet
4,158972986379087872,RT @poljourno: UK's immigration system 'benefits no one'. By @Goddersukip - @GawainTowler @Nigel...,2012-01-16 18:04:48,@UKIP,far-right,retweeted


In [6]:
migr_twit['party'].value_counts()

far-right     5034
right-wing    1438
Name: party, dtype: int64

#**Corpus linguistics**

In [7]:
# Remove duplicates (?)
# migr_twit = migr_twit.drop_duplicates()

# Remove retweets (?)
# NOTE(review): the earlier draft of this filter used `type == "retweet"`, which KEEPS
# the matching rows instead of removing them; the data preview above also shows the
# value spelled "retweeted". The line below drops retweets as the comment intends.
# migr_twit = migr_twit.loc[migr_twit.type != "retweeted", :]

#**Hyperparameters**

###**Dimension reduction**


In [8]:
# Initiate UMAP
# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=100)

###**Clustering**


In [9]:
# Initiate HDBSCAN
# hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples = 10,metric='euclidean', prediction_data=True)

###**Count vectorizer**

In [10]:
# Two candidate vectorizers: both drop English stopwords and differ only in the
# longest n-gram they extract (bigrams vs. trigrams)
vectorizer_model_1 = CountVectorizer(ngram_range=(1, 2), stop_words="english")
vectorizer_model_2 = CountVectorizer(ngram_range=(1, 3), stop_words="english")

#**Pre-processing**

In [11]:
# Restrict the frame to the columns used downstream: author, raw tweet, date and party
migr_text = migr_twit.filter(items=['username', 'tweet', 'date', 'party'], axis='columns')

In [12]:
# Configure tweet-preprocessor to strip URLs, emojis, mentions, smileys and numbers.
# Hashtags are deliberately left out of the option list because they carry topical
# information that we want to keep.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.NUMBER)

# Build the cleaned-text column in a single pass over the 'tweet' column.
# Assigning through `.apply` keeps every cleaned text aligned with its own row; a
# row-by-row `.loc[i, 'text'] = ...` write driven by an enumerate() counter matches
# the counter against index *labels* and therefore misaligns (and appends spurious
# rows) as soon as the index is not exactly 0..n-1, e.g. after rows were dropped.
migr_text['text'] = migr_text['tweet'].apply(p.clean)

migr_text.head()

Unnamed: 0,username,tweet,date,party,text
0,@Nigel_Farage,Of course we'll miss our immigration targets: we have a government which doesn't want control of...,2012-01-02 12:12:02,far-right,Of course we'll miss our immigration targets: we have a government which doesn't want control of...
1,@Nigel_Farage,"@liontornado Subordinate to the EU immigration laws, of course.",2012-01-02 12:15:32,far-right,"Subordinate to the EU immigration laws, of course."
2,@UKIP,Unlimited immigration is costing British people jobs. Controlling non-EU immigration is merely c...,2012-01-10 12:11:27,far-right,Unlimited immigration is costing British people jobs. Controlling non-EU immigration is merely c...
3,@UKIP,Patriotic Labour voters who have had enough of Miliband's spiel and want a Party that will contr...,2012-01-10 14:21:39,far-right,Patriotic Labour voters who have had enough of Miliband's spiel and want a Party that will contr...
4,@UKIP,RT @poljourno: UK's immigration system 'benefits no one'. By @Goddersukip - @GawainTowler @Nigel...,2012-01-16 18:04:48,far-right,RT : UK's immigration system 'benefits no one'. By -


In [13]:
# Remove only the '#' not the word after
def remove_hashtag_sign(text):
  """Return `text` with every '#' character deleted; the hashtag word itself is kept."""
  return re.sub(r'#', '', text)

# Strip the '#' sign from every cleaned tweet (the function is passed directly; no lambda wrapper needed)
migr_text['text'] = migr_text['text'].apply(remove_hashtag_sign)

In [14]:
# Lower-case the text, turn every punctuation character into a space, then collapse
# runs of two or more whitespace characters into a single space.
# `regex=True` is passed explicitly because both patterns are regular expressions:
# pandas >= 2.0 treats the pattern as a literal string by default, which would leave
# the text silently unchanged. Raw strings are used so that \w and \s are not parsed
# as (invalid) Python string escapes.
migr_text['text'] = (
    migr_text['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)
migr_text.head()

Unnamed: 0,username,tweet,date,party,text
0,@Nigel_Farage,Of course we'll miss our immigration targets: we have a government which doesn't want control of...,2012-01-02 12:12:02,far-right,of course we ll miss our immigration targets we have a government which doesn t want control of ...
1,@Nigel_Farage,"@liontornado Subordinate to the EU immigration laws, of course.",2012-01-02 12:15:32,far-right,subordinate to the eu immigration laws of course
2,@UKIP,Unlimited immigration is costing British people jobs. Controlling non-EU immigration is merely c...,2012-01-10 12:11:27,far-right,unlimited immigration is costing british people jobs controlling non eu immigration is merely co...
3,@UKIP,Patriotic Labour voters who have had enough of Miliband's spiel and want a Party that will contr...,2012-01-10 14:21:39,far-right,patriotic labour voters who have had enough of miliband s spiel and want a party that will contr...
4,@UKIP,RT @poljourno: UK's immigration system 'benefits no one'. By @Goddersukip - @GawainTowler @Nigel...,2012-01-16 18:04:48,far-right,rt uk s immigration system benefits no one by


#**BERTopic**

###**Training**



In [15]:
# Documents passed to BERTopic: the cleaned tweet texts
tweet_list = migr_text['text'].tolist()
# Tweet dates, kept for building the Dynamic Topic Modeling (DTM) later on
timestamps = migr_text['date'].tolist()
# Party and account name of each tweet, kept for building the Topics per class later on
tweet_party = migr_text['party'].tolist()
tweet_username = migr_text['username'].tolist()

# Initialize the Sentence model (1)
# topic_model_1 = BERTopic(embedding_model="all-MiniLM-L6-v2", low_memory=True, calculate_probabilities=True,verbose=True)
# Initialize the BERTweet model (2)
# topic_model_2 = BERTopic(embedding_model="vinai/bertweet-base", low_memory=True, calculate_probabilities=True,verbose=True)

# Train model 1
# topics_1, probs_1 = topic_model_1.fit_transform(tweet_list)
# Train model 2
# topics_2, probs_2 = topic_model_2.fit_transform(tweet_list)


In [16]:
# Get topic frequencies for model 1
# topic_model_1.get_topic_freq()

In [17]:
# Get topic frequencies for model 2
# topic_model_2.get_topic_freq()

In [18]:
# Get topic information for model 1
# topic_model_1.get_topic_info().head(21)

In [19]:
# Get topic information for model 2
# topic_model_2.get_topic_info().head(21)

In [20]:
# Save the Sentence Transformers model
# topic_model_1.save('/content/drive/MyDrive/MA Computational Linguistics/Sentence_Transformers')

In [21]:
# Save the BERTweet model
# topic_model_2.save('/content/drive/MyDrive/MA Computational Linguistics/BERTweets')

###**Load baseline model**

In [22]:
# Load the baseline topic model
# migr_tmodel = BERTopic.load('/content/drive/MyDrive/MA Computational Linguistics/Sentence_Transformers')

**Comment**: eventually, the Sentence Transformers model was chosen for the optimization step as it generated better results (i.e., more coherent topics) than the BERTweets model.

###**Optimization**

***Constant parameters***

*   Language set to *English* (language by default)
*   Dimension reduction set to *UMAP* (algorithm by default)
*   Clustering set to *HDBSCAN* (algorithm by default)
*   Embedding model set to *Sentence Transformers* (embedding model by default)
*   Low memory set to *True* to make sure less memory is used
*   Calculate probabilities set to *True* to calculate the probabilities of topics found in a document
*   Verbose set to *True* so that progress messages are shown while the model is being fitted

***Variable parameters***

1.   Count Vectorizer: [vectorizer_model_1; **vectorizer_model_2**] 
2.   Top number of words: [10 (default number); **12**; 14]
3.   Topic diversity: [**0.2**; 0.4]
4.   Minimum topic size: [**10** (default number); 15; 20]
5.   Number of topics: [**40**; 50]

####**1) Count vectorizer**

In [23]:
# Vectorizer model 1 (stopwords + ngram range set to (1,2))
# migr_tmodel.update_topics(tweet_list, vectorizer_model=vectorizer_model_1)

In [24]:
# Get topic info
# migr_tmodel.get_topic_info()

In [25]:
# Get topic info for the first 10 topics
# migr_tmodel.get_topic_info().head(11)

In [26]:
# Vectorizer model 2 (stopwords + ngram range set to (1,3))
# migr_tmodel.update_topics(tweet_list, vectorizer_model=vectorizer_model_2)

In [27]:
# Get topic info
# migr_tmodel.get_topic_info()

In [28]:
# Get topic info for the first 10 topics
# migr_tmodel.get_topic_info().head(11)

**Comment**: not much difference in terms of topics was found in the results when choosing one Count vectorizer or the other. However, the second model seemed better to us than the first for at least two reasons. First, we felt that in most cases the n-gram range set to (1, 3) provided us with better information (e.g., topic 27, 1st vs 2nd model: *ukip immigration policy* is more accurate than *ukip* & *immigration policy*; or, topic 49, in the 2nd model: *labour immigration pledge* highlights where the pledge comes from). Second, the first model appeared to be excessively skewed towards the first topic (EU), probably because the model was not able to detect the different topics covered when talking about immigration within the EU context. Finally, at this stage of the optimization, the number of words defining a topic is set to 10. The results are quite satisfactory in terms of topic coherence but let's see what happens when increasing this number. In fact, it is recommended to keep this value below 30 and preferably between 10 and 20, but never below 10.

In [29]:
# We are satisfied with the results generated by the last model
# We then initialize a new model (3)
# topic_model_3 = BERTopic(embedding_model="all-MiniLM-L6-v2", low_memory=True, calculate_probabilities=True, verbose=True, vectorizer_model=vectorizer_model_2)

# Train model 3
# topics_3, probs_3 = topic_model_3.fit_transform(tweet_list)

In [30]:
# Save the Updated Sentence Transformers model
# topic_model_3.save('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_10')

####**2) Top number of words**

In [31]:
# Initialize new model (4) with the top 12 words
# topic_model_4 = BERTopic(embedding_model="all-MiniLM-L6-v2", low_memory=True, calculate_probabilities=True, verbose=True, vectorizer_model=vectorizer_model_2, top_n_words=12)

# Train model 4
# topics_4, probs_4 = topic_model_4.fit_transform(tweet_list)

In [32]:
# Get topic frequencies for model 4
# topic_model_4.get_topic_freq()

In [33]:
# Get topic information for model 4
# topic_model_4.get_topic_info().head(21)

**Comment**: increasing the number of words to 14 resulted in a larger number of topics and in word intrusions within them. However, choosing the top 12 words allowed us to eliminate some outliers that were found in the previous model, where this parameter was set to 10.
Next, we will test whether increasing the topic diversity (i.e. limiting the number of duplicate words in each topic) can further improve our BERTopic model. 

In [34]:
# Save the Updated Sentence Transformers model
# topic_model_4.save('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_12_None')

####**3) Topic diversity**

In [35]:
# Initialize new model (5) with topic diversity set to 0.2
# topic_model_5 = BERTopic(embedding_model="all-MiniLM-L6-v2", low_memory=True, calculate_probabilities=True, verbose=True, vectorizer_model=vectorizer_model_2, top_n_words=12, diversity=0.2)

# Train model 5
# topics_5, probs_5 = topic_model_5.fit_transform(tweet_list)

In [36]:
# Get topic frequencies for model 5
# topic_model_5.get_topic_freq()

In [37]:
# Get topic information for model 5
# topic_model_5.get_topic_info().head(21)

**Comment**: on a scale from 0 (not at all diverse) to 1 (very diverse), raising the topic diversity to 0.4 increased the number of topics while not always giving coherent information within topics. However, we found 0.2 to be the best value, providing a much more diversified topic representation while keeping the same number of topics as in the previous model.

In [38]:
# Save the Updated Sentence Transformers model
# topic_model_5.save('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_12_0.2_10')

####**4) Minimum topic size**

In [39]:
# Initialize new model (6) with topic size set to 15
# topic_model_6 = BERTopic(embedding_model="all-MiniLM-L6-v2", low_memory=True, calculate_probabilities=True, verbose=True, vectorizer_model=vectorizer_model_2, top_n_words=12, diversity=0.2, min_topic_size=15)

# Train model 6
# topics_6, probs_6 = topic_model_6.fit_transform(tweet_list)

In [40]:
# Get topic frequencies for model 6
# topic_model_6.get_topic_freq()

In [41]:
# Get topic information for model 6
# topic_model_6.get_topic_info().head(51)

**Comment**: many of the clusters of 15 documents or fewer were either repetitive or too specific, so we tried raising the minimum topic size to 15. However, since we want to reduce the number of topics to 30-50, only the clusters with 30 documents or more will be considered, so this parameter matters little and we left it at 10.

In [42]:
# Save the Updated Sentence Transformers model
# topic_model_6.save('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_12_0.2_15_No')

####**5) Number of topics**

In [43]:
# Initialize final model with a reduced number of topics
# NOTE(review): the call below is left at nr_topics=50, while the accompanying comment
# and the model loaded later in the notebook refer to the 40-topic variant
# (i.e. nr_topics=40) — confirm which value the retained model was trained with.
# topic_model_fn = BERTopic(embedding_model="all-MiniLM-L6-v2", low_memory=True, calculate_probabilities=True, verbose=True, vectorizer_model=vectorizer_model_2, top_n_words=12, diversity=0.2, min_topic_size=10, nr_topics=50)

# Train final model 
# topics_fn, probs_fn = topic_model_fn.fit_transform(tweet_list)

In [44]:
# Get topic frequencies for the final model
# topic_model_fn.get_topic_freq().head(11)

In [45]:
# Get topic information for the final model
# topic_model_fn.get_topic_info()

In [46]:
# Intertopic distance map
# topic_model_fn.visualize_topics(width=1000, height=1000)

**Comment**: when picking only 40 topics, the clusters get more defined so we decided to confirm this number and update our final model. 
After training many models (*a*-to-*f* models), we have almost always detected the same 12/13 topic categories or discussions around:
- *Cameron's migration pledge* 
- *Issues related with immigration* 
- *UK Asylum system* 
- *Arrivals by boats* 
- *Refugee crisis*
- *UKIP's political campaign* 
- *Brexit debate*
- *Control immigration*
- *EU immigration*
- *Points based immigration system*
- *Welsh nation of sanctuary*
- *UK-Rwanda Migration and Economic Development Partnership*

In [47]:
# Save the final model
# topic_model_fn.save('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_12_0.2_10_50')

####**Further improvements**

*   Count Vectorizer: customize other hyperparameters such as *min_df* and *max_features* 
*   Dimension reduction: try *PCA* instead of *UMAP*
*   Clustering: try *Kmeans* instead of *HDBSCAN*

###**Visualization**

In [48]:
# Load the final model
# migr_topic_model = BERTopic.load('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_12_0.2_10_40_finalb')

In [49]:
# Get topic information for the final model
# migr_topic_model.get_topic_info()

In [50]:
# Intertopic distance map
# migr_topic_model.visualize_topics(width=750, height=750)

In [51]:
# Barchart
# migr_topic_model.visualize_barchart([27], n_words=8, width=300, height=300, custom_labels=False)

**Comment**: with the support of visualization we try to get a clearer sense of the topics extracted, refine the macro categories identified in the last optimization step by grouping up the topics within them. This work will be then useful for the labeling process.
Here below are the final macro categories:
1. *Cameron's migration policy* {7: "David Cameron"; 26: "Net migration"}
2. *Housing crisis* {1: "Housing crisis"}
3. *Population growth* {19: "Population growth"}
4. *Tories' migration policy* {8: "Tories"; 11: "Boris Johnson's government"; 33: "UKIP's election campaign"}
5. *Migrant crisis* {30: "Migrant crisis"; 32: "Terrorism"; 31: "Economic migrants"}
6. *Arrivals by boat* {12: "Calais"; 39: "Dover"; 28: "Smuggling"; 24: "Channel crossing"; 27: "Migrant boats"}
7. *EU immigration* {18: "Romanian and Bulgarian migration"; 29: "EU-Turkey relations"; 37: "Common EU asylum system"; 15: "EU migration statistics"}
8. *UKIP's propaganda* {16: "Migration figures"; 17: "UKIP's interventions"; 38: "TV and radio appearances"; 34: "Biased BBC"}
9. *Immigration issues* {13: "Immigration scale"; 35: "Migrant benefits"; 25: "Welsh Nation of Sanctuary"; 22: "Health service"}
10. *Labour's migration responsibility* {20: "Labour's responsibility"}
11. *Workers' low wages* {23: "Workers' low wages"}
12. *Pro-Brexit campaign* {4: "Nigel Farage's campaign"; 6: "Australian points-based system"; 10: "Control immigration"; 36:"UKIP's immigration policy"}
13. *Illegal migration* {2: "Syrian refugee crisis"; 3: "New immigration plan"; 9: "UK-Rwanda Migration and Economic Development Partnership"; 5: "About UK-France cooperation"}
14. *Points based immigration system* {0: "Points-Based System"; 21: "New immigration system"}
15. *About Home Office* {14: "About Home Office"}

In [52]:
# Documents
# migr_topic_model.visualize_documents(tweet_list, width=3000, height=3000)

In [53]:
# Find topics
# migr_topic_model.find_topics("brexit", top_n=10)

###**Labeling**

In [54]:
# Load the final model
migr_topic_model = BERTopic.load('/content/drive/MyDrive/MA Computational Linguistics/ST_CV2_12_0.2_10_40_finalb')

In [55]:
# Attach a human-readable label to each of the 40 fine-grained topics (topic id -> label)
fine_grained_labels = {
    0: "points-based immigration system",
    1: "housing crisis",
    2: "syrian refugee crisis",
    3: "new immigration plan",
    4: "nigel farage's campaign",
    5: "about uk-france cooperation",
    6: "australian points-based system",
    7: "david cameron's migration policy",
    8: "tories",
    9: "uk-rwanda partnership",
    10: "control immigration",
    11: "boris johnson's government",
    12: "calais",
    13: "immigration scale",
    14: "about home office",
    15: "eu migration statistics",
    16: "migration figures",
    17: "ukip's interventions",
    18: "romanian and bulgarian migration",
    19: "population growth",
    20: "labour's responsability",
    21: "new immigration system",
    22: "health service",
    23: "workers' low wages",
    24: "channel crossing",
    25: "welsh nation of sanctuary",
    26: "net migration",
    27: "migrant boats",
    28: "smuggling",
    29: "eu-turkey relations",
    30: "migrant crisis",
    31: "economic migrants",
    32: "terrorism",
    33: "ukip's election campaign",
    34: "biased bbc",
    35: "migrant benefits",
    36: "ukip's immigration policy",
    37: "common eu asylum system",
    38: "tv and radio appearences",
    39: "dover",
}
migr_topic_model.set_topic_labels(fine_grained_labels)
migr_topic_model.get_topic_info().head(31)

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,3622,-1_immigration_ukip_migration_migrants,-1_immigration_ukip_migration_migrants
1,0,153,0_points based immigration_based immigration_points based_points,points-based immigration system
2,1,148,1_housing_housing crisis_immigration_mayoral,housing crisis
3,2,143,2_refugees_syrian refugees_refugee_vulnerable syrian refugees,syrian refugee crisis
4,3,141,3_asylum_genuine need_nnewplanforimmigration_new plan immigration,new immigration plan
5,4,126,4_farage_nigel farage_rt nigel farage_leader nigel farage,nigel farage's campaign
6,5,126,5_illegal migration_interior minister_french_french interior,about uk-france cooperation
7,6,114,6_australian style points_style points based_points based_points based immigration,australian points-based system
8,7,100,7_david cameron_net migration_migration_mr cameron,david cameron's migration policy
9,8,92,8_tories_tory_immigration_rt tories,tories


In [56]:
# Access customised labels
# migr_topic_model.custom_labels_

In [57]:
# Barchart
# migr_topic_model.visualize_barchart(top_n_topics=4, n_words=12, width=350, height=350, custom_labels=migr_topic_model.custom_labels_)

In [58]:
# Documents
# migr_topic_model.visualize_documents(tweet_list, width=3000, height=3000, custom_labels=migr_topic_model.custom_labels_)


###**Merge topics**

In [59]:
# Create macro categories by merging topics: one inner list of topic ids per category.
# Topics that form a category on their own (1, 14, 19, 20, 23) are not listed.
topics_to_merge = [
    [7, 26],               # Cameron's migration policy
    [8, 11, 33],           # Tories' migration policy
    [30, 31, 32],          # Migrant crisis
    [12, 24, 27, 28, 39],  # Arrivals by boat
    [15, 18, 29, 37],      # EU immigration
    [16, 17, 34, 38],      # UKIP's propaganda
    [13, 22, 25, 35],      # Immigration issues
    [4, 6, 10, 36],        # Pro-Brexit campaign
    [2, 3, 5, 9],          # Illegal migration
    [0, 21],               # Points based immigration system
]
migr_topic_model.merge_topics(tweet_list, topics_to_merge)

In [60]:
# Get topic information for the final model
migr_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,3622,-1_immigration_ukip_migration_migrants
1,0,498,0_asylum_rwanda_migration_refugees
2,1,365,1_farage_australian style_australian style points_ukip
3,2,260,2_migrants_calais_dover_boats
4,3,219,3_immigration_wales_nhs_ukip
5,4,217,4_eu_bulgarian_romania_bulgaria
6,5,215,5_tories_boris_priti patel_boris johnson
7,6,210,6_ukip_migration spokesman_ukip migration_ukip migration spokesman
8,7,206,7_based immigration_points based immigration_immigration_points based
9,8,148,8_housing_housing crisis_immigration_crisis


**Comment**: now that we have obtained our macro categories, we can rename them again:

In [61]:
# Label the merged macro categories (topic id after merging -> label)
macro_category_labels = {
    0: "illegal migration",
    1: "pro-brexit campaign",
    2: "arrivals by boat",
    3: "immigration issues",
    4: "eu immigration",
    5: "tories' migration policy",
    6: "ukip's propaganda",
    7: "points based immigration system",
    8: "housing crisis",
    9: "cameron's migration policy",
    10: "migrant crisis",
    11: "about home office",
    12: "population growth",
    13: "labour's migration responsability",
    14: "workers' low wages",
}
migr_topic_model.set_topic_labels(macro_category_labels)
migr_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,3622,-1_immigration_ukip_migration_migrants,-1_immigration_ukip_migration_migrants
1,0,498,0_asylum_rwanda_migration_refugees,illegal migration
2,1,365,1_farage_australian style_australian style points_ukip,pro-brexit campaign
3,2,260,2_migrants_calais_dover_boats,arrivals by boat
4,3,219,3_immigration_wales_nhs_ukip,immigration issues
5,4,217,4_eu_bulgarian_romania_bulgaria,eu immigration
6,5,215,5_tories_boris_priti patel_boris johnson,tories' migration policy
7,6,210,6_ukip_migration spokesman_ukip migration_ukip migration spokesman,ukip's propaganda
8,7,206,7_based immigration_points based immigration_immigration_points based,points based immigration system
9,8,148,8_housing_housing crisis_immigration_crisis,housing crisis


###**Dynamic Topic Modeling (DTM)**

In [62]:
# Compute the topic representations over time, splitting the tweet dates into 20 bins
topics_over_time = migr_topic_model.topics_over_time(
    docs=tweet_list,
    timestamps=timestamps,
    nr_bins=20,
    global_tuning=True,
    evolution_tuning=True,
)

20it [00:17,  1.17it/s]


In [63]:
# Visualize topics over time.
# `custom_labels` is an on/off switch: True makes the plot use the labels previously
# stored with set_topic_labels(). Passing the bound method `set_topic_labels` itself
# only happened to work because a method object is truthy.
migr_topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=15,
    custom_labels=True,
    normalize_frequency=True,
    width=1350,
    height=550,
)

###**Topics per Class**

In [64]:
# Set topics per class
topics_per_party = migr_topic_model.topics_per_class(tweet_list, classes=tweet_party, global_tuning=True)

2it [00:01,  1.60it/s]


In [65]:
# Visualize topics per party.
# `custom_labels` is an on/off switch: True makes the plot use the labels previously
# stored with set_topic_labels(). Passing the bound method `set_topic_labels` itself
# only happened to work because a method object is truthy.
migr_topic_model.visualize_topics_per_class(
    topics_per_party,
    top_n_topics=15,
    custom_labels=True,
    normalize_frequency=True,
    width=1350,
    height=550,
)

In [66]:
# Set topics per political actor
topics_per_actor = migr_topic_model.topics_per_class(tweet_list, classes=tweet_username, global_tuning=True)

12it [00:05,  2.14it/s]


In [67]:
# Visualize topics per political actor.
# `custom_labels` is an on/off switch: True makes the plot use the labels previously
# stored with set_topic_labels(). Passing the bound method `set_topic_labels` itself
# only happened to work because a method object is truthy.
migr_topic_model.visualize_topics_per_class(
    topics_per_actor,
    top_n_topics=15,
    custom_labels=True,
    normalize_frequency=True,
    width=1350,
    height=550,
)