In [1]:
import re

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from nltk.tokenize import TweetTokenizer
from matplotlib import rcParams
from matplotlib.patches import Patch
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans

In [2]:
# Matplotlib rcParams reference (default values and customisation guide):
# https://matplotlib.org/stable/tutorials/introductory/customizing.html

# Base style; every rcParams override below is applied on top of 'ggplot'.
plt.style.use('ggplot')

# Fonts
rcParams['font.family'] = 'sans-serif'
rcParams['font.style'] = 'normal'

# Figure background
rcParams['figure.facecolor'] = 'white'

# Saved figures: tight bounding box, 300 dpi, transparent background
rcParams['savefig.bbox'] = 'tight'
rcParams['savefig.dpi'] = 300
rcParams['savefig.transparent'] = True

# Axes. The right/top spines are set exactly once: the cell previously
# assigned False and then True to the same keys, so True was the value
# that actually took effect and is the one kept here.
rcParams['axes.spines.right'] = True
rcParams['axes.spines.top'] = True
rcParams['axes.labelsize'] = 20
rcParams['axes.labelcolor'] = 'black'
rcParams['axes.edgecolor'] = 'grey'
rcParams['axes.linewidth'] = 3
rcParams['axes.facecolor'] = 'white'
rcParams['axes.titlepad'] = 4
rcParams['axes.titlesize'] = 20

# Ticks: y-axis major ticks get zero width and zero length, x-axis ticks
# keep a visible mark.
rcParams['xtick.color'] = 'grey'
rcParams['ytick.color'] = 'grey'
rcParams['xtick.labelsize'] = 15
rcParams['ytick.labelsize'] = 15
rcParams['xtick.major.width'] = 2
rcParams['ytick.major.width'] = 0
rcParams['xtick.major.size'] = 5
rcParams['ytick.major.size'] = 0

# Lines and markers
rcParams['lines.linewidth'] = 3
rcParams['lines.markersize'] = 10

# Legend
rcParams['legend.title_fontsize'] = 15
rcParams['legend.fontsize'] = 10

# Grid
rcParams['grid.color'] = 'grey'
rcParams['grid.linewidth'] = 0.1

# Default seaborn colour cycle: the "icefire" palette.
icefire_palette = sns.color_palette("icefire")
sns.set_palette(icefire_palette)

## Topic Modelling Pipeline

In [3]:
# Read the pre-cleaned corpus; the first CSV column is used as the index.
df = pd.read_csv(
    'cleaned_texts.csv',
    index_col=[0],
)

In [4]:
# Number of rows in the frame as loaded.
df.shape[0]

10333

In [5]:
# Keep only the rows that actually have a cleaned_text value.
has_text = df.cleaned_text.notna()
df = df.loc[has_text]

In [6]:
# Number of rows left after rows with a missing cleaned_text were removed.
df.shape[0]

10293

#### Text extraction

In [7]:
# Plain Python list of the cleaned texts, in row order.
docs = df['cleaned_text'].tolist()

#### Training procedure

In [8]:
# Topic model built with BERTopic's default configuration (no arguments).
# NOTE(review): no random seed is set anywhere in the visible cells, so topic
# ids/counts may differ between runs — confirm whether a seeded
# dimensionality-reduction model should be passed in for reproducibility.
model = BERTopic()

In [9]:
# Fit the topic model on the corpus; the two returned values are unpacked
# as `topics` and `probs`.
topics, probs = model.fit_transform(documents=docs)

In [12]:
# Document-level view of the fitted model, with per-document hover text
# and annotations switched off via the two hide_* flags.
model.visualize_documents(
    docs,
    hide_document_hover=True,
    hide_annotations=True,
)

In [11]:
# Summary table of the fitted topics; the output below shows the columns
# Topic, Count, Name, Representation and Representative_Docs.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4173,-1_energy_building_learn_read,"[energy, building, learn, read, help, year, so...",[want enhance skill grow business myschneider ...
1,0,232,0_train_rail_passenger_metro,"[train, rail, passenger, metro, railway, fleet...",[train transport passenger seated accessibilit...
2,1,205,1_wind_turbine_farm_offshore,"[wind, turbine, farm, offshore, blade, project...",[contract includes supply installation service...
3,2,171,2_vessel_methanol_container_sailing,"[vessel, methanol, container, sailing, crew, g...",[thanks sharing news soon welcome fleet world ...
4,3,152,3_brick_bricktastic_sound_build,"[brick, bricktastic, sound, build, click, play...","[brick help, brick brick, brick]"
...,...,...,...,...,...
164,163,11,163_disability_accessible_visible_purple,"[disability, accessible, visible, purple, peop...",[remember disability visible provide accessibl...
165,164,11,164_visualization_booth_wall_innovation,"[visualization, booth, wall, innovation, persp...",[thank give perspective compare others head da...
166,165,11,165_ethic_integrityspeakupline_integrity_depar...,"[ethic, integrityspeakupline, integrity, depar...",[thank contacting please reach integrity speak...
167,166,11,166_emergency_bleeds_care_require,"[emergency, bleeds, care, require, clinician, ...",[major bleeds including intracerebral severe g...
