In [None]:
<h2 align=center> Topic Modelling with BERTopic B2B Case</h2>

In [None]:
<div align="center">
    <img width="1112px" src='Capture.PNG' />
    <p style="text-align: center;color:gray">Figure 1: BERTopic() Topic Modelling</p>
</div>

In [None]:
### Installing the dependencies

In [None]:
### Installing all the dependencies 
!pip install bertopic[visualization] --quiet

In [None]:
pip install pip==8.1.1

In [None]:
pip install numpy==1.20

In [None]:
!pip install WordCloud
from wordcloud import WordCloud

In [None]:
pip install openpyxl

In [None]:
#Importing Libraries
import numpy as np 
import pandas as pd
from ast import literal_eval
import openpyxl
from copy import deepcopy
from bertopic import BERTopic

import matplotlib.pyplot as plt

import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from scipy import special
import plotly.express as px

# Initialise plotly for notebook output; connected=True loads plotly.js from the
# online CDN instead of embedding it in the notebook.
py.offline.init_notebook_mode(connected = True)
# Render matplotlib figures inline in the cell output.
%matplotlib inline

### Loading the Dataset and Analysing

In [None]:
# Read the workbook at the relative path 'df' into a DataFrame and preview 20 rows.
# NOTE(review): the path has no file extension — confirm this is the intended file name.
df = pd.read_excel('df')
df.head(20)

In [None]:
# Print column names, dtypes and non-null counts to spot missing values early.
df.info()

In [None]:
# Word cloud over all keyword strings in col1.
# Missing cells are dropped and the remaining values coerced to str, because
# ' '.join() raises TypeError as soon as it meets a NaN or other non-string value.
keyword_text = ' '.join(df['col1'].dropna().astype(str))
wordcloud2 = WordCloud().generate(keyword_text)
plt.figure(figsize = (10, 8), facecolor = None)
plt.imshow(wordcloud2)
plt.axis("off")  # the axes carry no meaning for a word cloud image
plt.show()

### `Use Case 01`: Input Keywords aka col1 column

In [None]:
# Documents for use case 1: the raw col1 values as a plain Python list.
docs = list(df['col1'].values)
docs[:5]

In [None]:
# Number of documents that will be passed to the topic model.
len(docs)

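The default embedding model for English is `all-MiniLM-L6-v2`, while for multilingual input it is `paraphrase-multilingual-MiniLM-L12-v2`. The next cell overrides that default by passing `paraphrase-multilingual-mpnet-base-v2` explicitly.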

In [None]:
# Use case 1 model: explicit multilingual sentence-transformer, with per-topic
# probabilities requested and progress logging switched on.
model = BERTopic(
    embedding_model="paraphrase-multilingual-mpnet-base-v2",
    language="multilingual",
    calculate_probabilities=True,
    verbose=True,
)

In [None]:
# Fit the model on the col1 documents. `topics` receives the first return value and
# `probs` the second (probabilities were requested via calculate_probabilities=True).
topics, probs = model.fit_transform(docs)

In [None]:
# Per-topic summary table; the bar chart below reads its 'Topic' and 'Count' columns.
input_topics_freq = model.get_topic_info()

In [None]:
# Bar chart: number of documents assigned to each topic id for use case 1.
fig = px.bar(
    input_topics_freq,
    x='Topic',
    y='Count',
    title='Distribution of Input Topic Generated',
)
fig.show()

In [None]:
# Top terms of topic -1 only (in BERTopic, -1 is the outlier topic).
model.visualize_barchart(topics = [-1])

### `Use Case 02`: After Transformation aka Discovering New Possible Topics {col2 Column}

In [None]:
# col2 cells are parsed from their text form back into Python objects with
# literal_eval (presumably a collection of topic strings per row — verify).
# str() makes the call safe to repeat on values that are already parsed.
df['col2'] = df['col2'].apply(lambda raw: literal_eval(str(raw)))
df.head(20)

In [None]:
# Flatten col2: one row per individual topic, collected in a one-column DataFrame.
# Series.explode() does not take a column name; its only parameter is `ignore_index`.
# The original positional 'col2' was therefore read as a truthy ignore_index, so the
# fresh 0..n-1 index is now requested explicitly, which keeps the same result.
newdf = df['col2']
topics = newdf.explode(ignore_index=True)
topics = pd.DataFrame(topics)
topics.head()

In [None]:
# First three rows of the exploded topics frame.
topics[:3]

In [None]:
# Documents for use case 2: every exploded col2 entry, as a plain Python list.
docs = topics['col2'].tolist()
docs[:5]

In [None]:
# Number of exploded col2 entries that will be passed to the use case 2 model.
len(docs)

In [None]:
# Use case 2 model: default multilingual embeddings, with nr_topics='auto'.
usecase_2_model = BERTopic(
    language="multilingual",
    calculate_probabilities=True,
    verbose=True,
    nr_topics='auto',
)

In [None]:
# Fit the use case 2 model on the exploded col2 documents.
# NOTE: this rebinds `topics`, which until now held the exploded DataFrame, to the
# model output. Re-running the earlier `docs = list(topics['col2'])` cell after this
# one will fail, so run the cells strictly top to bottom.
topics, probs = usecase_2_model.fit_transform(docs)

In [None]:
# Topic frequency table for use case 2 (BERTopic returns it with 'Topic' and 'Count' columns).
new_topics_freq = usecase_2_model.get_topic_freq()

In [None]:
# Bar chart: number of documents per topic id for use case 2.
# get_topic_freq() yields the columns 'Topic' and 'Count'; there is no 'col2' column
# in that frame, so the x axis must be 'Topic' (as in the use case 3 chart).
fig = px.bar(
    new_topics_freq,
    x='Topic',
    y='Count',
    title='Distribution of Topic Generated Use Case 02',
)
fig.show()

In [None]:
def representativedocs(model, topics, docs, keywords):
    """Build a per-topic table of topic name and representative documents.

    Parameters
    ----------
    model : object
        Fitted topic model exposing two dict-like attributes keyed by topic id:
        ``topic_names`` and ``representative_docs``.
    topics : str
        Name of the output column holding the topic id; also used as the join key.
    docs : str
        Name of the output column holding the values of ``model.topic_names``.
    keywords : str
        Name of the output column holding the values of
        ``model.representative_docs``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``model.topic_names`` with the columns
        ``[topics, docs, keywords]``. Topics that have no entry in
        ``model.representative_docs`` get a missing value in ``keywords``.
    """
    # Topic id -> topic name
    top_names = pd.DataFrame(list(model.topic_names.items()), columns=[topics, docs])

    # Topic id -> representative documents
    rep_docs = pd.DataFrame(
        list(model.representative_docs.items()), columns=[topics, keywords]
    )

    # Join on the caller-supplied id column. The key used to be hard-coded as
    # 'topic_num', which raised KeyError for any other value of `topics`.
    return pd.merge(top_names, rep_docs, how='left', on=topics)

In [None]:
# Topic id / topic name / representative documents table for the use case 2 model.
representativedocs(usecase_2_model, 'topic_num','docs','keywords')

In [None]:
# Interactive topic visualisation for use case 2.
usecase_2_model.visualize_topics()

In [None]:
# Heatmap visualisation of the use case 2 topics.
usecase_2_model.visualize_heatmap()

In [None]:
# Top terms for a hand-picked set of topic ids.
# NOTE(review): these ids are hard-coded, while the model is built with
# nr_topics='auto'; a fresh fit may not produce topics 46-48 — confirm they exist.
usecase_2_model.visualize_barchart(topics = [1,2,3,46,47,48])

In [None]:
# First ten entries returned for topic 11.
# NOTE(review): the topic id is hard-coded — confirm topic 11 exists after a fresh fit.
usecase_2_model.get_topic(11)[:10]

### `Use Case 03`: After Transformation aka Discovering New Possible Topics {col3 Column}

In [None]:
# col3 cells are parsed from their text form back into Python objects with
# literal_eval (presumably a collection of strings per row — verify).
# str() makes the call safe to repeat on values that are already parsed.
df['col3'] = df['col3'].apply(lambda raw: literal_eval(str(raw)))
df.head(20)

In [None]:
# Flatten col3: one row per individual entry, collected in a one-column DataFrame.
# Series.explode() does not take a column name; its only parameter is `ignore_index`.
# The original positional 'col3' was therefore read as a truthy ignore_index, so the
# fresh 0..n-1 index is now requested explicitly, which keeps the same result.
df2 = df['col3']
topics2 = df2.explode(ignore_index=True)
topics2 = pd.DataFrame(topics2)
topics2.head()

In [None]:
# Documents for use case 3: every exploded col3 entry, as a plain Python list.
docs_2 = list(topics2['col3'])
# A bare expression in the middle of a cell is never displayed, so the preview is
# printed explicitly alongside the document count.
print(docs_2[:2])
print(len(docs_2))

In [None]:
# Use case 3 model: same configuration as use case 2, fitted on the col3 documents.
usecase_3_model = BERTopic(
    language="multilingual",
    calculate_probabilities=True,
    verbose=True,
    nr_topics='auto',
)
topics, probs = usecase_3_model.fit_transform(docs_2)

In [None]:
# Topic frequency table for use case 3; the chart below reads its 'Topic' and 'Count' columns.
topics_freq_3_use = usecase_3_model.get_topic_freq()
topics_freq_3_use

In [None]:
# Bar chart: number of documents per topic id for use case 3.
fig = px.bar(
    topics_freq_3_use,
    x='Topic',
    y='Count',
    title='Distribution of Topic Generated UseCase 03',
)
fig.show()

In [None]:
# Top terms for topic -1 (the outlier topic in BERTopic) and topics 0-4.
usecase_3_model.visualize_barchart(topics = [-1,0,1,2,3,4])

In [None]:
# Interactive topic visualisation for use case 3.
usecase_3_model.visualize_topics()

In [None]:
# Inspect what the use case 3 model returns for topic 3.
usecase_3_model.get_topic(3)

In [None]:
# Inspect what the use case 3 model returns for topic 4.
usecase_3_model.get_topic(4)

In [None]:
# Inspect what the use case 3 model returns for topic 1.
usecase_3_model.get_topic(1)

### Assigning New Keywords to Topics

In [None]:
# Ask the use case 3 model for the five topics closest to an English search phrase,
# then print both returned values.
similar_topics, similarity = usecase_3_model.find_topics("my account", top_n=5)
print(similar_topics)
print(similarity)

In [None]:
# Look at topic 3 again, next to the find_topics() output printed above.
usecase_3_model.get_topic(3)

In [None]:
# Same lookup with a Chinese search phrase, to exercise the multilingual embeddings.
# NOTE: the result is bound to `topics`, replacing the fit_transform output of that name.
topics, similarity = usecase_3_model.find_topics("我的賬戶", top_n=5)
print(topics)
print(similarity)