## Topic Modeling & Classification. A case study for Novo Nordisk and Scientific Intelligence. All rights reserved. Carl Johanson, 2023

Load the libraries

In [None]:
import warnings
#install a blanket "ignore" filter: no warning of any category is shown by the cells that follow
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
import pyLDAvis.gensim
import nltk
from pprint import pprint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import phrases
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from gensim.models import HdpModel
import matplotlib.pyplot as plt
import string
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Load the translated dataset

In [None]:
#read the translated corpus from disk into a DataFrame
csv_path = '/Users/carljohanson/Desktop/Speciale - Code Project/Code/Final/data/translated_text.csv'
df = pd.read_csv(csv_path)

#report (rows, columns) as the cell output
df.shape

#uncomment to preview the first rows
#df.head(50)

Corpus preprocessing - First Round

In [None]:
#tokenization of the clean and translated text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
#punctuation characters as a set, built once for the per-token check below
punctuation_chars = set(string.punctuation)

#defining a function to preprocess the text
def preprocess_text(text):
    """Lower-case, tokenize, filter and lemmatize one document.

    Args:
        text: the document as a string. Any non-string value (for example
            the float NaN pandas uses for a missing cell) yields an empty list.

    Returns:
        A list of lemmatized tokens with English stop words, tokens
        containing an apostrophe and punctuation-only tokens removed.
    """
    #a missing cell has no .lower(); treat it as an empty document
    if not isinstance(text, str):
        return []

    # Tokenize text
    tokens = nltk.word_tokenize(text.lower())

    # Remove stopwords and lemmatize
    cleaned = []
    for token in tokens:
        if token in stop_words or "'" in token:
            continue
        #drop tokens made only of punctuation; a substring test against
        #string.punctuation would let tokens such as "..." or "--" through
        if all(char in punctuation_chars for char in token):
            continue
        cleaned.append(lemmatizer.lemmatize(token))

    return cleaned

#tokenise every translated document and keep the result in a new column
translated_docs = df['translated_text']
df['tokens_text'] = translated_docs.apply(preprocess_text)

#preview the first 50 token lists
df['tokens_text'].head(50)

Corpus preprocessing - Second Round

In [None]:
#train the phrase detectors used to merge bigrams and trigrams
# English connector words supplied by gensim
connector_words = phrases.ENGLISH_CONNECTOR_WORDS

# settings shared by both detectors
phrase_settings = dict(min_count=5, threshold=100, connector_words=connector_words)

# bigram detector trained on the token lists
bigram = Phrases(df['tokens_text'], **phrase_settings)
bigram_phraser = Phraser(bigram)

# trigram detector trained on the bigram-merged token lists
trigram = Phrases(bigram_phraser[df['tokens_text']], **phrase_settings)
trigram_phraser = Phraser(trigram)

#merge detected bigrams and trigrams in a single token list
def preprocess_text_bigram_trigram(tokens):
    """Run *tokens* through the bigram phraser, then the trigram phraser, and return the result."""
    return trigram_phraser[bigram_phraser[tokens]]

#run the phrase models over every tokenised document
tokenised_docs = df['tokens_text']
df['preprocessed_text'] = tokenised_docs.apply(preprocess_text_bigram_trigram)

#preview of the merged tokens
df['preprocessed_text'].head(50)

#report (rows, columns) of the dataset
df.shape

Corpus preprocessing - Third Round

In [None]:
#keep only documents with more than 5 tokens; this also removes the empty
#lists, so no separate (and costly) string cast of the whole frame is needed.
#.copy() makes df an independent frame so the column assignment below does
#not write into a slice of the unfiltered DataFrame
df = df[df['preprocessed_text'].map(len) > 5].copy()

#keep only tokens longer than 3 characters
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: [item for item in x if len(item) > 3])

#show the shape of the dataset
df.shape

Visualize the word count distribution

In [None]:
#word count distribution
#number of tokens per document; the column holds token lists, so len()
#counts them directly (splitting str(list) on spaces reports 1 for an empty list)
df["word_count"] = df['preprocessed_text'].apply(len)

#visualize it
plt.figure(figsize=(12, 6))
plt.hist(df["word_count"], bins=100)
plt.title("Word Count Distribution")
plt.xlabel("Number of Words")
plt.xticks(range(0, max(df["word_count"]), 20))  # Adjust the range and step size as needed
plt.ylabel("Frequency")
plt.show()

Prepare a corpus and a dictionary using the preprocessed text for the HDP and LDA algorithms

In [None]:
#token lists of all documents as a plain Python list of lists
doc_list = list(df['preprocessed_text'])

#token <-> id mapping built from those documents
dictionary = corpora.Dictionary(doc_list)

#bag-of-words vector for every document, in the same order as doc_list
corpus = list(map(dictionary.doc2bow, doc_list))

Show the dictionary and the corpus

In [None]:
#summary of the dictionary
print(dictionary)

#token -> id pairs as the cell output
token_to_id = dictionary.token2id
token_to_id.items()

In [None]:
#show a portion of the corpus: the first two (token id, count) pairs of each document
for bow in corpus:
    #slice instead of indexing so a document with fewer than two distinct
    #tokens prints what it has rather than raising IndexError
    print(': '.join(str(pair) for pair in bow[:2]), end=' ')

In [None]:
#word counts per document, with token ids replaced by the tokens themselves
word_counts = [[(dictionary[token_id], count) for token_id, count in line] for line in corpus]
for item in word_counts:
    #slice instead of indexing so a document with fewer than two distinct
    #tokens prints what it has rather than raising IndexError
    print(': '.join(str(pair) for pair in item[:2]), end=' ')

The HDP algorithm estimates the best number of topics

In [None]:
#store the inferred number of topics
num_topics_list = []

#run the HDP algorithm 20 times
for _ in range(20):
    #randomly set the concentration parameters
    alpha = random.uniform(0.1, 2)
    gamma = random.uniform(0.1, 2)

    #create an HDP model and train it; the sampled concentrations are passed
    #in explicitly, otherwise every run would use the library defaults
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, alpha=alpha, gamma=gamma)

    #get the number of topics reported by the HDP model
    #NOTE(review): show_topics() is called with its defaults, which limit how
    #many topics are returned - confirm this count reflects the topics HDP
    #actually uses rather than that limit
    num_topics = len(hdpmodel.show_topics())

    #store the inferred number of topics
    num_topics_list.append(num_topics)

#show the number of topics inferred in each run (outside the loop so the
#notebook renders it as the cell output)
num_topics_list

Compute the best number of topics by coherence value

In [None]:
#Inspired by David Roepke from https://www.dataknowsall.com/topicmodels.html
#compute coherence values for different numbers of topics
limit = 100
start = 5
step = 5

#store the inferred values
coherence_values = []

#run the LDA algorithm for different numbers of topics to detect optimal number of topics
for num_topics in range(start, limit, step):
    model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
    coherence_model = CoherenceModel(model=model, texts=doc_list, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherence_model.get_coherence())

#plot the coherence scores
plt.figure(figsize=(10,8))
x = range(start, limit, step)
ax = sns.lineplot(x=x, y=coherence_values, color='#238C8C')
plt.title("Best Number of Topics for LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.xlim(start, limit)
#ticks sit on the evaluated topic counts (start, start+step, ...) so each
#plotted point lines up with a labelled tick
plt.xticks(x)
plt.show()

Compute the optimal number of iterations for the LDA algorithm

In [None]:
#define the range of iterations to evaluate (the number of topics is fixed at 20)
iterations = range(10, 2000, 100)  # Adjust the range and step size as needed

#store the inferred perplexity values
perplexity_values = []
for num_iterations in iterations:
    model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, iterations=num_iterations, random_state=42)
    #log_perplexity() returns the per-word likelihood bound, not the
    #perplexity; gensim defines perplexity as 2 ** (-bound), so convert before
    #storing to make the values match the plot labels
    per_word_bound = model.log_perplexity(corpus)
    perplexity_values.append(np.exp2(-per_word_bound))

#plot the perplexity scores
plt.figure(figsize=(10, 8))
plt.plot(iterations, perplexity_values, color='#238C8C')
plt.title("Perplexity for LDA Model")
plt.xlabel("Number of Iterations")
plt.ylabel("Perplexity")
legend_elements = [
    Line2D([0], [0], color='#238C8C', ls='-', label='Perplexity'),
]
plt.legend(handles=legend_elements, loc='upper right')
plt.show()

The LDA algorithm

In [None]:
#fit the final LDA model on the BoW corpus with 20 topics
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    iterations=1010,
    alpha="auto",
    eta="auto",
    random_state=42,
)

In [None]:
#import kept so any other cell that relies on `datapath` keeps working
from gensim.test.utils import datapath

#Save model to disk.
#the target is an absolute path, so it is passed to save() directly;
#datapath() is gensim's helper for locating its bundled test data and is not
#meant for building output paths
temp_file = "/Users/carljohanson/Desktop/Speciale - Code Project/Code/Models/lda_model"
lda_model.save(temp_file)

Visualize the results from the LDA algorithm

In [None]:
#print every document as a list of [token, count] pairs
for bow in corpus:
    print([[dictionary[token_id], count] for token_id, count in bow])

In [None]:
#pretty-print the keyword summary returned by the model
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)

#apply the model to the whole corpus
doc_lda = lda_model[corpus]

In [None]:
#get the top 15 words for each topic
top_words_per_topic = []
topic_labels = []
#ask for every topic the model holds instead of a hard-coded 18, so no topic
#is left out of the table
for topic_idx, topic in lda_model.show_topics(num_topics=lda_model.num_topics, num_words=15, formatted=False):
    top_words = [word for word, _ in topic]
    top_words_per_topic.append(top_words)
    topic_labels.append(f"Topic {topic_idx}")

    #print the output
    formatted_top_words = ", ".join(top_words)
    print(f"Topic {topic_idx}: {formatted_top_words}")

#create a DataFrame with topics as columns
topics_df = pd.DataFrame(top_words_per_topic).T
#columns must be given as a collection with one label per topic; assigning a
#bare string raises TypeError
topics_df.columns = topic_labels

#print the DataFrame
topics_df

In [None]:
#get the topic distribution for each document in the corpus
topic_dist = [lda_model.get_document_topics(doc) for doc in corpus]

#define the number of documents to sample per topic
num_docs_per_topic = 5

#collect, per topic, the (document position, probability) pairs of the
#documents in which that topic appears
docs_per_topic = {}
for i, dist in enumerate(topic_dist):
    for topic, prob in dist:
        docs_per_topic.setdefault(topic, []).append((i, prob))

for topic in range(lda_model.num_topics):
    print(f"\nTopic {topic+1}:\n")
    #a topic that appears in no document's distribution has no entry; fall
    #back to an empty list instead of raising KeyError
    docs = sorted(docs_per_topic.get(topic, []), key=lambda x: x[1], reverse=True)[:num_docs_per_topic]
    for doc_id, _ in docs:
        #doc_id is a position in corpus, which was built from df in row order;
        #df's index has gaps after the row filtering, so select by position
        #(.iloc) rather than by label
        print(df['preprocessed_text'].iloc[doc_id])

Evaluation

In [None]:
#baseline c_v coherence of the trained model, before any hyperparameter tuning
cm = CoherenceModel(
    model=lda_model,
    corpus=corpus,
    texts=doc_list,
    coherence='c_v',
)
coherence_lda = cm.get_coherence()

#report the score
print(coherence_lda)

In [None]:
#perplexity. The lower the better
#log_perplexity() returns the per-word likelihood bound; gensim defines
#perplexity as 2 ** (-bound), so convert before printing it under that name
print('\nPerplexity: ', np.exp2(-lda_model.log_perplexity(corpus)))

Visualise distribution, relevance and most salient terms in each topic

In [None]:
#switch pyLDAvis to inline notebook rendering
pyLDAvis.enable_notebook()

#build the interactive topic view from the model, corpus and dictionary
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary,
)

#render it as the cell output
vis

Try clustering the documents to see similarity between documents

In [None]:
#vectorise the translated documents separately from the gensim BoW used for LDA
vectorizer = CountVectorizer(lowercase=False)
X = vectorizer.fit_transform(df["translated_text"])

#candidate cluster counts and the silhouette score obtained for each
cluster_range = range(2, 30)
silhouette_scores = []

#fit one K-means model per candidate and score its labelling
for n_clusters in cluster_range:
    candidate = KMeans(n_clusters=n_clusters, random_state=42)
    candidate.fit(X)
    silhouette_scores.append(silhouette_score(X, candidate.labels_))

# Pick the candidate with the highest silhouette score (first one on ties)
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_k = cluster_range[best_position]

# Refit with the chosen cluster count and store each document's label
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X)
df["cluster"] = kmeans.labels_
print(df[["preprocessed_text", "cluster"]].head(10))

# Documents per cluster as a two-column table
cluster_counts = df["cluster"].value_counts().reset_index()
cluster_counts.columns = ["cluster", "count"]

# Bar chart of cluster sizes
plt.figure(figsize=(12, 6))
plt.bar(cluster_counts["cluster"], cluster_counts["count"])
plt.title("Document Distribution by Cluster")
plt.xlabel("Cluster")
plt.ylabel("Number of Documents")
plt.xticks(range(optimal_k))
plt.show()

In [None]:
#project the document vectors to 2D with t-SNE and keep the coordinates on df
tsne = TSNE(n_components=2, init="random", random_state=42)
X_tsne = tsne.fit_transform(X)
df["tsne_x"] = X_tsne[:, 0]
df["tsne_y"] = X_tsne[:, 1]

#one scatter series per K-means cluster
plt.figure(figsize=(12, 8))
for cluster_id in range(optimal_k):
    members = df[df["cluster"] == cluster_id]
    plt.scatter(members["tsne_x"], members["tsne_y"], label=f"Cluster {cluster_id}", alpha=0.6)

#title, axis labels and legend
plt.title("Document Clustering with t-SNE")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.legend()

#render the figure
plt.show()

Determine the best (dominant) topic of each article

In [None]:
#full topic distribution of every document in the corpus
topic_distributions = [
    lda_model.get_document_topics(doc, minimum_probability=0.0)
    for doc in corpus
]

#the dominant topic is the topic id with the highest probability in each distribution
df['dominant_topic'] = [
    max(distribution, key=lambda pair: pair[1])[0]
    for distribution in topic_distributions
]

#uncomment to inspect the distribution of the first document
#print(topic_distributions[0])

#uncomment to inspect the dataframe
#df[['description', 'dominant_topic']]

Visualise the distribution of dominant topics across the documents

In [None]:
#count the frequency of each dominant topic
topic_counts = df["dominant_topic"].value_counts().reset_index()
topic_counts.columns = ["topic", "count"]

#sort topics by their index
topic_counts = topic_counts.sort_values("topic")

#plot the distribution of dominant topics across the documents
plt.figure(figsize=(12, 6))
plt.bar(topic_counts["topic"], topic_counts["count"])
plt.title("Document Distribution by Dominant Topic")
plt.xlabel("Topic")
plt.ylabel("Number of Documents")
#tick on the topic ids that are actually plotted; range(len(topic_counts))
#mislabels the axis whenever some topic is never dominant, because the ids
#are then no longer 0..len-1
plt.xticks(topic_counts["topic"])
plt.show()

In [None]:
#clustering documents based by their dominant topic value
from sklearn.manifold import TSNE

#reduce the dimensionality to 2D
tsne = TSNE(n_components=2, init="random", random_state=42)
X_2d = tsne.fit_transform(X)

#use a colormap with one colour per dominant topic that actually occurs;
#plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available
#in recent matplotlib releases
plt.figure(figsize=(14, 8))
unique_topics = np.unique(df["dominant_topic"])
cmap = plt.get_cmap("viridis", unique_topics.size)

for color_idx, topic in enumerate(unique_topics):
    #plain boolean array so it can index the NumPy coordinate matrix
    mask = (df["dominant_topic"] == topic).to_numpy()
    #index the colormap by position, not by topic id: the map only holds
    #unique_topics.size colours, so a topic id beyond that range would be
    #clipped to the last colour
    plt.scatter(X_2d[mask, 0], X_2d[mask, 1], c=[cmap(color_idx)], label=f"Topic {topic}")

#display the plot and add labels
plt.title("Scatter plot of documents by dominant topic")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.legend()
plt.show()

Prepare for the ML classifier

In [None]:
#prepare the topic proportions that feed the ML classifier

#per document: the probability of every topic, in the order the model returns them
topic_proportions = [
    [probability for _, probability in lda_model.get_document_topics(doc, minimum_probability=0.0)]
    for doc in corpus
]

#store the proportion vectors on the dataframe
df['topic_proportions'] = topic_proportions

#uncomment to inspect the column
#df['topic_proportions']

#Correcting a visual error in datalab from using PyLDAvis

#display is imported explicitly so the cell does not depend on the notebook
#front end injecting it into the namespace
from IPython.display import HTML, display

#style overrides, one rule per line; adjacent literals are concatenated into a single string
css_str = (
    '<style> '
    '.jp-Button path { fill: #616161;} '
    'text.terms { fill: #616161;} '
    '.jp-icon-warn0 path {fill: var(--jp-warn-color0);} '
    '.bp3-button-text path { fill: var(--jp-inverse-layout-color3);} '
    '.jp-icon-brand0 path { fill: var(--jp-brand-color0);} '
    'text.terms { fill: #616161;} '
    '</style>'
)
display(HTML(css_str))

Splitting the dataset into training and testing sets

In [None]:
#feature matrix: one row of topic proportions per document
#NOTE(review): this rebinds X, which earlier cells use for the CountVectorizer matrix
feature_rows = list(df['topic_proportions'])
X = np.array(feature_rows)

#label vector: the dominant topic of each document
#NOTE(review): dominant_topic is the arg-max of the same topic distributions
#stored in X, so the labels are a direct function of the features - confirm
#this is the intended classification target
y = df['dominant_topic'].to_numpy()

#uncomment to inspect the shapes of X and y
#X.shape, y.shape

In [None]:
#hold out 20% of the documents for testing (80:20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#uncomment to inspect the shapes of the four arrays
#X_train.shape, X_test.shape, y_train.shape, y_test.shape

#uncomment to inspect the training features
#print(X_train)

Testing all classifiers at once with the lazypredict library to determine the best-performing one

In [None]:
#fit every classifier lazypredict offers and keep its per-model predictions
all_clf = LazyClassifier(predictions=True, random_state=42)
fit_output = all_clf.fit(X_train, X_test, y_train, y_test)
models, predictions = fit_output

#model comparison table as the cell output
models

Random Forest Classifier

In [None]:
#detect the best parameters for the Random Forest Classifier using GridSearchCV
#define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],
    #'auto' is left out: for classifiers it was an alias of 'sqrt' (so it only
    #duplicated work) and current scikit-learn releases reject it
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8, 10, 15, 20, 30, 40, 50],
    'criterion' :['gini', 'entropy']
}

#create a base model; seeded like the other estimators in this notebook so
#the search result is reproducible
rf = RandomForestClassifier(random_state=42)

#instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                           cv = 3, n_jobs = -1, verbose = 2)

#fit the grid search to the data
grid_search.fit(X_train, y_train)

#get the best parameters
grid_search.best_params_

In [None]:
#fit the final Random Forest with a fixed set of hyperparameters
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=40,
    max_features='sqrt',
    criterion='gini',
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
#predict the test set
#y_pred holds the random forest's predicted label for every row of X_test;
#the evaluation cells compare it against y_test
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import f1_score

# Compute the accuracy of the random forest on the test set.
# y_pred holds the random forest predictions; `predictions` is the
# LazyClassifier output from the model-comparison cell and is not a label
# vector for this model, so it must not be scored here.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Compute the F1-score of the classifier
f1 = f1_score(y_test, y_pred, average='weighted')  # or 'macro'/'micro', depending on your task
print(f"F1-Score: {f1}")

In [None]:
#confusion matrix and accuracy of the random forest on the test set
test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_confusion)
print(test_accuracy)

In [None]:
#per-class precision, recall and F1 on the test set
report = classification_report(y_test, y_pred)
print(report)

Save the random forest model

In [None]:
#persist the fitted random forest with maximum compression
import joblib
rf_model_path = '/Users/carljohanson/Desktop/Speciale - Code Project/Code/Models/rf_model'
joblib.dump(rf, rf_model_path, compress=9)