<a href="https://colab.research.google.com/github/AnittaNJ/Gym_Project/blob/main/Gym_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Applying NLP for topic modelling in a real-life context

**Objective:**

In this project, I analyse PureGym's customer reviews from Google and Trustpilot to uncover the key drivers of negative feedback and turn them into actionable insights for enhancing the customer experience.

## Importing Libraries and Datasets

In [None]:
!pip install bertopic
!pip install transformers
!pip install gensim nltk
!pip install pyLDAvis
!pip install torch

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import torch
import pyLDAvis.gensim_models
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from gensim import corpora
from gensim.models import LdaModel
import nltk
import string
from bs4 import BeautifulSoup
from nltk.probability import FreqDist
from bertopic import BERTopic
# Fetch only the NLTK resources this notebook actually uses: the Punkt
# tokenizer models (word_tokenize) and the English stop-word list.
# 'punkt_tab' is the resource name newer NLTK releases look up for Punkt;
# requesting both names keeps the notebook working on old and new versions.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource, quiet=True)

In [None]:
import warnings
# Hide DeprecationWarning and FutureWarning messages emitted by later cells.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# importing dataset (Google)

In [None]:
# importing dataset (Trustpilot)
# NOTE(review): later cells read a DataFrame named `tf` (columns 'Location Name',
# 'Review Content', 'Review Stars'), but no visible cell assigns it. Load the
# Trustpilot data into `tf` here so the notebook survives Restart & Run All.

## Initial Data Investigation

In [None]:
# Load the Google reviews into `df` and preview the first rows.
# NOTE(review): 'filename.xlsx' looks like a placeholder path — confirm the real file.
df = pd.read_excel('filename.xlsx')
df.head()

In [None]:
# Column dtypes and non-null counts of the Google data.
df.info()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include='object')

In [None]:
# (rows, columns) before rows without a comment are dropped.
df.shape

In [None]:
# Number of rows whose 'Comment' is missing.
df['Comment'].isna().sum()

In [None]:
# Keep only rows that have a comment. `df` is rebound here, so every later
# cell works on the commented reviews only.
df = df.dropna(subset=['Comment'])
df.head()

In [None]:
# (rows, columns) after dropping rows without a comment.
df.shape

In [None]:
# Number of distinct club names in the Google data (missing names excluded).
google_uniq_count = df["Club's Name"].dropna().nunique()
google_uniq_count

In [None]:
# Number of distinct location names in the Trustpilot data (missing names excluded).
trustpilot_uniq_count = tf['Location Name'].dropna().nunique()
trustpilot_uniq_count

In [None]:
# Location names that occur, as exactly the same string, in both data sets.
google_locations = set(df["Club's Name"].dropna())
trustpilot_locations = set(tf['Location Name'].dropna())
common_locations = google_locations & trustpilot_locations
common_loc_count = len(common_locations)
common_loc_count

## Preprocessing of Data

In [None]:
# Translation table mapping every ASCII punctuation character to a space.
# Replacing (rather than deleting) punctuation keeps words apart when the
# writer omitted a space ("great.staff" -> "great staff") and splits
# contractions into pieces that the stop-word list recognises ("don't" ->
# "don t"), instead of producing fused tokens such as "greatstaff" or "dont".
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Filled on first use so the stop-word corpus is read once, not once per review.
_STOP_WORDS_CACHE = {}


def preprocess_text(text):
    """Convert one raw review into a list of cleaned tokens.

    Steps: strip HTML markup, replace punctuation with spaces, lowercase,
    tokenise with NLTK's ``word_tokenize``, then keep only purely alphabetic
    tokens that are not English stop words.

    Args:
        text: The raw review text. Anything that is not a ``str`` (for example
            a NaN left in the column) is treated as an empty review.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    # Keep only the visible text of any HTML that slipped into the review.
    text = BeautifulSoup(text, "html.parser").get_text()

    text = text.translate(_PUNCT_TO_SPACE)
    tokens = word_tokenize(text.lower())

    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOP_WORDS_CACHE['english']

    return [word for word in tokens if word.isalpha() and word not in stop_words]

In [None]:
# Apply preprocess_text to every Google comment: one token list per review.
google_tokens = df['Comment'].dropna().apply(preprocess_text)
google_tokens

In [None]:
# Apply preprocess_text to every non-missing Trustpilot review: one token list per review.
trust_tokens = tf['Review Content'].dropna().apply(preprocess_text)
trust_tokens

In [None]:
# Flatten the per-review token lists into a single list of tokens per source.
google_tokens_flat = [token for sublist in google_tokens for token in sublist]
trust_tokens_flat = [token for sublist in trust_tokens for token in sublist]

## Frequency Distribution

In [None]:
# Token -> occurrence count over all Google comments.
google_freq_dist = FreqDist(google_tokens_flat)
google_freq_dist

In [None]:
# Token -> occurrence count over all Trustpilot reviews.
trust_freq_dist = FreqDist(trust_tokens_flat)
trust_freq_dist

In [None]:
# Print the ten most frequent tokens of each source as (word, count) pairs.
print("Top 10 words in Google dataset:")
print(google_freq_dist.most_common(10))
print('\n')
print("Top 10 words in Trustpilot dataset:")
print(trust_freq_dist.most_common(10))

In [None]:
# Side-by-side bar charts of the ten most frequent tokens per review source.
plt.figure(figsize=(12, 6))

# Left panel: Google.
plt.subplot(1, 2, 1)
google_most_common = google_freq_dist.most_common(10)
google_words = [word for word, _ in google_most_common]
google_counts = [count for _, count in google_most_common]
plt.bar(google_words, google_counts, color='skyblue')
plt.title('Top 10 words in Google dataset')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)

# Right panel: Trustpilot.
plt.subplot(1, 2, 2)
trust_most_common = trust_freq_dist.most_common(10)
trust_words = [word for word, _ in trust_most_common]
trust_counts = [count for _, count in trust_most_common]
plt.bar(trust_words, trust_counts, color='pink')
plt.title('Top 10 words in Trustpilot dataset')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# WordCloud.generate takes a single string, so join each source's tokens with spaces.
google_text = ' '.join(google_tokens_flat)
trust_text = ' '.join(trust_tokens_flat)

# One 800x400 word cloud on a white background per source.
google_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(google_text)
trust_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(trust_text)

In [None]:
# Show the two word clouds next to each other, without axes.
plt.figure(figsize=(14, 6))

panels = [
    (google_wordcloud, 'Word Cloud - Google Dataset'),
    (trust_wordcloud, 'Word Cloud - Trustpilot Dataset'),
]
for position, (cloud, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')

plt.tight_layout()
plt.show()

## Topic Modelling

In [None]:
# Negative reviews: rating strictly below 3 ('Overall Score' for Google,
# 'Review Stars' for Trustpilot).
df_neg = df[df['Overall Score']<3]
tf_neg = tf[tf['Review Stars']<3]

In [None]:
# Tokenise the negative Google reviews only. This rebinds `google_tokens`,
# which until now held the tokens of all Google reviews.
google_tokens = df_neg['Comment'].dropna().apply(preprocess_text)
google_tokens

In [None]:
# Tokenise the negative Trustpilot reviews only. This rebinds `trust_tokens`,
# which until now held the tokens of all Trustpilot reviews.
trust_tokens = tf_neg['Review Content'].dropna().apply(preprocess_text)
trust_tokens

In [None]:
# Flatten the per-review token lists of the negative reviews into one list per source.
google_tokens_flat = [token for sublist in google_tokens for token in sublist]
trust_tokens_flat = [token for sublist in trust_tokens for token in sublist]

# Token counts for the negative reviews; these rebind the frequency
# distributions computed earlier for all reviews.
google_freq_dist = FreqDist(google_tokens_flat)

trust_freq_dist = FreqDist(trust_tokens_flat)

In [None]:
# Ten most frequent tokens per source; at this point the distributions cover
# the negative reviews only.
print("Most common words in Google dataset:")
print(google_freq_dist.most_common(10))
print('\n')
print("Most common words in Trustpilot dataset:")
print(trust_freq_dist.most_common(10))

In [None]:
# Side-by-side bar charts of the ten most frequent tokens in the NEGATIVE
# reviews of each source. The titles say so explicitly, so this figure cannot
# be mistaken for the all-reviews chart shown earlier in the notebook.
plt.figure(figsize=(12, 6))

# Left panel: Google.
plt.subplot(1, 2, 1)
google_most_common = google_freq_dist.most_common(10)
google_words = [word for word, _ in google_most_common]
google_counts = [count for _, count in google_most_common]
plt.bar(google_words, google_counts, color='skyblue')
plt.title('Most Common Words in Negative Google Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)

# Right panel: Trustpilot.
plt.subplot(1, 2, 2)
trust_most_common = trust_freq_dist.most_common(10)
trust_words = [word for word, _ in trust_most_common]
trust_counts = [count for _, count in trust_most_common]
plt.bar(trust_words, trust_counts, color='pink')
plt.title('Most Common Words in Negative Trustpilot Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

Generate word clouds for the negative reviews.

In [None]:
# Join the negative-review tokens of each source into one string for WordCloud.
google_text = ' '.join(google_tokens_flat)
trust_text = ' '.join(trust_tokens_flat)

# Rebinds the word-cloud objects built earlier from all reviews.
google_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(google_text)
trust_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(trust_text)

In [None]:
# Show the negative-review word clouds next to each other. The titles name the
# subset so the figure is distinguishable from the all-reviews word clouds.
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.imshow(google_wordcloud, interpolation='bilinear')
plt.title('Word Cloud - Negative Google Reviews')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(trust_wordcloud, interpolation='bilinear')
plt.title('Word Cloud - Negative Trustpilot Reviews')
plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Locations with negative reviews on both platforms (exact string match between
# "Club's Name" and 'Location Name'). Rebinds `common_locations`.
common_locations = set(df_neg["Club's Name"].dropna()).intersection(set(tf_neg['Location Name'].dropna()))
df_common = df_neg[df_neg["Club's Name"].isin(common_locations)]
tf_common = tf_neg[tf_neg['Location Name'].isin(common_locations)]
# Stack the matching Google rows on top of the matching Trustpilot rows.
common_reviews = pd.concat([df_common, tf_common])
common_reviews

In [None]:
# Single text column for both sources: a missing 'Comment' or 'Review Content'
# is replaced by '' before the two are joined with a space.
common_reviews['Combined Reviews'] = common_reviews['Comment'].fillna('') + ' ' + common_reviews['Review Content'].fillna('')
common_reviews

In [None]:
# Token list for every combined review.
Processed_Review = common_reviews['Combined Reviews'].apply(preprocess_text)
Processed_Review

### BERTopic

In [None]:
# BERTopic is fitted on strings, so re-join each review's tokens with spaces.
text = [' '.join(tokens) for tokens in Processed_Review]

# Fit a BERTopic model with the library defaults; `topics` receives the first
# value returned by fit_transform.
# NOTE(review): no seed is set anywhere, so results may differ between runs — confirm.
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(text)

print("Topics found:")
topic_info = topic_model.get_topic_info()
topic_info.head()

In [None]:
# List every topic with its name and size.
# iterrows() yields the row label, not the topic id; because the outlier topic
# has id -1, labelling by row would shift every id. Use the 'Topic' column.
for _, row in topic_model.get_topic_info().iterrows():
    print(f"Topic {row['Topic']}: {row['Name']} (Frequency: {row['Count']})")

In [None]:
# Get the top 2 topics.
# Exclude the outlier topic (-1) explicitly instead of assuming it occupies
# row 0: when the model produced no outliers, `iloc[1:3]` would skip topic 0.
top_2 = topic_info.loc[topic_info['Topic'] != -1, 'Topic'].head(2)

for topic in top_2:
    print(f"\nTop words for Topic {topic}:")
    print(topic_model.get_topic(topic))

In [None]:
# Render BERTopic's overview plot of the fitted topics.
topic_model.visualize_topics()

In [None]:
# Bar charts for the first five topics.
topic_model.visualize_barchart(top_n_topics=5)

In [None]:
# Heatmap for the first ten topics, drawn at 1000x1000.
topic_model.visualize_heatmap(top_n_topics=10, n_clusters=5, width=1000, height=1000)

### Clustering

In [None]:
# Group the fitted topics into broader themes by clustering their embeddings.
topic_embeddings = topic_model.topic_embeddings_

# KMeans raises a ValueError when asked for more clusters than there are
# samples, so cap the requested 10 clusters at the number of topic embeddings.
n_clusters = min(10, len(topic_embeddings))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(topic_embeddings)

# NOTE(review): this assignment requires exactly one embedding per row of
# topic_info (outlier row included) — confirm for the installed BERTopic version.
topic_info['Cluster'] = clusters

In [None]:
# For every cluster, pool the top words of its member topics and keep the ten
# words that occur most often across them.
cluster_summaries = {}

for cluster_id in range(n_clusters):
    member_topics = topic_info.loc[topic_info['Cluster'] == cluster_id, 'Topic']

    # Topic -1 is the outlier bucket and is left out of the summary.
    top_words = [
        word
        for topic in member_topics
        if topic != -1
        for word, _ in topic_model.get_topic(topic)
    ]

    cluster_summaries[cluster_id] = nltk.FreqDist(top_words).most_common(10)



In [None]:
# Print each cluster's top words; the first three double as a rough theme label.
for cluster_id, top_words in cluster_summaries.items():
    words = [word for word, _ in top_words]
    print(f"\nCluster {cluster_id}:")
    print(f"Top Words: {words}")
    print(f"General Theme: {', '.join(words[:3])}")


List the 20 locations with the highest number of negative reviews on each platform.

In [None]:
# The 20 club names with the most negative Google reviews.
google_neg_loc = df_neg["Club's Name"].value_counts().head(20)
google_neg_loc

In [None]:
# The 20 location names with the most negative Trustpilot reviews.
trust_neg_loc = tf_neg['Location Name'].value_counts().head(20)
trust_neg_loc

In [None]:
# Locations that appear in both top-20 lists. Rebinds `common_locations`.
common_locations = set(google_neg_loc.index).intersection(set(trust_neg_loc.index))
common_locations

In [None]:
# All locations (not just the top 20) with negative reviews on both platforms,
# and the negative reviews of those locations from both sources stacked together.
com_locations = set(df_neg["Club's Name"].dropna()).intersection(set(tf_neg['Location Name'].dropna()))
df_com = df_neg[df_neg["Club's Name"].isin(com_locations)]
tf_com = tf_neg[tf_neg['Location Name'].isin(com_locations)]
merged_df = pd.concat([df_com, tf_com])
merged_df.head()

In [None]:
# Unified location column: each row is expected to carry only one of the two
# name columns, so concatenating them (missing -> '') yields that row's name.
merged_df['Location'] = merged_df["Club's Name"].fillna('') + merged_df['Location Name'].fillna('')

merged_df[['Location','Review Content','Comment']]

In [None]:
# Per location: number of non-null Trustpilot review texts and Google comments.
loc_stats = merged_df.groupby("Location").agg(
    Trust_Reviews=('Review Content', 'count'),
    Google_Reviews=('Comment', 'count')
)

loc_stats['Total_Reviews'] = loc_stats['Trust_Reviews'] + loc_stats['Google_Reviews']

# Locations with the most negative reviews first.
loc_stats = loc_stats.sort_values('Total_Reviews', ascending=False)

# Turn 'Location' back into a regular column.
loc_stats = loc_stats.reset_index()
loc_stats

In [None]:
# The 30 locations with the most negative reviews (loc_stats is already sorted).
top_30_loc = loc_stats["Location"].head(30).tolist()

# .copy() makes this an independent frame, so adding a column below neither
# triggers SettingWithCopyWarning nor depends on how pandas built the slice.
top_30_reviews = merged_df[merged_df["Location"].isin(top_30_loc)].copy()

top_30_reviews['Combined Reviews'] = top_30_reviews['Comment'].fillna('') + ' ' + top_30_reviews['Review Content'].fillna('')

# 'Combined Reviews' cannot contain NaN (both parts were filled with ''), so
# no dropna() is needed before tokenising.
top_30_tokens = top_30_reviews['Combined Reviews'].apply(preprocess_text)
top_30_tokens

In [None]:
# Flatten the token lists of the top-30 locations and count each token.
top_30_tokens_flat = [token for sublist in top_30_tokens for token in sublist]

top_30_freq_dist = FreqDist(top_30_tokens_flat)

print("Most common words in top 30 locations:")
top_30_freq_dist.most_common(10)

In [None]:
# Bar chart of the ten most frequent tokens in the top-30 locations' reviews.
top_30_most_common = top_30_freq_dist.most_common(10)
top_30_words = [word for word, _ in top_30_most_common]
top_30_counts = [count for _, count in top_30_most_common]

plt.figure(figsize=(12, 6))
plt.bar(top_30_words, top_30_counts, color='skyblue')
plt.title('Most Common Words in Top 30 Locations')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Word cloud built from every token of the top-30 locations' negative reviews.
top_30_text = ' '.join(top_30_tokens_flat)
top_30_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(top_30_text)
plt.figure(figsize=(8, 8))
plt.imshow(top_30_wordcloud, interpolation='bilinear')
plt.title('Word Cloud - Top 30 Locations')
plt.axis('off')
plt.show()

In [None]:
# Fit a second BERTopic model, this time on the top-30 locations only.
# This rebinds Processed_Review, text, topic_model, topics and topic_info, so
# the objects of the earlier all-common-locations model are no longer reachable.
Processed_Review = top_30_reviews['Combined Reviews'].apply(preprocess_text)

# One space-joined string per review.
text = [' '.join(word) for word in Processed_Review]

topic_model = BERTopic()
topics, _ = topic_model.fit_transform(text)

print("Topics found:")
topic_info = topic_model.get_topic_info()
topic_info.head()

In [None]:
# List every topic of the top-30 model with its name and size.
# iterrows() yields the row label, not the topic id (the notebook's clustering
# cell treats id -1 as the outlier topic), so label by the 'Topic' column.
for _, row in topic_model.get_topic_info().iterrows():
    print(f"Topic {row['Topic']}: {row['Name']} (Frequency: {row['Count']})")

In [None]:
# First two real topics of the top-30 model. Filter out the outlier topic (-1)
# explicitly rather than assuming it sits in row 0; without outliers,
# `iloc[1:3]` would skip topic 0.
top_2 = topic_info.loc[topic_info['Topic'] != -1, 'Topic'].head(2)

for topic in top_2:
    print(f"\nTop words for Topic {topic}:")
    print(topic_model.get_topic(topic))

In [None]:
# Bar charts for the first five topics of the top-30 model.
topic_model.visualize_barchart(top_n_topics=5)

In [None]:
# Heatmap for the first ten topics of the top-30 model, drawn at 1000x1000.
topic_model.visualize_heatmap(top_n_topics=10, n_clusters=5, width=1000, height=1000)

## Sentiment Analysis

In [None]:
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Emotion classifier used to label each review.
# Run it on the `device` chosen in the cell above instead of hard-coding GPU 0,
# so the notebook also works on a CPU-only runtime.
emotion_classifier = pipeline("text-classification",
                              model="bhadresh-savani/bert-base-uncased-emotion",
                              tokenizer="bhadresh-savani/bert-base-uncased-emotion",
                              device=device)

In [None]:
# Label every Google comment with the classifier's first returned label.
# Only the first 510 characters of each comment are passed to the model.
# NOTE(review): the final line renders the whole frame; `df.head()` would keep the output small.
df['Top Emotion'] = df['Comment'].dropna().apply(lambda text: emotion_classifier(text[:510])[0]['label'])
df

In [None]:
# Label every non-missing Trustpilot review with the classifier's first returned
# label; rows without review text end up with NaN in 'Top Emotion'.
# Only the first 510 characters of each review are passed to the model.
tf['Top Emotion'] = tf['Review Content'].dropna().apply(lambda text: emotion_classifier(text[:510])[0]['label'])
tf

In [None]:
# Rebuild the negative subsets so they include the new 'Top Emotion' column
# (the earlier df_neg / tf_neg were created before it existed).
df_neg = df[df['Overall Score'] < 3]
tf_neg = tf[tf['Review Stars'] < 3]

all_neg_reviews = pd.concat([df_neg, tf_neg])

# Number of negative reviews per predicted emotion.
emotion_counts = all_neg_reviews['Top Emotion'].value_counts()

plt.figure(figsize=(10, 6))
plt.bar(emotion_counts.index, emotion_counts.values)
plt.xlabel('Emotion')
plt.ylabel('Count')
plt.title('Distribution of Top Emotions in Negative Reviews')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Single text column for both sources (a missing part is replaced by '').
all_neg_reviews['Combined Reviews'] = all_neg_reviews['Comment'].fillna('') + ' ' + all_neg_reviews['Review Content'].fillna('')
all_neg_reviews

In [None]:
# Text of the negative reviews whose top emotion is 'anger'.
all_angry_reviews = all_neg_reviews[all_neg_reviews['Top Emotion'] == 'anger']['Combined Reviews']
# reset_index() turns the Series into a DataFrame with the old index as a column.
all_angry_reviews = all_angry_reviews.reset_index()
all_angry_reviews['Combined Reviews']

### BERTopic

In [None]:
# Topic model restricted to the angry negative reviews.
processed_angry_reviews = all_angry_reviews['Combined Reviews'].apply(preprocess_text)

# One space-joined string per review.
angry_text = [' '.join(word) for word in processed_angry_reviews]

model_angry = BERTopic()
topics_angry, _ = model_angry.fit_transform(angry_text)

info_angry = model_angry.get_topic_info()
info_angry

In [None]:
# Render BERTopic's overview plot for the angry-review model.
model_angry.visualize_topics()

## LLM

In [None]:
# Same negative-review filter as in the earlier sections (rating below 3),
# recomputed here so this section starts from fresh subsets.
df_neg = df[df['Overall Score']<3]

tf_neg = tf[tf['Review Stars']<3]

In [None]:
# Work on a reproducible random subset of the negative reviews, because running
# the LLM over all of them takes too long.
neg_reviews = pd.concat([df_neg, tf_neg])

# DataFrame.sample raises when n exceeds the number of rows (sampling without
# replacement), so cap the subset size at what is actually available.
bad_rev = neg_reviews.sample(n=min(2000, len(neg_reviews)), random_state=42)

# Single text column for both sources (a missing part is replaced by '').
bad_rev['Combined Reviews'] = bad_rev['Comment'].fillna('') + ' ' + bad_rev['Review Content'].fillna('')
bad_rev

### Generating topics

In [None]:
# Instruction-tuned LLM used to summarise each review into topics.
# It runs on the `device` selected in the Sentiment Analysis section rather
# than a hard-coded GPU 0, so a CPU-only runtime fails slowly instead of crashing.
text_generator = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",
    tokenizer="tiiuae/falcon-7b-instruct",
    max_new_tokens=1000,
    device=device
)

prompt = "In the following customer review, pick out the main 3 topics. Return them in a numbered list format, with each one on a new line."

bad_rev['prompted_review'] = bad_rev['Combined Reviews'].apply(lambda x: f"{prompt} {x}")

# return_full_text=False keeps only the model's continuation. By default the
# pipeline echoes the prompt (and therefore the review) in 'generated_text',
# and any review line starting with a digit would later be mistaken for a topic.
bad_rev['generated_topics'] = bad_rev['prompted_review'].apply(
    lambda x: text_generator(x, return_full_text=False)[0]['generated_text']
)

In [None]:
def extract_topics(text):
  """Keep only the numbered-list lines of a generated answer.

  A line is kept when its first non-whitespace character is a digit. Kept
  lines are returned unchanged (leading whitespace included), joined with
  newlines; the result is '' when no line qualifies.
  """
  kept = []
  for raw_line in text.split('\n'):
    stripped = raw_line.strip()
    if stripped and stripped[0].isdigit():
      kept.append(raw_line)
  return '\n'.join(kept)

# Reduce every generated answer to its numbered-list lines.
bad_rev['extracted_topics'] = bad_rev['generated_topics'].dropna().apply(extract_topics)


In [None]:
# Collect every extracted topic line into one flat list.
all_topics = []
# dropna(): rows without a generated answer hold NaN, which has no .split().
for topics_str in bad_rev['extracted_topics'].dropna():
  for line in topics_str.split('\n'):
    line = line.strip()
    # ''.split('\n') yields [''], so a review with no extracted topics would
    # otherwise add an empty "topic" for BERTopic and the LLM to process.
    if line:
      all_topics.append(line)

all_topics

### BERTopic

In [None]:
# Topic model over the LLM-extracted topic lines (one document per line).
model_all_topics = BERTopic()
topics_all_topics, _ = model_all_topics.fit_transform(all_topics)

info_all_topics = model_all_topics.get_topic_info()
info_all_topics

### Generating insights

In [None]:
prefix = "For the following text topics obtained from negative customer reviews, can you give some actionable insights that would help this gym company? "

# One LLM call per extracted topic line; insights[i] belongs to all_topics[i].
# NOTE(review): this is one generation per line in all_topics, which can be
# very slow — confirm that per-line (rather than per-BERTopic-topic) is intended.
insights = []
for topic in all_topics:
  prompt = prefix + topic
  # return_full_text=False stores only the model's answer; by default the
  # pipeline returns the prompt followed by the answer.
  generated_insight = text_generator(prompt, return_full_text=False)[0]['generated_text']
  insights.append(generated_insight)

In [None]:
# Print each topic line next to the insight generated for it
# (the two lists are index-aligned and of equal length).
for topic_line, insight in zip(all_topics, insights):
  print(f"Insight for topic {topic_line}: {insight}\n")

## LDA Model

In [None]:
# Token lists of the sampled negative reviews, as a plain list of lists for gensim.
all_negative_reviews = bad_rev['Combined Reviews'].dropna().apply(preprocess_text)
processed_reviews = all_negative_reviews.tolist()

In [None]:
# Build the gensim dictionary from the token lists, then convert every review
# to its bag-of-words representation.
dictionary = corpora.Dictionary(processed_reviews)
corpus = [dictionary.doc2bow(review) for review in processed_reviews]

In [None]:
num_topics = 10  # Specify the number of topics = 10
# random_state pins LDA's random initialisation so the topics printed below are
# the same on every Restart & Run All (42 matches the seed used for KMeans and
# for sampling elsewhere in this notebook).
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

In [None]:
# Print every LDA topic; -1 asks print_topics for all topics.
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")

In [None]:
# Interactive pyLDAvis view of the LDA model, rendered inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis