In [38]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
import gensim
import pandas as pd

In [39]:
# Load the review dataset from a CSV file located relative to the notebook's
# working directory, then preview the first rows to check the columns.
df_new = pd.read_csv('Vodaphone_review_dataset.csv')

df_new.head()

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label,Churn_Keyword,Churn,Review_Date,Review_Time,Weekday,Month,Review_Length,Word_Count
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05 22:03:24+00:00,trying to buy broadband through uswitch then h...,-0.02,neutral,1,1,2025-06-05,22:03:24,Thursday,June,280,48
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05 21:51:17+00:00,sona did a great job either my trade in and co...,0.7,positive,0,0,2025-06-05,21:51:17,Thursday,June,68,13
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05 21:28:36+00:00,one of the worst if not the worst when it come...,-0.2,neutral,0,1,2025-06-05,21:28:36,Thursday,June,154,34
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05 21:14:08+00:00,how those people get rate thats a joke my full...,0.275,positive,0,1,2025-06-05,21:14:08,Thursday,June,240,44
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05 20:24:59+00:00,held to ransom by a reputable company purchase...,-0.003634,neutral,1,1,2025-06-05,20:24:59,Thursday,June,1620,320


In [40]:
# Download the 'punkt_tab' and 'stopwords' NLTK packages into an explicit
# directory. The second call is the cell's last expression, so its return
# value is what the cell displays.
# NOTE(review): the absolute path is specific to one machine layout and is
# repeated where nltk.data.path is extended -- consider a single shared constant.
nltk.download('punkt_tab', download_dir='/home/codespace/nltk_data')
nltk.download('stopwords', download_dir='/home/codespace/nltk_data')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
# Preprocessing text

# Make NLTK search the custom download directory. The membership check keeps
# this cell idempotent: re-running it no longer appends the same entry to
# nltk.data.path again and again.
if '/home/codespace/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/home/codespace/nltk_data')

# English stop words held in a set so the per-token lookup in preprocess()
# is a constant-time membership test.
stop_words = set(stopwords.words('english'))
def preprocess(text):
    """Tokenise one review and drop stop words and non-alphabetic tokens.

    Parameters
    ----------
    text : str
        Review text. Any non-string value (for example ``None`` or the float
        NaN that pandas uses for a missing cell) is treated as "no text".

    Returns
    -------
    list of str
        Lower-cased tokens for which ``str.isalpha()`` is true and that are
        not in the module-level ``stop_words`` set. Empty list when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); return an empty token list instead
        # of raising AttributeError.
        return []
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Tokenise every cleaned review; rows whose cleaned text is missing get an
# empty token list rather than being handed to preprocess().
df_new['tokens'] = df_new['Clean_Review'].map(
    lambda review: [] if pd.isnull(review) else preprocess(review)
)



In [42]:
# Count rows whose 'tokens' entry is null. Missing reviews were mapped to an
# empty list when the column was built, so this does not count empty lists.
df_new['tokens'].isna().sum()

0

In [43]:
# Build the gensim vocabulary from the token lists, then convert every
# review's token list to its bag-of-words form with that vocabulary.
dictionary = corpora.Dictionary(documents=df_new['tokens'])
corpus = list(map(dictionary.doc2bow, df_new['tokens']))


In [44]:
# Fit an LDA topic model on the bag-of-words corpus.
# Five topics is the starting point; ten passes over the corpus; the seed is
# fixed (random_state=42), presumably so that re-runs give the same topics.
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
    random_state=42,
    passes=10,
)


In [45]:
# Show the six highest-weighted words of each topic.
# NOTE(review): the label printed here is idx+1 (1-based), whereas get_topic()
# returns the model's raw topic id, so "Topic 1" below corresponds to the
# value 0 in the 'Topic' column. Keep the offset in mind when cross-referencing.
topics = lda_model.print_topics(num_words=6)
for idx, topic in topics:
    print(f"Topic {idx+1}: {topic}")

Topic 1: 0.024*"vodafone" + 0.011*"service" + 0.009*"broadband" + 0.009*"customer" + 0.009*"phone" + 0.009*"would"
Topic 2: 0.047*"service" + 0.039*"helpful" + 0.029*"great" + 0.021*"customer" + 0.018*"store" + 0.018*"friendly"
Topic 3: 0.034*"helpful" + 0.019*"us" + 0.016*"everything" + 0.013*"explained" + 0.012*"best" + 0.012*"thank"
Topic 4: 0.044*"helpful" + 0.034*"service" + 0.023*"excellent" + 0.015*"problem" + 0.011*"much" + 0.010*"thank"
Topic 5: 0.057*"phone" + 0.048*"new" + 0.025*"service" + 0.025*"good" + 0.025*"helped" + 0.024*"great"


In [46]:
def get_topic(doc, no_topic=-1):
    """Return the id of the most probable LDA topic for one tokenised document.

    Parameters
    ----------
    doc : list of str
        Tokens of a single document, as stored in ``df_new['tokens']``.
    no_topic : int, optional
        Value returned when the document has no usable tokens (an empty list,
        or only tokens unknown to ``dictionary``). Defaults to ``-1`` so such
        rows stay distinguishable from real topic ids.

    Returns
    -------
    int
        The topic id, as numbered by ``lda_model``, with the highest
        probability for this document, or ``no_topic`` when there is nothing
        to score.
    """
    if not doc:
        # No tokens means no evidence: any "dominant" topic would only be an
        # artefact of how max() breaks ties, so report "no topic" instead.
        return no_topic

    bow = dictionary.doc2bow(doc)
    if not bow:
        # Every token was outside the dictionary's vocabulary.
        return no_topic

    # Ask for all topics rather than relying on the model's default
    # probability cut-off; otherwise the list can be empty when many topics
    # share the mass, and max() would raise ValueError.
    topic_probs = lda_model.get_document_topics(bow, minimum_probability=0.0)
    dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
    return dominant_topic

In [47]:
# Label every review with the topic id that get_topic() picks for its tokens.
df_new['Topic'] = df_new['tokens'].map(get_topic)

In [54]:
# Preview the frame again to confirm the new 'tokens' and 'Topic' columns.
df_new.head()

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label,Churn_Keyword,Churn,Review_Date,Review_Time,Weekday,Month,Review_Length,Word_Count,tokens,Topic
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05 22:03:24+00:00,trying to buy broadband through uswitch then h...,-0.02,neutral,1,1,2025-06-05,22:03:24,Thursday,June,280,48,"[trying, buy, broadband, uswitch, multiple, er...",0
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05 21:51:17+00:00,sona did a great job either my trade in and co...,0.7,positive,0,0,2025-06-05,21:51:17,Thursday,June,68,13,"[sona, great, job, either, trade, contract, am...",4
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05 21:28:36+00:00,one of the worst if not the worst when it come...,-0.2,neutral,0,1,2025-06-05,21:28:36,Thursday,June,154,34,"[one, worst, worst, comes, signal, get, n, mes...",0
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05 21:14:08+00:00,how those people get rate thats a joke my full...,0.275,positive,0,1,2025-06-05,21:14:08,Thursday,June,240,44,"[people, get, rate, thats, joke, full, fibre, ...",0
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05 20:24:59+00:00,held to ransom by a reputable company purchase...,-0.003634,neutral,1,1,2025-06-05,20:24:59,Thursday,June,1620,320,"[held, ransom, reputable, company, purchased, ...",0


In [55]:
# Display the raw review text column.
# NOTE(review): this shows the whole Series (pandas truncates the repr);
# df_new['Review'].head() would be a lighter preview.
df_new.Review

0       Trying to buy broadband through Uswitch, then ...
1       Sona did a great job either my trade in and co...
2       One of the worst , if not the worst when it co...
3       How those people get 4.6 rate that’s a joke my...
4       Held to ransom by a ‘reputable’ company. Purch...
                              ...                        
9974    absolutely awful, been with vodaphone for 4 mo...
9975    Had a great experience- until we decided to mo...
9976    Karan and Isaac were both amazing. Give them a...
9977                          Thank you mani your amazing
9978    16 Years of Loyalty, But the Last 1.5 Years Ha...
Name: Review, Length: 9979, dtype: object