In [27]:
import pandas as pd

df = pd.read_csv("vodafone_reviews.csv")

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Rating    10000 non-null  int64 
 1   Review    9423 non-null   object
 2   Name      9998 non-null   object
 3   Location  10000 non-null  object
 4   Date      10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [28]:
df.head()

Unnamed: 0,Rating,Review,Name,Location,Date
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05T22:03:24.000Z
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05T21:51:17.000Z
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05T21:28:36.000Z
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05T21:14:08.000Z
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05T20:24:59.000Z


In [29]:
df.isnull().sum()


Rating        0
Review      577
Name          2
Location      0
Date          0
dtype: int64

In [30]:
df.Rating.value_counts()

Rating
5    9130
1     727
4      80
2      42
3      21
Name: count, dtype: int64

In [31]:
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove URLs
    text = re.sub(r"[^a-z\s]", '', text)  # keep letters only
    text = re.sub(r"\s+", ' ', text).strip()
    return text

df['Clean_Review'] = df['Review'].apply(clean_text)


In [32]:
df

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05T22:03:24.000Z,trying to buy broadband through uswitch then h...
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05T21:51:17.000Z,sona did a great job either my trade in and co...
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05T21:28:36.000Z,one of the worst if not the worst when it come...
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05T21:14:08.000Z,how those people get rate thats a joke my full...
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05T20:24:59.000Z,held to ransom by a reputable company purchase...
...,...,...,...,...,...,...
9995,1,"absolutely awful, been with vodaphone for 4 mo...",Ross Harker,GB,2025-03-05T11:25:39.000Z,absolutely awful been with vodaphone for month...
9996,1,Had a great experience- until we decided to mo...,Fern Jones,GB,2025-03-04T22:32:25.000Z,had a great experience until we decided to mov...
9997,5,Karan and Isaac were both amazing. Give them a...,Evan Calitz,GB,2025-03-04T22:25:37.000Z,karan and isaac were both amazing give them a ...
9998,5,Thank you mani your amazing,annalisa torciano,IT,2025-03-04T21:50:39.000Z,thank you mani your amazing


In [33]:
from textblob import TextBlob

df['Sentiment'] = df['Clean_Review'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [34]:
df

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05T22:03:24.000Z,trying to buy broadband through uswitch then h...,-0.020000
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05T21:51:17.000Z,sona did a great job either my trade in and co...,0.700000
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05T21:28:36.000Z,one of the worst if not the worst when it come...,-0.200000
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05T21:14:08.000Z,how those people get rate thats a joke my full...,0.275000
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05T20:24:59.000Z,held to ransom by a reputable company purchase...,-0.003634
...,...,...,...,...,...,...,...
9995,1,"absolutely awful, been with vodaphone for 4 mo...",Ross Harker,GB,2025-03-05T11:25:39.000Z,absolutely awful been with vodaphone for month...,-0.280000
9996,1,Had a great experience- until we decided to mo...,Fern Jones,GB,2025-03-04T22:32:25.000Z,had a great experience until we decided to mov...,0.100000
9997,5,Karan and Isaac were both amazing. Give them a...,Evan Calitz,GB,2025-03-04T22:25:37.000Z,karan and isaac were both amazing give them a ...,0.400000
9998,5,Thank you mani your amazing,annalisa torciano,IT,2025-03-04T21:50:39.000Z,thank you mani your amazing,0.600000


In [35]:
def get_sentiment_label(polarity):
    # Define sentiment labels based on polarity
    if polarity > 0.2:
        return 'positive'
    elif polarity < -0.2:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment Label'] = df['Sentiment'].apply(get_sentiment_label)

In [36]:
df

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05T22:03:24.000Z,trying to buy broadband through uswitch then h...,-0.020000,neutral
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05T21:51:17.000Z,sona did a great job either my trade in and co...,0.700000,positive
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05T21:28:36.000Z,one of the worst if not the worst when it come...,-0.200000,neutral
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05T21:14:08.000Z,how those people get rate thats a joke my full...,0.275000,positive
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05T20:24:59.000Z,held to ransom by a reputable company purchase...,-0.003634,neutral
...,...,...,...,...,...,...,...,...
9995,1,"absolutely awful, been with vodaphone for 4 mo...",Ross Harker,GB,2025-03-05T11:25:39.000Z,absolutely awful been with vodaphone for month...,-0.280000,negative
9996,1,Had a great experience- until we decided to mo...,Fern Jones,GB,2025-03-04T22:32:25.000Z,had a great experience until we decided to mov...,0.100000,neutral
9997,5,Karan and Isaac were both amazing. Give them a...,Evan Calitz,GB,2025-03-04T22:25:37.000Z,karan and isaac were both amazing give them a ...,0.400000,positive
9998,5,Thank you mani your amazing,annalisa torciano,IT,2025-03-04T21:50:39.000Z,thank you mani your amazing,0.600000,positive


 STEP 3: Churn Keyword Flags

In [37]:
churn_keywords = ['cancel', 'leave', 'switch', 'disconnect', 'terminate', 'quit', 'move', 'gone']

df['Churn_Keyword'] = df['Clean_Review'].apply(
    lambda x: int(any(word in x for word in churn_keywords))
)

In [38]:
df = df[df['Rating'].isin([1, 2, 4, 5])].copy()
df['Churn'] = df['Rating'].apply(lambda x: 1 if x <= 2 else 0)


In [39]:
df['Sentiment'].value_counts()



Sentiment
 0.000000    834
 0.200000    566
 0.800000    361
 0.500000    305
 1.000000    255
            ... 
 0.011453      1
 0.931667      1
 0.233232      1
 0.202020      1
-0.133333      1
Name: count, Length: 2863, dtype: int64

In [40]:
df['Churn'].value_counts()
pd.crosstab(df['Churn'], df['Sentiment'])

Sentiment,-1.000000,-0.910000,-0.900000,-0.833333,-0.800000,-0.750000,-0.740000,-0.725000,-0.715000,-0.688061,...,0.902500,0.910000,0.931667,0.933333,0.950000,0.953333,0.955000,0.976667,0.984444,1.000000
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,1,68,1,6,20,7,7,2,1,255
1,8,1,1,1,3,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [41]:
df.head()

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label,Churn_Keyword,Churn
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05T22:03:24.000Z,trying to buy broadband through uswitch then h...,-0.02,neutral,1,1
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05T21:51:17.000Z,sona did a great job either my trade in and co...,0.7,positive,0,0
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05T21:28:36.000Z,one of the worst if not the worst when it come...,-0.2,neutral,0,1
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05T21:14:08.000Z,how those people get rate thats a joke my full...,0.275,positive,0,1
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05T20:24:59.000Z,held to ransom by a reputable company purchase...,-0.003634,neutral,1,1


In [42]:
df['Churn_Keyword'].value_counts()

Churn_Keyword
0    9328
1     651
Name: count, dtype: int64

In [43]:
df['Churn'].value_counts()

Churn
0    9210
1     769
Name: count, dtype: int64

In [44]:
df['Date'] = pd.to_datetime(df['Date']).copy()


In [45]:
df['Review_Date'] = df['Date'].dt.date.copy()
df['Review_Time'] = df['Date'].dt.time.copy()

df.head()

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label,Churn_Keyword,Churn,Review_Date,Review_Time
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05 22:03:24+00:00,trying to buy broadband through uswitch then h...,-0.02,neutral,1,1,2025-06-05,22:03:24
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05 21:51:17+00:00,sona did a great job either my trade in and co...,0.7,positive,0,0,2025-06-05,21:51:17
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05 21:28:36+00:00,one of the worst if not the worst when it come...,-0.2,neutral,0,1,2025-06-05,21:28:36
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05 21:14:08+00:00,how those people get rate thats a joke my full...,0.275,positive,0,1,2025-06-05,21:14:08
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05 20:24:59+00:00,held to ransom by a reputable company purchase...,-0.003634,neutral,1,1,2025-06-05,20:24:59


In [46]:
df['Weekday'] = df['Date'].dt.day_name()


In [47]:
df['Month'] = df['Date'].dt.month_name()

df.head()

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label,Churn_Keyword,Churn,Review_Date,Review_Time,Weekday,Month
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05 22:03:24+00:00,trying to buy broadband through uswitch then h...,-0.02,neutral,1,1,2025-06-05,22:03:24,Thursday,June
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05 21:51:17+00:00,sona did a great job either my trade in and co...,0.7,positive,0,0,2025-06-05,21:51:17,Thursday,June
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05 21:28:36+00:00,one of the worst if not the worst when it come...,-0.2,neutral,0,1,2025-06-05,21:28:36,Thursday,June
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05 21:14:08+00:00,how those people get rate thats a joke my full...,0.275,positive,0,1,2025-06-05,21:14:08,Thursday,June
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05 20:24:59+00:00,held to ransom by a reputable company purchase...,-0.003634,neutral,1,1,2025-06-05,20:24:59,Thursday,June


In [48]:
# Create review length features
df['Review_Length'] = df['Clean_Review'].str.len()
df['Word_Count'] = df['Clean_Review'].str.split().str.len()

In [50]:
df.head()

Unnamed: 0,Rating,Review,Name,Location,Date,Clean_Review,Sentiment,Sentiment Label,Churn_Keyword,Churn,Review_Date,Review_Time,Weekday,Month,Review_Length,Word_Count
0,1,"Trying to buy broadband through Uswitch, then ...",Alfie Calas,GB,2025-06-05 22:03:24+00:00,trying to buy broadband through uswitch then h...,-0.02,neutral,1,1,2025-06-05,22:03:24,Thursday,June,280,48
1,5,Sona did a great job either my trade in and co...,Julliette,GB,2025-06-05 21:51:17+00:00,sona did a great job either my trade in and co...,0.7,positive,0,0,2025-06-05,21:51:17,Thursday,June,68,13
2,1,"One of the worst , if not the worst when it co...",Vlad Ureche,GB,2025-06-05 21:28:36+00:00,one of the worst if not the worst when it come...,-0.2,neutral,0,1,2025-06-05,21:28:36,Thursday,June,154,34
3,1,How those people get 4.6 rate that’s a joke my...,Adam Farbotko,GB,2025-06-05 21:14:08+00:00,how those people get rate thats a joke my full...,0.275,positive,0,1,2025-06-05,21:14:08,Thursday,June,240,44
4,1,Held to ransom by a ‘reputable’ company. Purch...,Chloe,GB,2025-06-05 20:24:59+00:00,held to ransom by a reputable company purchase...,-0.003634,neutral,1,1,2025-06-05,20:24:59,Thursday,June,1620,320


In [51]:
df.columns

Index(['Rating', 'Review', 'Name', 'Location', 'Date', 'Clean_Review',
       'Sentiment', 'Sentiment Label', 'Churn_Keyword', 'Churn', 'Review_Date',
       'Review_Time', 'Weekday', 'Month', 'Review_Length', 'Word_Count'],
      dtype='object')