# CUSTOMER SENTIMENT ANALYSIS & PREDICTION

## Using Supervised Machine Learning (Natural Processing Language)

In [1]:
#!pip install contractions

In [2]:
import pandas as pd
import random
import nltk
import contractions
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
import re
import nltk
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Loading and viewing the first 5 rows of the data
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Checking for NaN values
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# Checking for duplicates
df.duplicated().sum()

418

In [6]:
# dropping the duplicated rows
df = df.drop_duplicates()

In [7]:
# Rechecking for duplicates
df.duplicated().sum()

0

In [8]:
# Viewing the number of classes in the dataset
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [9]:
# Checking the class balance
df.sentiment.value_counts(normalize=True)*100

sentiment
positive    50.187568
negative    49.812432
Name: proportion, dtype: float64

## Data Cleaning

In [11]:
# Creating the word count column
df['word_counts'] = df['review'].apply(lambda x:len(x.split()))
df.head()

Unnamed: 0,review,sentiment,word_counts
0,One of the other reviewers has mentioned that ...,positive,307
1,A wonderful little production. <br /><br />The...,positive,162
2,I thought this was a wonderful way to spend ti...,positive,166
3,Basically there's a family where a little boy ...,negative,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230


In [12]:
# Converting the sentiment column into numeric 
sentiment_mapping = {
    'positive':0,
    'negative':1
}

df['sentiment'] = df['sentiment'].replace(sentiment_mapping)
df.head()

  df['sentiment'] = df['sentiment'].replace(sentiment_mapping)


Unnamed: 0,review,sentiment,word_counts
0,One of the other reviewers has mentioned that ...,0,307
1,A wonderful little production. <br /><br />The...,0,162
2,I thought this was a wonderful way to spend ti...,0,166
3,Basically there's a family where a little boy ...,1,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",0,230


In [13]:
# View the random 5 reviews
random_reviews = df.sample(5)

for _,row in random_reviews.iterrows():
    sentiment = row['sentiment']
    print(f'Sentiment: {sentiment}', '(Positive)' if sentiment == 0 else '(Negative)')
    print(f'Review:\n{row['review']}\n')
    print('-------\n')

Sentiment: 1 (Negative)
Review:
Circus could have been so much better if they had reduced the number of twists and developed each better the film features a very gifted cast that mostly perform well , however it totally loses the audience basically everyone is back stabbing everyone else and not back stabbing them at the same time because they are backstabbing someone else and working with .... did i lose you? well the film is even more confusing clearly written by a first timer writer , it has some redeeming qualities though in the acting especially Famke Janssen shines as Lily but sadly gets a lot of USELESS screen time just standing there doing nothing.....the dialogue is a bit cheesy and the accents sometimes irritating but its still worth watching if you're a fan of any of the actors especially John and Famke who get the most screen time just remember to watch it with a notebook so you can write down who is double crossing who every second....

-------

Sentiment: 0 (Positive)
Rev

In [14]:
# defining a function that handle's contractions
def expand_contractions(text):
    return contractions.fix(text)

In [15]:
df['exanded_review'] = df['review'].apply(expand_contractions)
df[['review','exanded_review']]

Unnamed: 0,review,exanded_review
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,Basically there is a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...","Petter Mattei's ""Love in the Time of Money"" is..."
...,...,...
49995,I thought this movie did a down right good job...,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...","Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,I am going to have to disagree with the previo...


In [16]:
# Creating the word count column
df['expanded_word_counts_'] = df['exanded_review'].apply(lambda x:len(x.split()))
df.head()

Unnamed: 0,review,sentiment,word_counts,exanded_review,expanded_word_counts_
0,One of the other reviewers has mentioned that ...,0,307,One of the other reviewers has mentioned that ...,314
1,A wonderful little production. <br /><br />The...,0,162,A wonderful little production. <br /><br />The...,162
2,I thought this was a wonderful way to spend ti...,0,166,I thought this was a wonderful way to spend ti...,168
3,Basically there's a family where a little boy ...,1,138,Basically there is a family where a little boy...,141
4,"Petter Mattei's ""Love in the Time of Money"" is...",0,230,"Petter Mattei's ""Love in the Time of Money"" is...",230


In [17]:
def clean_text(text):
    def hmtl_tags_removal(text):
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)

    txt = text.lower()
    txt = hmtl_tags_removal(txt)
    tokens = word_tokenize(txt)
    stemm = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    stemmed_tokens = [stemm.stem(token) for token in tokens if token not in stop_words]

    clean_text = " ".join(stemmed_tokens)
    return clean_text

In [18]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
df['clean_text'] = df['exanded_review'].apply(clean_text)
df.head()

Unnamed: 0,review,sentiment,word_counts,exanded_review,expanded_word_counts_,clean_text
0,One of the other reviewers has mentioned that ...,0,307,One of the other reviewers has mentioned that ...,314,one review mention watch 1 oz episod hook . ri...
1,A wonderful little production. <br /><br />The...,0,162,A wonderful little production. <br /><br />The...,162,wonder littl product . film techniqu unassumin...
2,I thought this was a wonderful way to spend ti...,0,166,I thought this was a wonderful way to spend ti...,168,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,1,138,Basically there is a family where a little boy...,141,basic famili littl boy ( jake ) think zombi cl...
4,"Petter Mattei's ""Love in the Time of Money"" is...",0,230,"Petter Mattei's ""Love in the Time of Money"" is...",230,petter mattei 's `` love time money '' visual ...


In [20]:
df.clean_text[1], df['review'][1]

("wonder littl product . film techniqu unassuming- old-time-bbc fashion give comfort , sometim discomfort , sens realism entir piec . actor extrem well chosen- michael sheen `` got polari '' voic pat ! truli see seamless edit guid refer william ' diari entri , well worth watch terrificli written perform piec . master product one great master 's comedi life . realism realli come home littl thing : fantasi guard , rather use tradit 'dream ' techniqu remain solid disappear . play knowledg sens , particularli scene concern orton halliwel set ( particularli flat halliwel 's mural decor everi surfac ) terribl well done .",
 'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless ed

In [21]:
df.to_csv('IMBD Cleand.csv', index= False)

In [22]:
df1 = pd.read_csv('IMBD Cleand.csv')
df1.head()

Unnamed: 0,review,sentiment,word_counts,exanded_review,expanded_word_counts_,clean_text
0,One of the other reviewers has mentioned that ...,0,307,One of the other reviewers has mentioned that ...,314,one review mention watch 1 oz episod hook . ri...
1,A wonderful little production. <br /><br />The...,0,162,A wonderful little production. <br /><br />The...,162,wonder littl product . film techniqu unassumin...
2,I thought this was a wonderful way to spend ti...,0,166,I thought this was a wonderful way to spend ti...,168,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,1,138,Basically there is a family where a little boy...,141,basic famili littl boy ( jake ) think zombi cl...
4,"Petter Mattei's ""Love in the Time of Money"" is...",0,230,"Petter Mattei's ""Love in the Time of Money"" is...",230,petter mattei 's `` love time money '' visual ...


In [23]:
# creating a new dataframe with clean text and sentiment
df1 = df1[['clean_text', 'sentiment']]
df1.head()

Unnamed: 0,clean_text,sentiment
0,one review mention watch 1 oz episod hook . ri...,0
1,wonder littl product . film techniqu unassumin...,0
2,thought wonder way spend time hot summer weeke...,0
3,basic famili littl boy ( jake ) think zombi cl...,1
4,petter mattei 's `` love time money '' visual ...,0


In [24]:
df1['Clean_word_counts'] =  df1['clean_text'].apply(lambda x: len(x.split()))
df1.head()

Unnamed: 0,clean_text,sentiment,Clean_word_counts
0,one review mention watch 1 oz episod hook . ri...,0,206
1,wonder littl product . film techniqu unassumin...,0,105
2,thought wonder way spend time hot summer weeke...,0,108
3,basic famili littl boy ( jake ) think zombi cl...,1,80
4,petter mattei 's `` love time money '' visual ...,0,151


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X = df1['clean_text']
y = df1['sentiment']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=45)

## 
TF-IDF in NLP
--
TF-IDF stands for Term Frequency–Inverse Document Frequency. It is a popular statistical measure used in Natural Language Processing (NLP) and Information Retrieval to evaluate how important a word is to a document in a collection (or corpus). 
TF checks how many times a term appears.

🔹 Components of TF-IDF
Term Frequency (TF):
Measures how often a term appears in a document.

TF(𝑡,𝑑)=Number of times term 𝑡 appears in document 𝑑 /
Total number of terms in document 𝑑
--
TF(t,d)= 
Total number of terms in document d
Number of times term t appears in document d
🔹 Why Use TF-IDF?

Filters out common words (e.g., "the", "is", "and") that appear in almost all documents.
Highlights meaningful and distinctive terms within a document.

Helps in:
--
Document classification,
Search engines (ranking documents),
Text similarity and clustering,
Keyword extraction

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [30]:
import datetime

In [31]:
#Training the baseline Model
start_time = datetime.datetime.now()
baseline_model = Pipeline([
                            ("tfid", TfidfVectorizer()),
                            ("clf", MultinomialNB())
])
baseline_model.fit(X_train, y_train)
stop_time = datetime.datetime.now()
nb_training_time = stop_time - start_time
print(f"The Naive Bayes  Model took {nb_training_time} seconds to train")

The Naive Bayes  Model took 0:00:08.147024 seconds to train


# Creating an Evaluation function

In [33]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def evaluator(y_test, y_pred):
    """
    Calculate the model accuracy score, precision score, recalll score and f1 score
    Args:
    ------
    y_tes = The actual labels in form of a 1D array
    y_pred = The predicted labels in form of a 1D array
    
    Returns a dictionary of accuracy score, precision score, recalll score and f1 score. 
    """
    # Calaculating the accurcy score
    model_accuracy = accuracy_score(y_test, y_pred) * 100
    # Calaculating precision score, recall score and f1 score
    model_precision, model_recall, model_f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

    model_results = {
        "accuracy": model_accuracy,
        "precision": model_precision * 100,
        "recall": model_recall * 100,
        "f1": model_f1_score * 100
    }
    return model_results
    

In [34]:
y_pred = baseline_model.predict(X_test)
evaluator(y_test, y_pred)

{'accuracy': 86.03025210084033,
 'precision': 86.03268593545793,
 'recall': 86.03025210084033,
 'f1': 86.0298763027932}

In [35]:
review = ["The movie looks awesome", "The main character was annoying"]
prediction = baseline_model.predict(review)
if prediction [0] == 0:
    print(f"{review}\n is a positive review")
else:
        print(f"{review}\n is a nagative review")

['The movie looks awesome', 'The main character was annoying']
 is a positive review


In [36]:
review = ["The customer service rep was incredibly helpful… in making me never want to call again.", "The movie looks awesome", 
          "The main character was annoying"]
prediction = baseline_model.predict(review)
for i in range (len(prediction + 1)):
    if prediction [i] == 0:
        print(f"{review[i]}\n is a positive review")
    else:
        print(f"{review[i]}\n is a nagative review")

The customer service rep was incredibly helpful… in making me never want to call again.
 is a nagative review
The movie looks awesome
 is a positive review
The main character was annoying
 is a nagative review


In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
#Training the baseline Model using Countvectorizer
start_time = datetime.datetime.now()
first_model = Pipeline([
                            ("count", CountVectorizer()),
                            ("clf_c", MultinomialNB())
])
first_model.fit(X_train, y_train)
stop_time = datetime.datetime.now()
fm_training_time = stop_time - start_time
print(f"The Naive Bayes  Model took {fm_training_time} seconds to train")

The Naive Bayes  Model took 0:00:07.869640 seconds to train


In [39]:
# Evaluating the model
y_pred = first_model.predict(X_test)
evaluator(y_test, y_pred)

{'accuracy': 85.25042016806724,
 'precision': 85.28684780543416,
 'recall': 85.25042016806724,
 'f1': 85.24604701650715}

In [40]:
# Training the model using randomforest algorithm
from sklearn.ensemble import RandomForestClassifier
# training the model using randomforestclassifier with TFIDFVectorizer
start_time = datetime.datetime.now()
randomforest_model = Pipeline([
                            ('tfid', TfidfVectorizer()),
                            ("rdf", RandomForestClassifier())
])
randomforest_model.fit(X_train, y_train)
stop_time = datetime.datetime.now()
rdf_training_time = stop_time - start_time
print(f' The Naive Bayes model took {rdf_training_time} seconds to train')

 The Naive Bayes model took 0:05:22.947186 seconds to train


In [41]:
# Evaluating the model
y_pred = randomforest_model.predict(X_test)
evaluator(y_test, y_pred)

{'accuracy': 84.75966386554622,
 'precision': 84.81264046786464,
 'recall': 84.75966386554622,
 'f1': 84.75456218959737}

In [42]:
# training the model using randomforestclassifier with TFIDFVectorizer
start_time = datetime.datetime.now()
randomforest_model2 = Pipeline([
                            ("count", CountVectorizer()),
                            ("rdf", RandomForestClassifier())
])
randomforest_model2.fit(X_train, y_train)
stop_time = datetime.datetime.now()
rdf_training_time = stop_time - start_time
print(f' The Naive Bayes model took {rdf_training_time} seconds to train')

 The Naive Bayes model took 0:06:05.328241 seconds to train


In [43]:
# Evaluating the model
y_pred = randomforest_model2.predict(X_test)
evaluator(y_test, y_pred)

{'accuracy': 84.83361344537815,
 'precision': 84.89080252895185,
 'recall': 84.83361344537815,
 'f1': 84.82811703985021}