In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns 
import contractions
import tensorflow as tf

import spacy
import textblob
from textblob import TextBlob
import nltk
from nltk.corpus import words
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.util import ngrams
from wordcloud import WordCloud

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split

nltk.download('wordnet')
nltk.download('omw-1.4')
import swifter
import re
pd.options.display.max_rows = 100000

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BIBHU\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\BIBHU\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [43]:
def lemmatization(text):
    """Lemmatize every whitespace-separated token of `text`.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are re-joined with single spaces (so any run
    of whitespace in the input collapses to one space).
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

def contract_fix(text):
    """Run each whitespace-separated token of `text` through ``contractions.fix``.

    The processed tokens are re-joined with single spaces.
    """
    expanded_tokens = []
    for token in text.split():
        expanded_tokens.append(contractions.fix(token))
    return " ".join(expanded_tokens)

def word_freq_func(corpus):
    """Count how often each whitespace-separated token occurs in `corpus`.

    Parameters
    ----------
    corpus : str
        Text that is tokenized with ``str.split()``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token with columns ``words`` (the token) and
        ``values`` (its count), sorted by ``values`` in descending order.
        Index labels are the tokens' first-appearance positions, so they are
        not consecutive after sorting. An empty corpus yields an empty frame
        that still has both columns.
    """
    # Function-local import so this cell does not depend on the import cell.
    from collections import Counter

    counts = Counter(corpus.split())
    word_freq_df = pd.DataFrame({
        'words': list(counts.keys()),
        'values': list(counts.values()),
    })
    return word_freq_df.sort_values(by='values', ascending=False)

def noise_removal(text):
    """Remove from `text` every token found in the module-level `stop_words`.

    `stop_words` is read at call time, so it must be defined before this
    function is applied. Surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Show `wordcloud` as a borderless image on a large figure.

    Parameters
    ----------
    wordcloud
        Anything ``plt.imshow`` can render (presumably a generated
        ``WordCloud`` object -- confirm against the caller).

    The figure is displayed with ``plt.show()`` and then closed; nothing is
    returned.
    """
    fig = plt.figure(figsize=(25, 17), dpi=80)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.box(False)
    # Tighten the layout only once the image axes exist; on an empty figure
    # there is nothing for tight_layout to adjust.
    plt.tight_layout(pad=0)
    plt.show()
    # Close exactly the figure created above rather than "the current one".
    plt.close(fig)
    
def predict_sentiment_textblob(text):
    """Return the polarity component of TextBlob's sentiment for `text`."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

def getAnalysis(score, threshold=0.0):
    """Convert a polarity score into a sentiment label.

    Parameters
    ----------
    score : float
        Polarity score to classify.
    threshold : float, optional
        Half-width of the neutral band. Scores inside
        ``[-threshold, threshold]`` are labelled neutral. The default of
        ``0.0`` labels only an exact zero as neutral.

    Returns
    -------
    int
        ``-1`` (negative) when ``score < -threshold``, ``0`` (neutral) when
        the score lies inside the band, ``1`` (positive) otherwise.

    Raises
    ------
    ValueError
        If `threshold` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score < -threshold:
        return -1
    if -threshold <= score <= threshold:
        return 0
    # Everything else falls through to positive; that includes NaN, which
    # fails both comparisons above.
    return 1
    
# Analyzer shared by every call to predict_sentiment_vader; created on first use.
_VADER_ANALYZER = None


def predict_sentiment_vader(text, analyzer=None):
    """Return VADER's polarity scores for `text`.

    Parameters
    ----------
    text : str
        Text to score.
    analyzer : optional
        Object exposing ``polarity_scores(text)``. When omitted, one shared
        ``SentimentIntensityAnalyzer`` is created on the first call and
        reused afterwards, so applying this function row by row does not
        rebuild the analyzer (and reload its lexicon) for every row.

    Returns
    -------
    Whatever ``analyzer.polarity_scores(text)`` returns.
    """
    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER
    return analyzer.polarity_scores(text)

In [3]:
# Load the reviews once. `df_original` stays untouched; every later step
# works on the independent copy `df`.
df_original = pd.read_csv('reviews.csv')
df = df_original.copy()  # DataFrame.copy() is deep by default
df.head()

In [6]:
# (number of rows, number of columns) of the working frame
df.shape

(61594, 5)

In [7]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
count,61594,61594,61594.0,61594.0,216
unique,61300,61356,,,180
top,2022-03-08 18:49:19,Too many ads,,,"Hey, sorry to hear that. If you haven't tried ..."
freq,4,36,,,7
mean,,,3.155989,7.649381,
std,,,1.673285,89.323143,
min,,,1.0,0.0,
25%,,,1.0,0.0,
50%,,,3.0,0.0,
75%,,,5.0,1.0,


In [8]:
# Number of missing values in each column
df.isna().sum()

Time_submitted        0
Review                0
Rating                0
Total_thumbsup        0
Reply             61378
dtype: int64

In [9]:
# Reply is missing for almost every row and Time_submitted is not used in the
# steps below, so both are removed. errors='ignore' makes the cell safe to
# re-run after the columns are already gone.
df = df.drop(columns=['Reply', 'Time_submitted'], errors='ignore')

### Data Pre-Processing

##### Converting app ratings to sentiment labels: 1-2 = Negative (-1), 3 = Neutral (0), 4-5 = Positive (1)

In [10]:
# Collapse the star rating into a sentiment label:
#   1 or 2 -> -1, 4 or 5 -> 1, anything else -> 0.
# The rating columns are dropped afterwards, so this cell is not re-runnable
# without reloading `df`.
df['Sentiment'] = df['Rating'].apply(lambda rating: {1: -1, 2: -1, 4: 1, 5: 1}.get(rating, 0))
df = df.drop(columns=['Rating', 'Total_thumbsup'])

#### Pre-processing the reviews to make them model-ready

In [11]:
# Pipeline: expand contractions -> replace every non-letter with a space ->
# lowercase -> lemmatize.
# Lowercasing must come before lemmatization: the WordNet lookup is
# case-sensitive, so a capitalized token would otherwise pass through
# unlemmatized and only then be lowercased.
df['Review_Contraction_Fixed'] = df['Review'].swifter.apply(contract_fix)
df['Review_Cleaned'] = df['Review_Contraction_Fixed'].swifter.apply(lambda x : re.sub('[^a-zA-Z]',' ',x))
df['Review_Cleaned_lemma'] = df['Review_Cleaned'].str.lower().swifter.apply(lemmatization)

Pandas Apply:   0%|          | 0/61594 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/61594 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/61594 [00:00<?, ?it/s]

#### Creating the vocabulary

In [12]:
# Join all cleaned reviews into one space-separated string.
corpus = " ".join(df['Review_Cleaned_lemma'])
# Distinct tokens, sorted so the vocabulary order is identical on every run
# (iteration order of a set of strings can differ between Python processes).
vocab = sorted(set(corpus.split()))

#### Creating Word Frequency Dataframe

In [15]:
# Token/count table for the whole corpus, most frequent tokens first
word_freq_df = word_freq_func(corpus)

#### Creating custom stop words: every token of length 1 or 2, except 'no', which is kept

In [16]:
# Every token shorter than three characters is treated as noise.
stop_words = word_freq_df[word_freq_df['words'].str.len() < 3].words.to_list()
# Keep 'no' in the text. The membership guard avoids a ValueError when the
# corpus happens not to contain 'no'.
if 'no' in stop_words:
    stop_words.remove('no')

In [17]:
# Strip the custom stop words from every lemmatized review
df['Review_Cleaned_lemma_noise_rm'] = df['Review_Cleaned_lemma'].swifter.apply(noise_removal)

Pandas Apply:   0%|          | 0/61594 [00:00<?, ?it/s]

#### Defining X & Y 

In [20]:
# x: fully pre-processed review text; y: the -1/0/1 label derived from the rating
x = df['Review_Cleaned_lemma_noise_rm']
y = df['Sentiment']

#### Using TextBlob 

In [31]:
# Score each pre-processed review with TextBlob's polarity ...
df['Sentiment_Textblob']= df['Review_Cleaned_lemma_noise_rm'].swifter.apply(predict_sentiment_textblob)
# ... then turn the polarity into a -1/0/1 label via getAnalysis
df['Sentiment_Textblob_Label']= df['Sentiment_Textblob'].swifter.apply(getAnalysis)

Pandas Apply:   0%|          | 0/61594 [00:00<?, ?it/s]

In [40]:
from sklearn.metrics import classification_report,auc,roc_auc_score,accuracy_score
# Display names, in the same order as the encoded labels passed below.
target_names = ['Negative', 'Neutral', 'Positive']
# Passing labels explicitly ties each name to its class (-1, 0, 1) instead of
# relying on the label set inferred from the data, so the report stays
# correctly labelled even if a class is absent from both columns.
print(classification_report(df['Sentiment'],df['Sentiment_Textblob_Label'],labels=[-1, 0, 1],target_names=target_names))

              precision    recall  f1-score   support

    Negative       0.74      0.42      0.54     24771
     Neutral       0.13      0.13      0.13      6886
    Positive       0.65      0.88      0.75     29937

    accuracy                           0.61     61594
   macro avg       0.51      0.48      0.47     61594
weighted avg       0.63      0.61      0.59     61594



###### Conclusion: The TextBlob-based sentiment predictor works well for the Positive class compared to the Negative class, but fails badly at classifying the Neutral class

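###### Results could likely be improved by changing the polarity threshold for the Neutral class (currently only a score of exactly 0 is labelled Neutral)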