In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import corpus,tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer,WordNetLemmatizer,porter
from wordcloud import WordCloud,STOPWORDS
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

In [2]:
df = pd.read_csv('hotstar_reviews.csv')
df.head()

Unnamed: 0,ID,UserName,Created_Date,Reviews,Lower_Case_Reviews,Sentiment_Manual_BP,Sentiment_Manual,Review_Length,DataSource,Year,Month,Date,Sentiment_Polarity
0,1,,08-10-2017,Hh,hh,Negative,Negative,2,Google_PlayStore,2017,8,10,Neutral
1,2,,08-11-2017,No,no,Negative,Negative,2,Google_PlayStore,2017,8,11,Neutral
2,3,asadynwa,08-12-2017,@hotstar_helps during paymnt for premium subsc...,@hotstar_helps during paymnt for premium subsc...,Help,Negative,140,Twitter,2017,8,12,Negative
3,4,jineshroxx,08-11-2017,@hotstartweets I am currently on Jio network a...,@hotstartweets i am currently on jio network a...,Help,Negative,140,Twitter,2017,8,11,Positive
4,5,YaminiSachar,08-05-2017,@hotstartweets the episodes of Sarabhai vs Sar...,@hotstartweets the episodes of sarabhai vs sar...,Help,Negative,140,Twitter,2017,8,5,Neutral


In [3]:
df.shape


(5053, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5053 entries, 0 to 5052
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   5053 non-null   int64 
 1   UserName             4331 non-null   object
 2   Created_Date         5053 non-null   object
 3   Reviews              5053 non-null   object
 4   Lower_Case_Reviews   5053 non-null   object
 5   Sentiment_Manual_BP  5053 non-null   object
 6   Sentiment_Manual     5053 non-null   object
 7   Review_Length        5053 non-null   int64 
 8   DataSource           5053 non-null   object
 9   Year                 5053 non-null   int64 
 10  Month                5053 non-null   int64 
 11  Date                 5053 non-null   int64 
 12  Sentiment_Polarity   5053 non-null   object
dtypes: int64(5), object(8)
memory usage: 513.3+ KB


In [5]:
df['Sentiment_Manual'].value_counts()

Neutral     1738
Positive    1733
Negative    1582
Name: Sentiment_Manual, dtype: int64

In [6]:
# percentage of sentiments moods - 
df['Sentiment_Manual'].value_counts() / df['Sentiment_Manual'].size

Neutral     0.343954
Positive    0.342965
Negative    0.313081
Name: Sentiment_Manual, dtype: float64

In [7]:
df['Sentiment_Manual'].value_counts() /len(df['Sentiment_Manual']) *100

Neutral     34.395409
Positive    34.296458
Negative    31.308134
Name: Sentiment_Manual, dtype: float64

In [8]:
# datasource distribution - 
df['DataSource'].value_counts() / df.DataSource.size

Twitter             0.559272
Google_PlayStore    0.440728
Name: DataSource, dtype: float64

In [9]:
pd.pivot_table(df,index = 'Sentiment_Manual',columns = 'DataSource',values = 'ID',
              aggfunc = 'count')/df['DataSource'].size

DataSource,Google_PlayStore,Twitter
Sentiment_Manual,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,0.12923,0.183851
Neutral,0.051257,0.292697
Positive,0.260241,0.082723


In [11]:
table = pd.crosstab(df['DataSource'], df['Sentiment_Manual'])
table['Total'] = table['Negative'] + table['Neutral'] + table['Positive']
# table['Pecentage'] = table['Yes']/table['Total']
table

Sentiment_Manual,Negative,Neutral,Positive,Total
DataSource,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Google_PlayStore,653,259,1315,2227
Twitter,929,1479,418,2826


In [None]:
# Data Cleansing

In [12]:
df = df[['Lower_Case_Reviews','DataSource','Sentiment_Manual']]
df.head()

Unnamed: 0,Lower_Case_Reviews,DataSource,Sentiment_Manual
0,hh,Google_PlayStore,Negative
1,no,Google_PlayStore,Negative
2,@hotstar_helps during paymnt for premium subsc...,Twitter,Negative
3,@hotstartweets i am currently on jio network a...,Twitter,Negative
4,@hotstartweets the episodes of sarabhai vs sar...,Twitter,Negative


In [13]:
# renaming colus-
df.columns = ['Reviews','Source','Sentiment']
df.head()

Unnamed: 0,Reviews,Source,Sentiment
0,hh,Google_PlayStore,Negative
1,no,Google_PlayStore,Negative
2,@hotstar_helps during paymnt for premium subsc...,Twitter,Negative
3,@hotstartweets i am currently on jio network a...,Twitter,Negative
4,@hotstartweets the episodes of sarabhai vs sar...,Twitter,Negative


In [14]:
# df = df.rename(columns = {'Reviews':'TEST'})
# df.head()

In [15]:
df = pd.get_dummies(df,columns=['Source'],drop_first=True)
df.head()

Unnamed: 0,Reviews,Sentiment,Source_Twitter
0,hh,Negative,0
1,no,Negative,0
2,@hotstar_helps during paymnt for premium subsc...,Negative,1
3,@hotstartweets i am currently on jio network a...,Negative,1
4,@hotstartweets the episodes of sarabhai vs sar...,Negative,1


In [16]:
stopw = stopwords.words('english')
stopw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
re_pattern = """@[a-zA-Z0-9_:]+|b['"]rt|[\d]+[a-zA-Z_+='?]+[\d]+|[a-zA-Z_*+-=]+[\d]+[a-zA-Z_*+-=]+[\d]+"""

In [18]:
re_pattern = re_pattern + """|https:+[a-zA-Z0-9/._+-=]+[&amp;]|rt"""

In [21]:
reviewText = [re.sub(pattern=re_pattern,string=text,repl="")for text in df.Reviews.map(str).values]

In [22]:
reviewText[9]

' i have a premium accnt at hotstar but now it is showing tht i m not a premium member can u pls chk  com'

In [23]:
punctuation = list(string.punctuation)

In [27]:
data_cleansed = []

for review in reviewText:
    stop_free = " ".join([txt for txt in review.lower().split()
                         if txt not in stopw]) # NLTK Stopwords
    stop_free1 = " ".join([txt for txt in stop_free.lower().split()
                          if txt not in stopw]) # WordCloud Stopwords
    puct_free = " ".join([txt for txt in stop_free1.lower().split()
                         if txt not in punctuation])
    data_cleansed.append(puct_free)

In [29]:
data_cleansed[7]

'today epi #lovekahaiintezaar nt available available morning showing nt available due expiry'

In [31]:
# remove hashtags
data_cleansed_final = []
for rvw in data_cleansed:
    final_word = rvw.replace('#', '')
    data_cleansed_final.append(final_word)

In [33]:
data_cleansed_final[7]

'today epi lovekahaiintezaar nt available available morning showing nt available due expiry'

In [34]:
## remove unnecessary words and lemmatize -


wd = WordNetLemmatizer()
data_cleansed = []

for rvw in data_cleansed_final:
    clean_review = " ".join([txt for txt in rvw.split() if txt not in
                            ['hi', 'hello','u','i','hey','sir','maam',
                            'madam']])
    clean_review_lm = " ".join(wd.lemmatize(word) for word in 
                              clean_review.split())
    data_cleansed.append(clean_review_lm)

In [35]:
df['Clean_review'] = data_cleansed
df.head()

Unnamed: 0,Reviews,Sentiment,Source_Twitter,Clean_review
0,hh,Negative,0,hh
1,no,Negative,0,
2,@hotstar_helps during paymnt for premium subsc...,Negative,1,paymnt premium subscription transaction failed...
3,@hotstartweets i am currently on jio network a...,Negative,1,currently jio network would like know whether ...
4,@hotstartweets the episodes of sarabhai vs sar...,Negative,1,episode sarabhai v sarabhai season 1 downloada...


In [None]:
# split data -

In [37]:
x_train, x_test, y_train, y_test = train_test_split(df['Clean_review'],
                                                  df['Sentiment'],
                                                   test_size=0.2,
                                                   random_state=101)

In [None]:
#Vectorize the text data using Count Vectorizer

In [38]:
vectorizer = CountVectorizer(min_df=5).fit(x_train)
x_train_vecotrized = vectorizer.transform(x_train)
x_test_vecotrized = vectorizer.transform(x_test)

In [None]:
# print(vectorizer.get_feature_names())

In [40]:
pd.DataFrame(x_train_vecotrized.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,902,903,904,905,906,907,908,909,910,911
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4039,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
x_test_vecotrized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# Naive Bayes Theorem

In [53]:
naive_bayes_mult = MultinomialNB().fit(x_train_vecotrized, y_train)

In [54]:
naive_bayes_mult.score(x_train_vecotrized, y_train),naive_bayes_mult.score(x_test_vecotrized, y_test)

(0.7874814448292924, 0.7299703264094956)

In [56]:
naive_bayes_mult.classes_

array(['Negative', 'Neutral', 'Positive'], dtype='<U8')

In [58]:
naive_bayes_mult.feature_log_prob_

array([[-9.02014792, -9.02014792, -9.02014792, ..., -6.94070638,
        -6.45519856, -9.02014792],
       [-6.77227914, -6.77227914, -5.04951254, ..., -7.68856987,
        -6.07913196, -7.28310476],
       [-8.64699263, -8.64699263, -8.64699263, ..., -8.64699263,
        -6.85523316, -8.64699263]])

In [47]:
predict_train = naive_bayes_mult.predict(x_train_vecotrized)
predict_test = naive_bayes_mult.predict(x_test_vecotrized)

In [48]:
def get_num(data):
    data_new = pd.DataFrame(data, columns=['Sentiment'])
    data_new['y_label'] = np.where(data_new['Sentiment']=='Neutral',0,
                                  np.where(data_new['Sentiment']=='Positive',1,2))
    return data_new

In [49]:
y_train_new = get_num(y_train)
y_test_new = get_num(y_test)
predict_train_new = get_num(predict_train)
predict_test_new = get_num(predict_test)

In [50]:
print(classification_report(y_train_new['y_label'],predict_train_new['y_label']))

              precision    recall  f1-score   support

           0       0.83      0.67      0.74      1388
           1       0.77      0.89      0.82      1395
           2       0.77      0.80      0.79      1259

    accuracy                           0.79      4042
   macro avg       0.79      0.79      0.79      4042
weighted avg       0.79      0.79      0.79      4042



In [51]:
print(classification_report(y_test_new['y_label'],predict_test_new['y_label']))

              precision    recall  f1-score   support

           0       0.78      0.61      0.69       350
           1       0.73      0.85      0.78       338
           2       0.69      0.73      0.71       323

    accuracy                           0.73      1011
   macro avg       0.73      0.73      0.73      1011
weighted avg       0.73      0.73      0.73      1011

