In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Import functions for data preprocessing & data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from string import punctuation
import nltk
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Reading data
# For the first dataset: Sucharita: Darlings Review

data = pd.read_csv('E:/SEM 7/H SET PROJECT/comments CSVs/scrappedfile_sucharita.csv')
data.columns

Index(['Unnamed: 0', 'Comment'], dtype='object')

In [8]:
data1=data.drop(['Unnamed: 0'],axis=1)
data1

Unnamed: 0,Comment
0,You got 50k subscribers!!!!
1,Loved the movie. The scene where Alia is break...
2,Noone could have reviewed this better than you...
3,The ‘Women telling Women Stories’ Dance! It’s ...
4,This movie was way too good. Wished this came ...
...,...
114,Bollywood people consideing fhe kjo papa nepo ...
115,I don't like your reviews at all! I'm sorry!
116,Fake review u nonsense. It's yet to release on...
117,The film is misandrist potraying men as villai...


In [9]:
# Data Labelling

nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
data1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data1["Comment"]]
data1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data1["Comment"]]
data1["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data1["Comment"]]
data1['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in data1["Comment"]]
score = data1["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
data1["Sentiment"] = sentiment
data1.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Comment,Positive,Negative,Neutral,Compound,Sentiment
0,You got 50k subscribers!!!!,0.0,0.0,1.0,0.0,Neutral
1,Loved the movie. The scene where Alia is break...,0.236,0.098,0.665,0.7326,Positive
2,Noone could have reviewed this better than you...,0.277,0.0,0.723,0.8006,Positive
3,The ‘Women telling Women Stories’ Dance! It’s ...,0.0,0.0,1.0,0.0,Neutral
4,This movie was way too good. Wished this came ...,0.102,0.091,0.807,0.0772,Positive


In [10]:
# Final Data

data2=data1.drop(['Positive','Negative','Neutral','Compound'],axis=1)
data2.head()

Unnamed: 0,Comment,Sentiment
0,You got 50k subscribers!!!!,Neutral
1,Loved the movie. The scene where Alia is break...,Positive
2,Noone could have reviewed this better than you...,Positive
3,The ‘Women telling Women Stories’ Dance! It’s ...,Neutral
4,This movie was way too good. Wished this came ...,Positive


In [17]:
# Data Transformation

stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer() 
snowball_stemer = SnowballStemmer(language="english")
lzr = WordNetLemmatizer()

In [18]:
def text_processing(text):   
    # convert text into lowercase
    text = text.lower()

    # remove new line characters in text
    text = re.sub(r'\n',' ', text)
    
    # remove punctuations from text
    text = re.sub('[%s]' % re.escape(punctuation), "", text)
    
    # remove references and hashtags from text
    text = re.sub("^a-zA-Z0-9$,.", "", text)
    
    # remove multiple spaces from text
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # remove special characters from text
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    
    # stemming using porter stemmer from nltk package - msh a7sn 7aga - momken: lancaster, snowball
    # text=' '.join([porter_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([lancaster_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([snowball_stemer.stem(word) for word in word_tokenize(text)])
    
    # lemmatizer using WordNetLemmatizer from nltk package
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text

In [21]:
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')
data_copy = data2.copy()
data_copy.Comment = data_copy.Comment.apply(lambda text: text_processing(text))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...


In [22]:
le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])

In [23]:
processed_data = {
    'Sentence':data_copy.Comment,
    'Sentiment':data_copy['Sentiment']
}

processed_data = pd.DataFrame(processed_data)
processed_data.head()

Unnamed: 0,Sentence,Sentiment
0,got 50k subscriber,1
1,loved movie scene alia breaking plate anger ge...,2
2,noone could reviewed better sucharita thank he...,2
3,woman telling woman story dance finally,1
4,movie way good wished came cinema see uncomfor...,2


In [24]:
processed_data['Sentiment'].value_counts()

2    63
1    36
0    20
Name: Sentiment, dtype: int64

In [25]:
df_neutral = processed_data[(processed_data['Sentiment']==1)] 
df_negative = processed_data[(processed_data['Sentiment']==0)]
df_positive = processed_data[(processed_data['Sentiment']==2)]

# upsample minority classes
df_negative_upsampled = resample(df_negative, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  

df_neutral_upsampled = resample(df_neutral, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  


# Concatenate the upsampled dataframes with the neutral dataframe
final_data = pd.concat([df_negative_upsampled,df_neutral_upsampled,df_positive])

In [26]:
final_data['Sentiment'].value_counts()

0    205
1    205
2     63
Name: Sentiment, dtype: int64

In [27]:
corpus = []
for sentence in final_data['Sentence']:
    corpus.append(sentence)
corpus[0:5]

['dont get revenge abuse story domestic violence ka badla domestic violence thar also come fold really boring film thar haider film like dont celebrate revenge domestic violence celebrated told root domestic violence nonsense',
 'film misandrist potraying men villain narrative new narrative taken film world day',
 'unsubscribedbecause constant irritating rant 50k',
 'aint no1 calling u darling matesjokingss',
 'second half hopeless like someone kindergarten wrote']

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data.iloc[:, -1].values

In [29]:
#Machine Learning Model

In [30]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[64,  0,  0],
       [ 0, 57,  0],
       [ 5,  1, 15]], dtype=int64)

In [32]:
nb_score = accuracy_score(y_test, y_pred)
print('accuracy',nb_score)

accuracy 0.9577464788732394
