In [1]:
#Libraries Required

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Import functions for data preprocessing & data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from string import punctuation
import nltk
import re

In [6]:
# Reading data
# For the second dataset: Nutshell: Black Money

data = pd.read_csv('E:/SEM 7/H SET PROJECT/comments CSVs/scrappedfile_nutshell.csv')
data.columns

Index(['Unnamed: 0', 'Comment'], dtype='object')

In [7]:
data1=data.drop(['Unnamed: 0'],axis=1)
data1

Unnamed: 0,Comment
0,What other finance-related videos should we ma...
1,Great way of making new generation understand ...
2,This is something amazing I've saw on YouTube ...
3,Randomly landed on this channel while watching...
4,I am still counfused why this 2 k people will ...
...,...
86,Hi 0
87,Ll
88,Can you make a video about how many people pay...
89,P


In [8]:
# Data Labelling

sentiments = SentimentIntensityAnalyzer()
data1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data1["Comment"]]
data1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data1["Comment"]]
data1["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data1["Comment"]]
data1['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in data1["Comment"]]
score = data1["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
data1["Sentiment"] = sentiment
data1.head()

Unnamed: 0,Comment,Positive,Negative,Neutral,Compound,Sentiment
0,What other finance-related videos should we ma...,0.17,0.0,0.83,0.4329,Positive
1,Great way of making new generation understand ...,0.397,0.0,0.603,0.836,Positive
2,This is something amazing I've saw on YouTube ...,0.412,0.0,0.588,0.8271,Positive
3,Randomly landed on this channel while watching...,0.25,0.0,0.75,0.7178,Positive
4,I am still counfused why this 2 k people will ...,0.315,0.168,0.517,0.5065,Positive


In [9]:
# Final Data

data2=data1.drop(['Positive','Negative','Neutral','Compound'],axis=1)
data2.head()

Unnamed: 0,Comment,Sentiment
0,What other finance-related videos should we ma...,Positive
1,Great way of making new generation understand ...,Positive
2,This is something amazing I've saw on YouTube ...,Positive
3,Randomly landed on this channel while watching...,Positive
4,I am still counfused why this 2 k people will ...,Positive


In [10]:
# Data Transformation

stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer() 
snowball_stemer = SnowballStemmer(language="english")
lzr = WordNetLemmatizer()

In [11]:
def text_processing(text):   
    # convert text into lowercase
    text = text.lower()

    # remove new line characters in text
    text = re.sub(r'\n',' ', text)
    
    # remove punctuations from text
    text = re.sub('[%s]' % re.escape(punctuation), "", text)
    
    # remove references and hashtags from text
    text = re.sub("^a-zA-Z0-9$,.", "", text)
    
    # remove multiple spaces from text
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # remove special characters from text
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    
    # stemming using porter stemmer from nltk package - msh a7sn 7aga - momken: lancaster, snowball
    # text=' '.join([porter_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([lancaster_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([snowball_stemer.stem(word) for word in word_tokenize(text)])
    
    # lemmatizer using WordNetLemmatizer from nltk package
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text

In [12]:
data_copy = data2.copy()
data_copy.Comment = data_copy.Comment.apply(lambda text: text_processing(text))

In [13]:
le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])

In [14]:
processed_data = {
    'Sentence':data_copy.Comment,
    'Sentiment':data_copy['Sentiment']
}

processed_data = pd.DataFrame(processed_data)
processed_data.head()

Unnamed: 0,Sentence,Sentiment
0,financerelated video make whatd like u break next,2
1,great way making new generation understand his...,2
2,something amazing ive saw youtube day loved,2
3,randomly landed channel watching another video...,2
4,still counfused 2 k people dislike masterpiece,2


In [15]:
processed_data['Sentiment'].value_counts()

2    45
1    35
0    11
Name: Sentiment, dtype: int64

In [17]:
df_neutral = processed_data[(processed_data['Sentiment']==1)] 
df_negative = processed_data[(processed_data['Sentiment']==0)]
df_positive = processed_data[(processed_data['Sentiment']==2)]

# upsample minority classes
df_negative_upsampled = resample(df_negative, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  

df_neutral_upsampled = resample(df_neutral, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  


# Concatenate the upsampled dataframes with the neutral dataframe
final_data = pd.concat([df_negative_upsampled,df_neutral_upsampled,df_positive])

In [18]:
final_data['Sentiment'].value_counts()

0    205
1    205
2     45
Name: Sentiment, dtype: int64

In [19]:
corpus = []
for sentence in final_data['Sentence']:
    corpus.append(sentence)
corpus[0:5]

['hard economy hit demonetization introduction gst covid happened within period 354 year way recovery',
 'hell video disliked much',
 'make video many people pay tax india much total transaction black popular case people got arrested',
 'low tax control',
 'indian govt trying hard']

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data.iloc[:, -1].values

In [21]:
# Machine Learning Model

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[65,  0,  0],
       [ 0, 59,  0],
       [ 2,  1, 10]], dtype=int64)

In [23]:
nb_score = accuracy_score(y_test, y_pred)
print('accuracy',nb_score)

accuracy 0.9781021897810219
