In [None]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/blackadam-trailer-comments/comments.csv


# Dataextraction

In [2]:
# from youtube_comment_scraper_python import *
# import pandas as pd

# link = input("Youtube links: ")
# saved = input("Output name: ")
# youtube.open(link)

# response = youtube.video_comments()
# all_data = []
# for i in range(0, 20): # It will scroll 10 times
#     response = youtube.video_comments()
#     data = response['body']
#     all_data.extend(data)
# df = pd.DataFrame(data)
# df.to_csv(saved)

# Datatransformation

* **Libraries required**

In [None]:
 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import os


from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from string import punctuation
import nltk
import re

* **Read data**

In [4]:
data = pd.read_csv('../input/blackadam-trailer-comments/comments.csv')
data.columns
data1=data.drop(['Unnamed: 0','Likes','Time','user','UserLink'],axis=1)
data1

Unnamed: 0,Comment
0,Love how Dr. Fate's design looks and how cool ...
1,I can’t get over how good everything looks. Dr...
2,Really hoping that this can save DC's movie un...
3,U cant deny how good this looks.Now if they ca...
4,"From this trailer, I have a feeling that this ..."
...,...
275,I want to see this. It may be one of his most ...
276,wow thats very amazing. I can't wait to see.
277,Doctor Fate is why i'm watching
278,This looks fire. DC looks like they stepping t...


* **Data labelling**

In [5]:
nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
data1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data1["Comment"]]
data1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data1["Comment"]]
data1["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data1["Comment"]]
data1['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in data1["Comment"]]
score = data1["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
data1["Sentiment"] = sentiment
data1.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Comment,Positive,Negative,Neutral,Compound,Sentiment
0,Love how Dr. Fate's design looks and how cool ...,0.384,0.0,0.616,0.891,Positive
1,I can’t get over how good everything looks. Dr...,0.153,0.0,0.847,0.6801,Positive
2,Really hoping that this can save DC's movie un...,0.375,0.0,0.625,0.9216,Positive
3,U cant deny how good this looks.Now if they ca...,0.302,0.049,0.649,0.9262,Positive
4,"From this trailer, I have a feeling that this ...",0.131,0.0,0.869,0.4416,Positive


* **Final data**

In [6]:
data2=data1.drop(['Positive','Negative','Neutral','Compound'],axis=1)
data2.head()

Unnamed: 0,Comment,Sentiment
0,Love how Dr. Fate's design looks and how cool ...,Positive
1,I can’t get over how good everything looks. Dr...,Positive
2,Really hoping that this can save DC's movie un...,Positive
3,U cant deny how good this looks.Now if they ca...,Positive
4,"From this trailer, I have a feeling that this ...",Positive


* **Data transformation**

In [7]:
stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer() 
snowball_stemer = SnowballStemmer(language="english")
lzr = WordNetLemmatizer()

In [None]:
def text_processing(text):   

    text = text.lower()

    
    text = re.sub(r'\n',' ', text)
    
   
    text = re.sub('[%s]' % re.escape(punctuation), "", text)
    
    
    text = re.sub("^a-zA-Z0-9$,.", "", text)
    
   
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    
    
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text

In [9]:
nltk.download('omw-1.4')
data_copy = data2.copy()
data_copy.Comment = data_copy.Comment.apply(lambda text: text_processing(text))

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [10]:
le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])

In [11]:
processed_data = {
    'Sentence':data_copy.Comment,
    'Sentiment':data_copy['Sentiment']
}

processed_data = pd.DataFrame(processed_data)
processed_data.head()

Unnamed: 0,Sentence,Sentiment
0,love dr fate design look cool scene look power...,2
1,get good everything look dr fate magic cyclone...,2
2,really hoping save dc movie universe looking n...,2
3,u cant deny good looksnow follow rest movie go...,2
4,trailer feeling movie going one movie would ne...,2


In [12]:
processed_data['Sentiment'].value_counts()

2    205
1     39
0     36
Name: Sentiment, dtype: int64

* **Balancing data**

In [13]:
df_neutral = processed_data[(processed_data['Sentiment']==1)] 
df_negative = processed_data[(processed_data['Sentiment']==0)]
df_positive = processed_data[(processed_data['Sentiment']==2)]

# upsample minority classes
df_negative_upsampled = resample(df_negative, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  

df_neutral_upsampled = resample(df_neutral, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  


# Concatenate the upsampled dataframes with the neutral dataframe
final_data = pd.concat([df_negative_upsampled,df_neutral_upsampled,df_positive])

In [14]:
final_data['Sentiment'].value_counts()

0    205
1    205
2    205
Name: Sentiment, dtype: int64

In [15]:
corpus = []
for sentence in final_data['Sentence']:
    corpus.append(sentence)
corpus[0:5]

['trailer look sick im definitely watching movie',
 'actually look like villain trailer',
 'movie going push dc top comic book movie disaster early dceu new msheu mess two awesome projekts behind suicide squad peacemaker yeah going rock pun kinda intended',
 'damn sure im gon na watchdc seems going right track',
 'okay look absolutely incredible dc making look foolish ever even skeptical film definitely seeing opening weekend']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data.iloc[:, -1].values

* **Machine learning model**

In [17]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

* **Evaluation**

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[58,  0,  0],
       [ 0, 70,  0],
       [11,  1, 45]])

In [19]:
nb_score = accuracy_score(y_test, y_pred)
print('accuracy',nb_score)

accuracy 0.9351351351351351
