# YouTube Comments Sentiment Analysis

### Import packages

In [None]:
# Basics
import pandas as pd; import os
import csv; import numpy as np
import re; import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Reading Pre-Labeled YouTube Video Comments

Here we load pre-labeled comments from 5 popular YouTube videos for training and testing.

In [None]:
# Training data: pre-labeled comments from five YouTube videos.
# Every file skips its first two rows (skiprows=2). OKGO.csv is ';'-delimited and
# read as latin-1; the other four are ','-delimited and read as UTF-8.
okgo = pd.read_csv('/content/drive/MyDrive/Distributed/OKGO.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python')
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='skip'` is the supported way to drop malformed rows.
trump = pd.read_csv('/content/drive/MyDrive/Distributed/trump.csv', delimiter=",", skiprows=2, encoding='utf-8', on_bad_lines='skip', engine='python')
# The remaining files are truncated with `nrows` (180 / 61 / 200 rows respectively).
swift = pd.read_csv('/content/drive/MyDrive/Distributed/TaylorSwift.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python')
royal = pd.read_csv('/content/drive/MyDrive/Distributed/RoyalWedding.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
paul = pd.read_csv('/content/drive/MyDrive/Distributed/LoganPaul.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python')

### Reading Pre-Labeled Tweets & Blog Comments

In [None]:
# Pre-labeled blog comments (Kagel.csv) and tweets (twitter.csv).
# Both files are parsed with identical options: ','-delimited, latin-1,
# first two rows skipped, Python parsing engine.
blogs = pd.read_csv(
    '/content/drive/MyDrive/Distributed/Kagel.csv',
    delimiter=",",
    skiprows=2,
    encoding='latin-1',
    engine='python',
)
tweets = pd.read_csv(
    '/content/drive/MyDrive/Distributed/twitter.csv',
    delimiter=",",
    skiprows=2,
    encoding='latin-1',
    engine='python',
)

## Data Preprocessing

In [None]:
# Clean the tweets frame: remove the metadata columns that are not used for
# modelling, then discard every row that still contains a missing value.
tweets = (
    tweets
    .drop(columns=['Topic', 'TweetId', "TweetDate"])
    .dropna()
)
tweets.head()

Unnamed: 0,Sentiment,TweetText
0,positive,Now all @Apple has to do is get swype on the i...
1,positive,@Apple will be adding more carrier support to ...
2,positive,Hilarious @youtube video - guy does a duet wit...
3,positive,@RIM you made it too easy for me to switch to ...
4,positive,I just realized that the reason I got into twi...


In [None]:
def fix_cols(DF):
    """Return the first two columns of ``DF`` renamed to ``label`` and ``comment``.

    Selection is positional: whatever the first column is called it becomes
    ``label``, and the second becomes ``comment``. Any further columns are dropped.

    Parameters
    ----------
    DF : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent two-column frame with columns ``["label", "comment"]``.

    Raises
    ------
    ValueError
        Raised by pandas when ``DF`` has fewer than two columns (the two new
        names cannot be assigned).
    """
    # Copy the slice so later column assignments on the result (e.g. label
    # re-coding) act on an independent frame rather than on a slice of the
    # caller's data.
    trimmed = DF.iloc[:, :2].copy()
    trimmed.columns = ["label", "comment"]
    return trimmed

In [None]:
okgo = fix_cols(okgo)
trump = fix_cols(trump)
swift = fix_cols(swift)
royal = fix_cols(royal)
paul = fix_cols(paul)
tweets = fix_cols(tweets)


okgo.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [None]:
# Re-code the textual sentiment labels as numeric strings (positive -> 1.0,
# negative -> -1.0, neutral and irrelevant -> 0.0) and parse them as numbers.
# Anything that still cannot be parsed becomes NaN (errors='coerce').
sentiment_codes = {'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}
tweets['label'] = pd.to_numeric(
    tweets['label'].replace(sentiment_codes, regex=True),
    errors='coerce',
)

In [None]:
tweets = fix_cols(tweets)
blogs = fix_cols(blogs)

tweets.head()

Unnamed: 0,label,comment
0,1.0,Now all @Apple has to do is get swype on the i...
1,1.0,@Apple will be adding more carrier support to ...
2,1.0,Hilarious @youtube video - guy does a duet wit...
3,1.0,@RIM you made it too easy for me to switch to ...
4,1.0,I just realized that the reason I got into twi...


### Create Datasets

In [None]:
yt_comments = pd.concat([okgo, trump, swift, royal, paul], ignore_index=True)
yt_comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [None]:
non_yt_comments = pd.concat([blogs, tweets], ignore_index=True)
non_yt_comments.head()

Unnamed: 0,label,comment
0,1.0,i liked the Da Vinci Code a lot
1,1.0,i liked the Da Vinci Code a lot
2,1.0,I liked the Da Vinci Code but it ultimatly di...
3,1.0,that's not even an exaggeration ) and at midn...
4,1.0,I loved the Da Vinci Code but now I want some...


In [None]:
comments = pd.concat([yt_comments, non_yt_comments], ignore_index=True)
comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


### Remove Non-Alphabetic Characters (including numbers)

In [None]:
def convert_to_string(DF):
    """Cast the ``comment`` column of ``DF`` to ``str``.

    The frame is modified in place and nothing is returned.
    """
    as_text = DF["comment"].astype(str)
    DF["comment"] = as_text

In [None]:
convert_to_string(comments)

In [None]:
def cleanerFn(b):
    """Replace every non-alphabetic character in ``b["comment"]`` with a space.

    Each character outside ``a-z`` / ``A-Z`` (digits, punctuation, whitespace,
    accented letters, ...) is replaced by one space, so string lengths are
    preserved. The frame is modified in place and nothing is returned.

    Parameters
    ----------
    b : pandas.DataFrame
        Frame with a ``comment`` column holding strings (run
        ``convert_to_string`` first). Any index is accepted; the operation is
        vectorised and does not rely on row labels being 0..n-1.
    """
    # Vectorised regex replacement; avoids a Python-level loop of per-row
    # `.loc` look-ups, which also required a default RangeIndex.
    b["comment"] = b["comment"].str.replace("[^a-zA-Z]", " ", regex=True)

In [None]:
cleanerFn(comments)
comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand s papers from But No on...
1,0.0,Your paper cut balance is
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


### Natural Language Processing

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
nltk.download('stopwords')
sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#### Tokenization, Remove Stop Words, Lemmatization & Stemming

In [None]:
def nlpFunction(DF):
    """Run the text pipeline on ``DF["comment"]``, adding one column per stage.

    Columns added, in order:

    * ``com_token``   - lower-cased comment split on whitespace
    * ``com_remv``    - ``com_token`` without the words listed in the
      module-level stop-word collection ``sw``
    * ``com_lemma``   - ``com_remv`` passed through ``lemmatizer.lemmatize``
    * ``com_stem``    - ``com_lemma`` passed through ``ps.stem``
    * ``com_tok_str`` - ``com_stem`` joined with ``", "``
    * ``com_full``    - ``com_remv`` joined with ``" "``

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame with a string ``comment`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``DF`` object, with the six columns above added.
    """
    # Build the set once: each token check becomes a hash look-up instead of
    # a scan over the whole stop-word list.
    stop_words = set(sw)

    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )
    DF["com_lemma"] = DF['com_remv'].apply(
        lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens]
    )  # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(
        lambda tokens: [ps.stem(tok) for tok in tokens]
    )  # stemming
    DF["com_tok_str"] = DF["com_stem"].apply(', '.join)
    # NOTE(review): com_full is built from com_remv, i.e. the text WITHOUT
    # lemmatisation/stemming. Confirm this is intended, since com_full is the
    # only column kept for modelling further down.
    DF["com_full"] = DF["com_remv"].apply(' '.join)
    return DF

In [None]:
nltk.download('wordnet')
comments = nlpFunction(comments)
comments.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_tok_str,com_full
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...","everyon, know, brand, paper, one, know, welfar...",everyone knows brand papers one knows welfare ...
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]","paper, cut, balanc",paper cut balance
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","oh, shit, saw, front, page, love, song",oh shit saw front page love song
3,1.0,Blowing my mind yet again,"[blowing, my, mind, yet, again]","[blowing, mind, yet]","[blowing, mind, yet]","[blow, mind, yet]","blow, mind, yet",blowing mind yet
4,0.0,Should have gone with Dunder Mifflin,"[should, have, gone, with, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","gone, dunder, mifflin",gone dunder mifflin


In [None]:
def drop_cols_after_nlp(comments):
    """Return ``comments`` without the raw text and intermediate NLP columns.

    Drops ``comment``, ``com_token``, ``com_remv``, ``com_lemma``, ``com_stem``
    and ``com_tok_str``; every other column is kept. The input frame itself is
    not modified.
    """
    intermediate_cols = [
        'comment',
        'com_token',
        'com_remv',
        'com_lemma',
        'com_stem',
        'com_tok_str',
    ]
    return comments.drop(columns=intermediate_cols)
comments = drop_cols_after_nlp(comments)
comments.head()

Unnamed: 0,label,com_full
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [None]:
comments.rename(columns = {'com_full': 'comment'}, inplace=True)
comments.head()

Unnamed: 0,label,comment
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [None]:
def remove_missing_vals(comments):
    """Strip whitespace from ``comment`` and drop rows with no usable text.

    A row is dropped when its stripped comment is the empty string or the
    literal string ``'nan'`` (which is what ``astype(str)`` produces for a
    missing value).

    The frame is modified **in place**, so call sites that ignore the return
    value still see the rows removed. The same frame is also returned so the
    call can be used in an assignment.

    Parameters
    ----------
    comments : pandas.DataFrame
        Frame with a string ``comment`` column. Rows are removed by index
        label, so index labels are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        The same ``comments`` object, after filtering.
    """
    comments['comment'] = comments['comment'].str.strip()
    # Rebinding the local name to a filtered frame would be invisible to the
    # caller, so the offending rows are dropped from the caller's object.
    unusable = comments['comment'].isin(['nan', ''])
    comments.drop(index=comments.index[unusable], inplace=True)
    return comments

remove_missing_vals(comments)

In [None]:
comments.head()

Unnamed: 0,label,comment
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [None]:
comments['label'].isna().sum()

2355

In [None]:
comments = comments[comments['label'].notna()]
comments['label'].isna().sum()

0

In [None]:
len(comments)

14830

In [None]:
X = comments['comment']
y = comments.label

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53, test_size=0.25)

### Vectorize the comments
<p>We have the training and testing data all set up, but we need to create vectorized representations of the comments in order to apply machine learning.</p>
<p>To do so, we will utilize the <code>CountVectorizer</code> and <code>TfidfVectorizer</code> classes, which we will first need to fit to the training data.</p>
<p>Once this is complete, we can start modeling with the new vectorized comments!</p>

In [None]:
# Both vectorizers share the same options: English stop words are removed and
# a term is kept only if it occurs in at least 5% (min_df) and at most 90%
# (max_df) of the training documents.
# NOTE(review): min_df=0.05 is a strict filter for short comments and may leave
# a very small vocabulary; worth checking len(count_vectorizer.vocabulary_).
vectorizer_options = dict(stop_words='english', min_df=0.05, max_df=0.9)

# Bag-of-words counts: vocabulary is learned on the training split only and
# then re-used unchanged for the test split.
count_vectorizer = CountVectorizer(**vectorizer_options)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# TF-IDF weights: same fit-on-train / transform-on-test pattern.
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

## Model Building

In [None]:
# Set seeds for reproducibility.
# scikit-learn estimators created without an explicit `random_state` draw from
# NumPy's global random generator, so seeding only Python's `random` module
# does not make them repeatable; seed both.
import random
random.seed(5)
np.random.seed(5)

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

### Multinomial Naive-Bayes Model
Training a multinomial naive Bayes model
<p>Now that we have the data in vectorized form, we can train the first model. We investigate the Multinomial Naive Bayes model with both the <code>CountVectorizer</code> and <code>TfidfVectorizer</code> data. Which one will perform better, and why?</p>
<p>To assess the accuracies, we print the test-set accuracy score for both models.</p>

In [None]:
# --- TF-IDF features ---------------------------------------------------------
# Fit a Multinomial Naive Bayes classifier (fit() returns the estimator itself).
tfidf_nb = MultinomialNB().fit(tfidf_train, y_train)
# Predict on the held-out TF-IDF matrix and measure accuracy.
tfidf_nb_pred = tfidf_nb.predict(tfidf_test)
tfidf_nb_score = metrics.accuracy_score(y_test, tfidf_nb_pred)

# --- Raw count features ------------------------------------------------------
count_nb = MultinomialNB().fit(count_train, y_train)
count_nb_pred = count_nb.predict(count_test)
# Accuracy is symmetric in its two arguments; the (y_true, y_pred) order is
# used here to match the TF-IDF call above.
count_nb_score = metrics.accuracy_score(y_test, count_nb_pred)

print('NaiveBayes Tfidf Score: ', tfidf_nb_score)
print('NaiveBayes Count Score: ', count_nb_score)

NaiveBayes Tfidf Score:  0.7909924487594391
NaiveBayes Count Score:  0.7831715210355987


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(tfidf_train,y_train)
accuracy_lr = lr_model.score(tfidf_test,y_test)
print("Logistic Regression accuracy is (for Tfidf) :",accuracy_lr)

Logistic Regression accuracy is (for Tfidf) : 0.7880258899676376


In [None]:
lr_model = LogisticRegression()
lr_model.fit(count_train,y_train)
accuracy_lr = lr_model.score(count_test,y_test)
print("Logistic Regression accuracy is (for Count) :",accuracy_lr)

Logistic Regression accuracy is (for Count) : 0.7877562028047465


### SVC

In [None]:
# Support-vector classifier with a linear kernel, trained on the TF-IDF features
from sklearn import svm

tfidf_svc = svm.SVC(kernel='linear', C=1).fit(tfidf_train, y_train)

# Predict on the held-out TF-IDF matrix and compute the accuracy
tfidf_svc_pred = tfidf_svc.predict(tfidf_test)
tfidf_svc_score = metrics.accuracy_score(y_test, tfidf_svc_pred)

# NOTE(review): the printed label says "LinearSVC", but the estimator used is
# svm.SVC(kernel='linear'), not sklearn.svm.LinearSVC.
print("LinearSVC Score (for tfidf):   %0.3f" % tfidf_svc_score)

LinearSVC Score (for tfidf):   0.792


In [None]:
# Linear-kernel SVC trained on the raw count features
count_svc = svm.SVC(kernel='linear', C=1)

count_svc.fit(count_train,y_train)
# Run predict on the count test data to get the predictions
count_svc_pred = count_svc.predict(count_test)

# Calculate the accuracy using the metrics module
count_svc_score = metrics.accuracy_score(y_test,count_svc_pred)

# Report count_svc_score, the score of THIS model (not tfidf_svc_score from
# the TF-IDF cell).
print("LinearSVC Score (for Count):   %0.3f" % count_svc_score)

LinearSVC Score (for Count):   0.792


### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(tfidf_train,y_train)
accuracy_dt = dt_model.score(tfidf_test,y_test)
print("Decision Tree accuracy is (for Tfidf):",accuracy_dt)

Decision Tree accuracy is (for Tfidf): 0.7982740021574973


In [None]:
dt_model = DecisionTreeClassifier()
dt_model.fit(count_train,y_train)
accuracy_dt = dt_model.score(count_test,y_test)
print("Decision Tree accuracy is (for Count):",accuracy_dt)

Decision Tree accuracy is (for Count): 0.7977346278317152


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model_initial = RandomForestClassifier(n_estimators = 5, random_state = 1)
rf_model_initial.fit(tfidf_train,y_train)
print("Random Forest accuracy for 5 trees is (Tfidf):",rf_model_initial.score(tfidf_test,y_test))

Random Forest accuracy for 5 trees is (Tfidf): 0.7977346278317152


In [None]:
rf_model_initial = RandomForestClassifier(n_estimators = 5, random_state = 1)
rf_model_initial.fit(count_train,y_train)
print("Random Forest accuracy for 5 trees is (Count):",rf_model_initial.score(count_test,y_test))

Random Forest accuracy for 5 trees is (Count): 0.7974649406688241


# Predicting Sentiment For YouTube video

**Steps:**
1. Run the main.py file (<code>python main.py</code>).
2. Enter the YouTube video of your choice. The comments for that video will be downloaded into the Comments.csv file.
3. Then execute the cells below.

## Reading Data

### Reading Testing YouTube Video Comments

The Comments.csv file contains the comments of the YouTube video.

In [None]:
pip install extractor


Collecting extractor
  Downloading Extractor-0.5.tar.gz (6.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: extractor
  Building wheel for extractor (setup.py) ... [?25l[?25hdone
  Created wheel for extractor: filename=Extractor-0.5-py3-none-any.whl size=7050 sha256=21c00ebbe63fb4769c3e10f4d184027ffb1d3555dc02fb101fb456676fcca350
  Stored in directory: /root/.cache/pip/wheels/89/17/06/d6c326d8875ed87617a38d9079d67fc5d9139d96049fc00f31
Successfully built extractor
Installing collected packages: extractor
Successfully installed extractor-0.5


In [None]:
pip install youtube-comment-downloader


Collecting youtube-comment-downloader
  Downloading youtube_comment_downloader-0.1.70-py3-none-any.whl (7.9 kB)
Collecting dateparser (from youtube-comment-downloader)
  Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dateparser, youtube-comment-downloader
Successfully installed dateparser-1.2.0 youtube-comment-downloader-0.1.70


In [None]:
prediction_comments = pd.read_csv('/content/drive/MyDrive/Distributed/output_comments.csv', delimiter=",", encoding='utf-8', engine='python')
prediction_comments = prediction_comments.iloc[:,:1]
prediction_comments.columns=['comment']
prediction_comments.head()

Unnamed: 0,comment
0,Thank you all for the love towards the music o...
1,He is the SRK we girls fell in love with back ...
2,After around ten years we are going to come ba...
3,You can ignore the whole Bollywood industry bu...
4,It doesn't matter how many times i listen to t...


In [None]:
# Lets use SVC to predict on our youtube video comments
prediction_comments.head()

Unnamed: 0,comment
0,Thank you all for the love towards the music o...
1,He is the SRK we girls fell in love with back ...
2,After around ten years we are going to come ba...
3,You can ignore the whole Bollywood industry bu...
4,It doesn't matter how many times i listen to t...


In [None]:
len(prediction_comments['comment'])

1001

In [None]:
convert_to_string(prediction_comments)
cleanerFn(prediction_comments)
prediction_comments = nlpFunction(prediction_comments)
prediction_comments = drop_cols_after_nlp(prediction_comments)
prediction_comments.rename(columns = {'com_full': 'comment'}, inplace=True)
remove_missing_vals(prediction_comments)
prediction_comments.head()

Unnamed: 0,comment
0,thank love towards music jawan
1,srk girls fell love back lost feel like back
2,around ten years going come back feel nostalgi...
3,ignore whole bollywood industry ignore movie s...
4,matter many times listen song still give fresh...


In [None]:
tfidf_pred = tfidf_vectorizer.transform(prediction_comments['comment'])
tfidf_svc_pred = tfidf_svc.predict(tfidf_pred)

In [None]:
neutral = (tfidf_svc_pred == 0.0).sum()
positive = (tfidf_svc_pred == 1.0).sum()
negative = (tfidf_svc_pred < 0).sum()

In [None]:
print(neutral, positive, negative)

878 123 0


In [None]:
print("Good video" if positive > negative else "Bad video")

Good video
