In [1]:
# YouTube Comments Sentiment Analysis 

### Import packages

# Basics
import pandas as pd; import os
import csv; import numpy as np
import re; import warnings
warnings.filterwarnings('ignore')


In [2]:
### Reading Pre-Labeled YouTube Video Comments
positiveones = pd.read_csv('positiveones.csv')
negativeones = pd.read_csv('negativeones.csv')


In [3]:
def fix_cols(DF):
    DF = DF.iloc[:,:2]
    DF.columns = ["comment", "label"]
    return DF

positiveones = fix_cols(positiveones)
negativeones = fix_cols(negativeones)



In [4]:
comments= pd.concat([positiveones,negativeones], ignore_index=True)
comments.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  25000 non-null  object
 1   label    25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [5]:
def convert_to_string(DF):
    DF["comment"]= DF["comment"].astype(str) 

convert_to_string(comments)

def cleanerFn(b):
    # keeps only words with alphabetic characters in comments
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)

cleanerFn(comments)
comments.head()


Unnamed: 0,comment,label
0,Bromwell High is a cartoon comedy It ran at t...,1
1,Homelessness or Houselessness as George Carli...,1
2,Brilliant over acting by Lesley Ann Warren Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film It wa...,1


In [6]:
### Natural Language Processing

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#### Tokenization, Remove Stop Words, Lemmatization & Stemming

def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_tok_str"] = DF["com_stem"].apply(', '.join)
    DF["com_full"] = DF["com_remv"].apply(' '.join)
    return DF

comments = nlpFunction(comments)
comments.head()

Unnamed: 0,comment,label,com_token,com_remv,com_lemma,com_stem,com_tok_str,com_full
0,Bromwell High is a cartoon comedy It ran at t...,1,"[bromwell, high, is, a, cartoon, comedy, it, r...","[bromwell, high, cartoon, comedy, ran, time, p...","[bromwell, high, cartoon, comedy, ran, time, p...","[bromwel, high, cartoon, comedi, ran, time, pr...","bromwel, high, cartoon, comedi, ran, time, pro...",bromwell high cartoon comedy ran time programs...
1,Homelessness or Houselessness as George Carli...,1,"[homelessness, or, houselessness, as, george, ...","[homelessness, houselessness, george, carlin, ...","[homelessness, houselessness, george, carlin, ...","[homeless, houseless, georg, carlin, state, is...","homeless, houseless, georg, carlin, state, iss...",homelessness houselessness george carlin state...
2,Brilliant over acting by Lesley Ann Warren Be...,1,"[brilliant, over, acting, by, lesley, ann, war...","[brilliant, acting, lesley, ann, warren, best,...","[brilliant, acting, lesley, ann, warren, best,...","[brilliant, act, lesley, ann, warren, best, dr...","brilliant, act, lesley, ann, warren, best, dra...",brilliant acting lesley ann warren best dramat...
3,This is easily the most underrated film inn th...,1,"[this, is, easily, the, most, underrated, film...","[easily, underrated, film, inn, brooks, cannon...","[easily, underrated, film, inn, brook, cannon,...","[easili, underr, film, inn, brook, cannon, sur...","easili, underr, film, inn, brook, cannon, sure...",easily underrated film inn brooks cannon sure ...
4,This is not the typical Mel Brooks film It wa...,1,"[this, is, not, the, typical, mel, brooks, fil...","[typical, mel, brooks, film, much, less, slaps...","[typical, mel, brook, film, much, le, slapstic...","[typic, mel, brook, film, much, le, slapstick,...","typic, mel, brook, film, much, le, slapstick, ...",typical mel brooks film much less slapstick mo...


In [7]:
def drop_cols_after_nlp(comments):
    comments = comments.drop(columns = ['comment', 'com_token', 'com_remv', 'com_lemma', 'com_stem', 'com_tok_str'], axis = 1)
    return comments
comments = drop_cols_after_nlp(comments)
comments.head()

comments.rename(columns = {'com_full': 'comment'}, inplace=True)
comments.head()


Unnamed: 0,label,comment
0,1,bromwell high cartoon comedy ran time programs...
1,1,homelessness houselessness george carlin state...
2,1,brilliant acting lesley ann warren best dramat...
3,1,easily underrated film inn brooks cannon sure ...
4,1,typical mel brooks film much less slapstick mo...


In [8]:
def remove_missing_vals(comments): 
    comments['comment'] = comments['comment'].str.strip()
    comments = comments[comments.comment != 'nan'] # remove nan values from data
    comments = comments[comments.comment != '']
    
remove_missing_vals(comments)

comments.head()

comments['label'].isna().sum()

comments = comments[comments['label'].notna()]



In [9]:
comments.to_csv('mergeddataset.csv')

In [10]:
X = comments['comment']
y = comments.label

import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle

# Assuming you have already loaded the dataset into X and y
# Shuffle the data to avoid keeping all samples of one class together
X, y = shuffle(X, y, random_state=42)

# Now proceed with train-test split and model training

# Use train_test_split with stratify to maintain the same class proportions in the splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Now, X_train, y_train, X_test, y_test will have both positive and negative samples in random order


In [11]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', 
                                   min_df=0.05, max_df=0.9)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', 
                                   min_df=0.05, max_df=0.9)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)


In [12]:
## Model Building

# Set seed for reproducibility
import random; random.seed(5)

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

### Multinomial Naive-Bayes Model
Training a multinomial naive Bayes model
<p>Now that we have the data in vectorized form, we can train the first model. Investigate using the Multinomial Naive Bayes model with both the <code>CountVectorizer</code> and <code>TfidfVectorizer</code> data. Which do will perform better? How come?</p>
<p>To assess the accuracies, we will print the test sets accuracy scores for both models.</p>


In [13]:

# Create a MulitnomialNB model
tfidf_nb = MultinomialNB()
tfidf_nb.fit(tfidf_train,y_train)
# Run predict on your TF-IDF test data to get your predictions
tfidf_nb_pred = tfidf_nb.predict(tfidf_test)

# Calculate the accuracy of your predictions
tfidf_nb_score = metrics.accuracy_score(y_test,tfidf_nb_pred)

# Create a MulitnomialNB model
count_nb = MultinomialNB()
count_nb.fit(count_train,y_train)

# Run predict on your count test data to get your predictions
count_nb_pred = count_nb.predict(count_test)

# Calculate the accuracy of your predictions
count_nb_score = metrics.accuracy_score(count_nb_pred,y_test)

print('NaiveBayes Tfidf Score: ', tfidf_nb_score)
print('NaiveBayes Count Score: ', count_nb_score)


NaiveBayes Tfidf Score:  0.797
NaiveBayes Count Score:  0.7898


In [14]:
### Logistic Regression

from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(tfidf_train,y_train)
accuracy_lr = lr_model.score(tfidf_test,y_test)
y_lr_pred = lr_model.predict(tfidf_test)
print("Logistic Regression accuracy is (for Tfidf) :",accuracy_lr)

lr_model = LogisticRegression()
lr_model.fit(count_train,y_train)
accuracy_lr = lr_model.score(count_test,y_test)
print("Logistic Regression accuracy is (for Count) :",accuracy_lr)


Logistic Regression accuracy is (for Tfidf) : 0.8054
Logistic Regression accuracy is (for Count) : 0.8078


In [15]:
### SVC

# Create a SVM model
from sklearn import svm
tfidf_svc = svm.SVC(kernel='linear', C=1)

tfidf_svc.fit(tfidf_train,y_train)
# Run predict on your tfidf test data to get your predictions
tfidf_svc_pred = tfidf_svc.predict(tfidf_test)

# Calculate your accuracy using the metrics module
tfidf_svc_score = metrics.accuracy_score(y_test,tfidf_svc_pred)

print("LinearSVC Score (for tfidf):   %0.3f" % tfidf_svc_score)

count_svc = svm.SVC(kernel='linear', C=1)

count_svc.fit(count_train,y_train)
# Run predict on your count test data to get your predictions
count_svc_pred = count_svc.predict(count_test)

# Calculate your accuracy using the metrics module
count_svc_score = metrics.accuracy_score(y_test,count_svc_pred)

print("LinearSVC Score (for Count):   %0.3f" % tfidf_svc_score)

LinearSVC Score (for tfidf):   0.807
LinearSVC Score (for Count):   0.807


In [16]:
### Desicion Tree

from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(tfidf_train,y_train)
accuracy_dt = dt_model.score(tfidf_test,y_test)
print("Decision Tree accuracy is (for Tfidf):",accuracy_dt)

dt_model = DecisionTreeClassifier()
dt_model.fit(count_train,y_train)
accuracy_dt = dt_model.score(count_test,y_test)
print("Decision Tree accuracy is (for Count):",accuracy_dt)


Decision Tree accuracy is (for Tfidf): 0.6806
Decision Tree accuracy is (for Count): 0.6756


In [29]:
### Random Forest

from sklearn.ensemble import RandomForestClassifier
rf_model_initial = RandomForestClassifier(n_estimators = 9, random_state = 1)
rf_model_initial.fit(tfidf_train,y_train)
print("Random Forest accuracy for 9 trees is (Tfidf):",rf_model_initial.score(tfidf_test,y_test))

rf_model_initial = RandomForestClassifier(n_estimators = 9, random_state = 1)
rf_model_initial.fit(count_train,y_train)
print("Random Forest accuracy for 9 trees is (Count):",rf_model_initial.score(count_test,y_test))


Random Forest accuracy for 9 trees is (Tfidf): 0.752
Random Forest accuracy for 9 trees is (Count): 0.7444


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics



# Evaluate the models
print("Naive Bayes Tfidf:")
print("Precision:", metrics.precision_score(y_test, tfidf_nb_pred))
print("Recall:", metrics.recall_score(y_test, tfidf_nb_pred))
print("F1-score:", metrics.f1_score(y_test, tfidf_nb_pred))
print("ROC-AUC:", metrics.roc_auc_score(y_test, tfidf_nb.predict_proba(tfidf_test)[:, 1]))
print()

print("SVC Tfidf:")
print("Precision:", metrics.precision_score(y_test,tfidf_svc_pred))
print("Recall:",metrics.recall_score(y_test,tfidf_svc_pred))
print("F1-score:", metrics.f1_score(y_test, tfidf_svc_pred))
decision_scores_svc_tfidf = tfidf_svc.decision_function(tfidf_test)
print("ROC-AUC:", metrics.roc_auc_score(y_test, decision_scores_svc_tfidf))
print()

print("Logistic Regression Tfidf:")
print("Precision:", metrics.precision_score(y_test, y_lr_pred))
print("Recall:", metrics.recall_score(y_test, y_lr_pred))
print("F1-score:", metrics.f1_score(y_test, y_lr_pred))
print("ROC-AUC:", metrics.roc_auc_score(y_test, lr_model.predict_proba(tfidf_test)[:, 1]))
print()

y_pred_dt_tfidf = dt_model.predict(tfidf_test)
print("Decision Tree Tfidf:")
print("Precision:", metrics.precision_score(y_test, y_pred_dt_tfidf))
print("Recall:", metrics.recall_score(y_test, y_pred_dt_tfidf))
print("F1-score:", metrics.f1_score(y_test, y_pred_dt_tfidf))
print("ROC-AUC:", metrics.roc_auc_score(y_test, dt_model.predict_proba(tfidf_test)[:, 1]))
print()
y_pred_rf_tfidf = rf_model_initial.predict(tfidf_test)
print("Random Forest Tfidf:")
print("Precision:", metrics.precision_score(y_test, y_pred_rf_tfidf))
print("Recall:", metrics.recall_score(y_test, y_pred_rf_tfidf))
print("F1-score:", metrics.f1_score(y_test, y_pred_rf_tfidf))
print("ROC-AUC:", metrics.roc_auc_score(y_test, rf_model_initial.predict_proba(tfidf_test)[:, 1]))
print()


Naive Bayes Tfidf:
Precision: 0.7837218188765762
Recall: 0.8204
F1-score: 0.8016415868673051
ROC-AUC: 0.87701472

SVC Tfidf:
Precision: 0.7915557246101179
Recall: 0.8324
F1-score: 0.811464223045428
ROC-AUC: 0.8869832

Logistic Regression Tfidf:
Precision: 0.7967353284104158
Recall: 0.82
F1-score: 0.808200275970826
ROC-AUC: 0.8839177599999999

Decision Tree Tfidf:
Precision: 0.5070774354704413
Recall: 0.9744
F1-score: 0.6670317634173055
ROC-AUC: 0.5136

Random Forest Tfidf:
Precision: 0.5078125
Recall: 0.988
F1-score: 0.6708310700706138
ROC-AUC: 0.52232056



In [None]:
# prediction_comments = pd.read_csv('Comments.csv', delimiter=",", encoding='utf-8', engine='python')
# prediction_comments = prediction_comments.iloc[:,:1]
# prediction_comments.columns=['comment']
# prediction_comments.head()

# convert_to_string(prediction_comments)
# cleanerFn(prediction_comments)
# prediction_comments = nlpFunction(prediction_comments)
# prediction_comments = drop_cols_after_nlp(prediction_comments)
# prediction_comments.rename(columns = {'com_full': 'comment'}, inplace=True)
# remove_missing_vals(prediction_comments)
# prediction_comments.head()

# tfidf_pred = tfidf_vectorizer.transform(prediction_comments['comment'])
# tfidf_svc_pred = tfidf_svc.predict(tfidf_pred)

# positive = (tfidf_svc_pred == 1.0).sum()
# negative = (tfidf_svc_pred == 0.0).sum()


# print(neutral, positive, negative)

# print("Good video" if positive > negative else "Bad video")

In [31]:
import joblib

# Save MultinomialNB model and vectorizer for TF-IDF
joblib.dump(tfidf_nb, 'tfidf_nb_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save MultinomialNB model and vectorizer for CountVectorizer
joblib.dump(count_nb, 'count_nb_model.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')

# Save Logistic Regression model and vectorizer for TF-IDF
joblib.dump(lr_model, 'tfidf_lr_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save Logistic Regression model and vectorizer for CountVectorizer
joblib.dump(lr_model, 'count_lr_model.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')

# Save LinearSVC model and vectorizer for TF-IDF
joblib.dump(tfidf_svc, 'tfidf_svc_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save LinearSVC model and vectorizer for CountVectorizer
joblib.dump(count_svc, 'count_svc_model.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')

# Save Decision Tree model and vectorizer for TF-IDF
joblib.dump(dt_model, 'tfidf_dt_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save Decision Tree model and vectorizer for CountVectorizer
joblib.dump(dt_model, 'count_dt_model.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')

# Save Random Forest model and vectorizer for TF-IDF
joblib.dump(rf_model_initial, 'tfidf_rf_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save Random Forest model and vectorizer for CountVectorizer
joblib.dump(rf_model_initial, 'count_rf_model.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')


['count_vectorizer.pkl']