In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Read the filtered comments dataset from disk and preview its first rows.
# NOTE(review): the NLTK resources needed later by word_tokenize, stopwords and
# WordNetLemmatizer must already be installed; no nltk.download call is visible
# in this notebook — confirm before running in a fresh environment.
csv_path = 'YoutubeCommentsDataSet_Filtered.csv'
df = pd.read_csv(csv_path)

print(df.head())

                                             Comment Sentiment
0  lets not forget that apple pay in 2014 require...   neutral
1  i will forever acknowledge this channel with t...  positive
2  apple pay is so convenient secure and easy to ...  positive
3  for now i need both apple pay and the physical...   neutral
4  in the united states we have an abundance of r...  positive


## Preprocessing

In [5]:
# Preprocess text
# (tokenization, stopwords removal, lemmatization)

# Lazily filled cache so the stop-word collection and the lemmatizer are built
# once, on first use, rather than once per token / once per comment.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests; previously the full
        # stop-word list was re-read and linearly scanned for every token.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Lower-case, tokenize, drop English stop words and lemmatize a comment.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (e.g. ``None`` or the NaN that
        pandas uses for a missing cell) is treated as an empty comment.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when nothing survives or the input was not a string.
    """
    # Missing values would otherwise fail on ``text.lower()``.
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Drop stop words and lemmatize the remaining tokens in a single pass
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]

    # Join the tokens back into a string
    return " ".join(lemmatized_tokens)

In [None]:
# Let's preprocess the text:
# first: tokenize the text
# second: remove stopwords (preprocess_text does not strip punctuation or special characters itself)
# third: lemmatization (no stemming is applied)

In [None]:
# Overwrite the 'Comment' column with its preprocessed form.
# NOTE(review): this replaces the raw text in place, so re-running the cell
# feeds already-preprocessed comments through preprocess_text again.
raw_comments = df['Comment']
df['Comment'] = raw_comments.apply(preprocess_text)

In [15]:
# Display the first rows of df (rich notebook rendering of the bare expression).
df.head()

Unnamed: 0,Comment,Sentiment
0,let forget apple pay 2014 required brand new i...,neutral
1,forever acknowledge channel help lesson idea e...,positive
2,apple pay convenient secure easy use used kore...,positive
3,need apple pay physical credit card,neutral
4,united state abundance retailer accept apple p...,positive


## Convert Text to Numerical Form

In [40]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

In [None]:
# Convert text to TF-IDF vectors (using TfidfVectorizer)
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed later in the notebook, so its vocabulary and IDF weights
# are computed from the test rows as well.
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
X = vectorizer.fit_transform(df['Comment'])

# Map the 'Sentiment' label to numerical format
# 'positive': 1, 'neutral': 0, and 'negative': -1
sentiment_map = {"positive": 1, "neutral": 0, "negative": -1}
df["Sentiment_Score"] = df["Sentiment"].map(sentiment_map)

# Series.map yields NaN for any label that is not a key of sentiment_map
# (different casing, stray whitespace, missing value). Fail here with the
# offending labels instead of letting NaN targets reach the classifiers.
_unmapped_labels = df.loc[df["Sentiment_Score"].isna(), "Sentiment"].unique()
if len(_unmapped_labels) > 0:
    raise ValueError(
        "Sentiment labels missing from sentiment_map: {}".format(list(_unmapped_labels))
    )

y = df['Sentiment_Score']  # Target vector with values 1, 0, -1.

## Train Models

In [21]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Train SVM classifier

In [22]:
# Fit a linear-kernel support vector classifier on the training split.
svc_model = SVC(
    kernel='linear',
    random_state=42,
)
svc_model.fit(X_train, y_train)


### Train Random Forest classifier

In [23]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

### Evaluate Models

In [26]:
# Score the linear SVC on the held-out test split.
y_pred_svc = svc_model.predict(X_test)

# Compute each metric once, then print them in the same order as before.
svc_report = classification_report(y_test, y_pred_svc)
svc_accuracy = accuracy_score(y_test, y_pred_svc)
svc_confusion = confusion_matrix(y_test, y_pred_svc)

print("SVC Classification Report:\n", svc_report)
print("SVC Accuracy:", svc_accuracy)
print("Confusion Matrix:\n", svc_confusion)

SVC Classification Report:
               precision    recall  f1-score   support

          -1       0.62      0.41      0.49       377
           0       0.55      0.49      0.52       620
           1       0.82      0.91      0.86      1866

    accuracy                           0.75      2863
   macro avg       0.66      0.60      0.62      2863
weighted avg       0.74      0.75      0.74      2863

SVC Accuracy: 0.7520083828152287
Confusion Matrix:
 [[ 153  104  120]
 [  66  304  250]
 [  29  141 1696]]


In [28]:
# Score the random forest on the held-out test split.
y_pred_rf = rf_model.predict(X_test)

# Compute each metric once, then print them in the same order as before.
rf_report = classification_report(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("Random Forest Classification Report:\n", rf_report)
print("Random Forest Accuracy:", rf_accuracy)
print("Confusion Matrix:\n", rf_confusion)

Random Forest Classification Report:
               precision    recall  f1-score   support

          -1       0.78      0.06      0.10       377
           0       0.64      0.29      0.40       620
           1       0.71      0.97      0.82      1866

    accuracy                           0.70      2863
   macro avg       0.71      0.44      0.44      2863
weighted avg       0.70      0.70      0.64      2863

Random Forest Accuracy: 0.7048550471533357
Confusion Matrix:
 [[  21   55  301]
 [   5  181  434]
 [   1   49 1816]]


## Model Inference

In [None]:
test_text = ["Wow! what a caricature?", "What a disgusting scene"]

# Run the new text through the same preprocess_text step that was applied to
# the training comments, so the vectorizer sees tokens in the form it was
# fitted on (stop words removed, tokens lemmatized).
test_text_clean = [preprocess_text(text) for text in test_text]
test_text_emb = vectorizer.transform(test_text_clean)

svc_model.predict(test_text_emb)

array([1, 0])

In [72]:
test_text = ["Disgusting! scene?"]

# Run the new text through the same preprocess_text step that was applied to
# the training comments, so the vectorizer sees tokens in the form it was
# fitted on (stop words removed, tokens lemmatized).
test_text_clean = [preprocess_text(text) for text in test_text]
test_text_emb = vectorizer.transform(test_text_clean)

svc_model.predict(test_text_emb)

array([0])

### Using CountVectorizer and TfidfTransformer

In [59]:
# Build raw term-count features, limited to 5000 vocabulary entries.
count_vectorizer = CountVectorizer(max_features=5000)

X_vec = count_vectorizer.fit_transform(df['Comment'])
# toarray() yields a plain ndarray; todense() produced an np.matrix, a class
# NumPy no longer recommends. The values are the same.
# NOTE(review): densifying a documents-by-5000 count matrix can use a lot of
# memory — confirm it is actually needed downstream.
X_vec = X_vec.toarray()

In [None]:
# Re-weight the raw term counts into TF-IDF features.
# NOTE(review): this cell overwrites X_vec with a function of itself, so
# re-running it on its own applies the TF-IDF weighting a second time.
tfidf_transformer = TfidfTransformer()
X_vec = tfidf_transformer.fit_transform(np.asarray(X_vec))
# toarray() yields a plain ndarray; todense() produced an np.matrix, a class
# NumPy no longer recommends. The values are the same.
X_vec = X_vec.toarray()

In [66]:
# Split the CountVectorizer + TfidfTransformer features with the same
# test fraction and seed as the earlier split.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    np.asarray(X_vec),
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [64]:
# SVC classifier trained on the CountVectorizer + TfidfTransformer features
svc_model_ = SVC(kernel='linear', random_state=42)
svc_model_.fit(X_train_, y_train_)

y_pred_svc_ = svc_model_.predict(X_test_)

# SVM Evaluation
# Score against y_test_, the labels produced by the same split as X_test_,
# rather than y_test left over from the earlier TfidfVectorizer experiment.
print("SVC Classification Report:\n", classification_report(y_test_, y_pred_svc_))
print("SVC Accuracy:", accuracy_score(y_test_, y_pred_svc_))
print("Confusion Matrix:\n", confusion_matrix(y_test_, y_pred_svc_))

SVC Classification Report:
               precision    recall  f1-score   support

          -1       0.62      0.41      0.49       377
           0       0.55      0.49      0.52       620
           1       0.82      0.91      0.86      1866

    accuracy                           0.75      2863
   macro avg       0.66      0.60      0.62      2863
weighted avg       0.74      0.75      0.74      2863

SVC Accuracy: 0.7520083828152287
Confusion Matrix:
 [[ 153  104  120]
 [  66  304  250]
 [  29  141 1696]]


In [67]:
# Random Forest trained on the CountVectorizer + TfidfTransformer features
rf_model_ = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_.fit(X_train_, y_train_)

# Random Forest Evaluation
# Predict with rf_model_, the model fitted just above. The previous code
# called rf_model (the forest from the earlier experiment), so this cell
# never evaluated the model it had trained.
y_pred_rf_ = rf_model_.predict(X_test_)
# Score against y_test_, the labels produced by the same split as X_test_.
print("Random Forest Classification Report:\n", classification_report(y_test_, y_pred_rf_))
print("Random Forest Accuracy:", accuracy_score(y_test_, y_pred_rf_))
print("Confusion Matrix:\n", confusion_matrix(y_test_, y_pred_rf_))

Random Forest Classification Report:
               precision    recall  f1-score   support

          -1       0.78      0.06      0.10       377
           0       0.64      0.29      0.40       620
           1       0.71      0.97      0.82      1866

    accuracy                           0.70      2863
   macro avg       0.71      0.44      0.44      2863
weighted avg       0.70      0.70      0.64      2863

Random Forest Accuracy: 0.7048550471533357
Confusion Matrix:
 [[  21   55  301]
 [   5  181  434]
 [   1   49 1816]]


### Pickling the model

In [70]:
import pickle

# Persist the fitted SVC and the TF-IDF vectorizer. The context managers
# guarantee each file handle is flushed and closed once its dump finishes;
# the inline open(...) calls previously left both handles unclosed.
with open('svc_model.pkl', 'wb') as model_file:
    pickle.dump(svc_model, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)