In [1]:
import pandas as pd
import zipfile
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
z = zipfile.ZipFile("./dataset.zip")
Psy = pd.read_csv(z.open("Youtube01-Psy.csv"))
Katy = pd.read_csv(z.open("Youtube02-KatyPerry.csv"))
LMFAO = pd.read_csv(z.open("Youtube03-LMFAO.csv"))
Eminem = pd.read_csv(z.open("Youtube04-Eminem.csv"))
Shakira = pd.read_csv(z.open("Youtube05-Shakira.csv"))

In [3]:
data = pd.concat([Psy, Katy, LMFAO, Eminem, Shakira])
data.drop(["COMMENT_ID", "DATE", "AUTHOR"], axis=1, inplace=True)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(data["CONTENT"], data["CLASS"])

In [5]:
# Term frequency - inverse document frequency

tfidf_vect = TfidfVectorizer(use_idf=True, lowercase=True)
x_train_tfidf = tfidf_vect.fit_transform(x_train)
x_train_tfidf.shape

(1467, 3601)

In [6]:
with open("./model/tfidf-vect.pkl", "wb") as tfidf_vect_file:
  pickle.dump(tfidf_vect, tfidf_vect_file)

In [7]:
x_test_tfidf = tfidf_vect.transform(x_test)

In [8]:
# Create and train Naive Bayes model

model_nb = MultinomialNB()
model_nb.fit(x_train_tfidf, y_train)

In [9]:
predictions_nb = model_nb.predict(x_test_tfidf)

In [10]:
confusion_matrix(y_test, predictions_nb)

array([[203,  31],
       [  9, 246]], dtype=int64)

In [11]:
print(classification_report(y_test, predictions_nb))

              precision    recall  f1-score   support

           0       0.96      0.87      0.91       234
           1       0.89      0.96      0.92       255

    accuracy                           0.92       489
   macro avg       0.92      0.92      0.92       489
weighted avg       0.92      0.92      0.92       489



In [12]:
model_nb.score(x_test_tfidf, y_test)

0.918200408997955

In [13]:
with open("./model/nb.pkl", "wb") as model_file:
  pickle.dump(model_nb, model_file)

In [14]:
# Create and train Logistic Regression model

model_lr = LogisticRegression()
model_lr.fit(x_train_tfidf, y_train)

In [15]:
predictions_lr = model_lr.predict(x_test_tfidf)

In [16]:
confusion_matrix(y_test, predictions_lr)

array([[225,   9],
       [ 13, 242]], dtype=int64)

In [17]:
print(classification_report(y_test, predictions_lr))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       234
           1       0.96      0.95      0.96       255

    accuracy                           0.96       489
   macro avg       0.95      0.96      0.95       489
weighted avg       0.96      0.96      0.96       489



In [18]:
model_lr.score(x_test_tfidf, y_test)

0.9550102249488752

In [19]:
with open("./model/lr.pkl", "wb") as model_file:
  pickle.dump(model_lr, model_file)

In [20]:
# Create and train Random Forest Classifier model

model_rfc = RandomForestClassifier()
model_rfc.fit(x_train_tfidf, y_train)

In [21]:
predictions_rfc = model_rfc.predict(x_test_tfidf)

In [22]:
confusion_matrix(y_test, predictions_rfc)

array([[228,   6],
       [ 14, 241]], dtype=int64)

In [23]:
print(classification_report(y_test, predictions_rfc))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       234
           1       0.98      0.95      0.96       255

    accuracy                           0.96       489
   macro avg       0.96      0.96      0.96       489
weighted avg       0.96      0.96      0.96       489



In [24]:
model_rfc.score(x_test_tfidf, y_test)

0.9591002044989775

In [25]:
with open("./model/rfc.pkl", "wb") as model_file:
  pickle.dump(model_lr, model_file)

In [26]:
# Create and train Multi-layer Preceptron model

model_nn = MLPClassifier(hidden_layer_sizes=(20,40,40,20), activation="relu", solver="adam", max_iter=10000)
model_nn.fit(x_train_tfidf, y_train)

In [27]:
predictions_nn = model_nn.predict(x_test_tfidf)

In [28]:
confusion_matrix(y_test, predictions_nn)

array([[218,  16],
       [ 13, 242]], dtype=int64)

In [29]:
print(classification_report(y_test, predictions_nn))

              precision    recall  f1-score   support

           0       0.94      0.93      0.94       234
           1       0.94      0.95      0.94       255

    accuracy                           0.94       489
   macro avg       0.94      0.94      0.94       489
weighted avg       0.94      0.94      0.94       489



In [30]:
model_nn.score(x_test_tfidf, y_test)

0.9406952965235174

In [31]:
with open("./model/nn.pkl", "wb") as model_file:
  pickle.dump(model_nn, model_file)

In [32]:
# Create and train XGBClassifier

model_xgb = XGBClassifier(objective="binary:logistic", max_depth=4, alpha=10, learning_rate=1.0, n_estimators=100)
model_xgb.fit(x_train_tfidf, y_train)

In [33]:
predictions_xgb = model_xgb.predict(x_test_tfidf)

In [34]:
confusion_matrix(y_test, predictions_xgb)

array([[229,   5],
       [ 24, 231]], dtype=int64)

In [35]:
print(classification_report(y_test, predictions_xgb))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       234
           1       0.98      0.91      0.94       255

    accuracy                           0.94       489
   macro avg       0.94      0.94      0.94       489
weighted avg       0.94      0.94      0.94       489



In [36]:
model_xgb.score(x_test_tfidf, y_test)

0.9406952965235174

In [37]:
with open("./model/xgb.pkl", "wb") as model_file:
  pickle.dump(model_xgb, model_file)