# Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import joblib

from sklearn import preprocessing
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB

import pickle

# Read Embeddings

In [4]:
X_train_tokens = np.load('/content/drive/My Drive/youtube-comment-analysis/GBF/keras_embeddings_train.npy', allow_pickle=True)
y_train_encoded = np.load('/content/drive/My Drive/youtube-comment-analysis/GBF/y_train_encoded.npy', allow_pickle=True)
X_test_tokens = np.load('/content/drive/My Drive/youtube-comment-analysis/GBF/keras_embeddings_test.npy', allow_pickle=True)
y_test_encoded = np.load('/content/drive/My Drive/youtube-comment-analysis/GBF/y_test_encoded.npy', allow_pickle=True)

In [5]:
encoder=joblib.load('/content/drive/My Drive/youtube-comment-analysis/GBF/labelEncoder.joblib')

# Label Encoding

In [6]:
y_train = encoder.inverse_transform(y_train_encoded)
y_test = encoder.inverse_transform(y_test_encoded)

In [7]:
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train.ravel())
y_test = le.transform(y_test.ravel())

# Principal Component Analysis (PCA)

In [8]:
pca_model = PCA(n_components=50)
pca_model.fit(X_train_tokens)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9835621036925072


In [9]:
X_train_tokens = pca_model.transform(X_train_tokens)
X_test_tokens = pca_model.transform(X_test_tokens)

# Model Training

## Random Forest

In [10]:
rfc = RandomForestClassifier()
rfc.fit(X_train_tokens,y_train)

RandomForestClassifier()

In [11]:
y_pred_rf = rfc.predict(X_test_tokens)

In [12]:
accuracy_score(y_test, y_pred_rf)

0.6110147669346786

In [13]:
filename = '/content/drive/My Drive/youtube-comment-analysis/GBF/rf.sav'
pickle.dump(rfc, open(filename, 'wb'))

## Logistic Regression

In [14]:
logreg = LogisticRegression()
logreg.fit(X_train_tokens,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [15]:
y_pred_lr = logreg.predict(X_test_tokens)

In [16]:
accuracy_score(y_test, y_pred_lr)

0.41653090896351186

In [17]:
filename = '/content/drive/My Drive/youtube-comment-analysis/GBF/lr.sav'
pickle.dump(logreg, open(filename, 'wb'))

## Gaussian Naive Bayes

In [18]:
gnb = GaussianNB()
gnb.fit(X_train_tokens,y_train)

GaussianNB()

In [19]:
y_pred_gnb = gnb.predict(X_test_tokens)

In [20]:
accuracy_score(y_test, y_pred_gnb)

0.34286264823865326

In [21]:
filename = '/content/drive/My Drive/youtube-comment-analysis/GBF/gnb.sav'
pickle.dump(gnb, open(filename, 'wb'))

## Bernoulli Naive Bayes

In [22]:
bnb = BernoulliNB()
bnb.fit(X_train_tokens,y_train)

BernoulliNB()

In [23]:
y_pred_bnb = bnb.predict(X_test_tokens)

In [24]:
accuracy_score(y_test, y_pred_bnb)

0.4073869707639217

In [25]:
filename = '/content/drive/My Drive/youtube-comment-analysis/GBF/bnb.sav'
pickle.dump(bnb, open(filename, 'wb'))