In [116]:
import pandas as pd
import numpy as np
from apiclient.discovery import build
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import string
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.compose import ColumnTransformer

# Connecting to DB and loading data...

In [2]:
# connect to mongo cloud db and select collection

load_dotenv()

uri = os.getenv('MONGO_URI')

mongo_client = MongoClient(uri)

db = mongo_client['youtube-db']

In [4]:
# select collections for training and test data

train_col = db['training_col']
test_col = db['test_col']

In [183]:
# fetching data from training collection, might take a view seconds...

data = list(train_col.find())

In [184]:
df = pd.DataFrame(data)
df

Unnamed: 0,_id,video_id,title,description,category_id
0,628e25946b047126d6d3794b,FfJ5XG5i2aw,introducing teded lessons worth sharing,tededs mission is to capture and amplify the v...,1
1,628e25946b047126d6d3794c,Oi2_qJumnDo,teded lessons worth sharing,,0
2,628e25946b047126d6d3794d,2W85Dwxx218,why do we dream amy adkins,view full lesson httpedtedcomlessonswhydowedre...,1
3,628e25946b047126d6d3794e,LaLvVc1sS20,the history of tea shunan teng,view full lesson httpedtedcomlessonsthehistory...,1
4,628e25946b047126d6d3794f,MMmOLN5zBLY,the benefits of a bilingual brain mia nacamulli,check out our patreon page httpswwwpatreoncomt...,1
...,...,...,...,...,...
40545,628e35eed4f92320472d5045,5iyn2q6s1Sk,ever given update 28 march operation backtwist,these 4 great youtube sources helped me better...,1
40546,628e35eed4f92320472d5046,BSmb725eu7s,switching credit cards w alisha marie for 24 h...,✂ vlog channel httpswwwyoutubecomlaurdiyvlogs✂...,0
40547,628e35eed4f92320472d5047,xLh1676KRaE,stranded ever given back afloat in suez canal ...,the ever given has been “successfully refloate...,1
40548,628e35eed4f92320472d5048,EIQL7fZKcEI,switching credit cards w laurdiy no limit,lauren and i decided to switch credit cards we...,0


# Building and testing ML models

In [210]:
# splitting data into training and test data

X = df['title'] + " " + df['description']
y = df['category_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

# loading new random data

test_data = list(test_col.find())

test_df = pd.DataFrame(test_data)

X_new = test_df['title'] + " " + test_df['description']
y_new = test_df['category_id']

# SGD Classifier

In [186]:
# Building ML Pipeline

sgd_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=1000, tol=None)),
    ])

In [187]:
sgd_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf', SGDClassifier(alpha=0.001, random_state=42, tol=None))])

In [188]:
# Evaluating metrics on test data

predicted = sgd_clf.predict(X_test)

print("Metrics on test data from split:")

print(metrics.classification_report(predicted, y_test))

# Evaluating clf against completly new random data

predicted_new = sgd_clf.predict(X_new)

print("Metrics on new random data:")

print(metrics.classification_report(predicted_new, y_new))

Metrics on test data from split:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95      7970
           1       0.15      0.99      0.27       140

    accuracy                           0.91      8110
   macro avg       0.58      0.95      0.61      8110
weighted avg       0.99      0.91      0.94      8110

Metrics on new random data:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96      4948
           1       0.13      1.00      0.23        52

    accuracy                           0.93      5000
   macro avg       0.57      0.97      0.60      5000
weighted avg       0.99      0.93      0.96      5000



# Random Forest Classifier

In [204]:
# Building ML Pipeline

rf_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', RandomForestClassifier()),
    ])

In [205]:
rf_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf', RandomForestClassifier())])

In [206]:
# Evaluating metrics on test data

predicted = rf_clf.predict(X_test)

print(metrics.classification_report(predicted, y_test))

# Evaluating clf against completly new random data

predicted_new = rf_clf.predict(X_new)

print("Metrics on new random data:")

print(metrics.classification_report(predicted_new, y_new))

print(metrics.confusion_matrix(predicted_new, y_new))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7185
           1       0.98      1.00      0.99       925

    accuracy                           1.00      8110
   macro avg       0.99      1.00      0.99      8110
weighted avg       1.00      1.00      1.00      8110

Metrics on new random data:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      4743
           1       0.63      0.98      0.77       257

    accuracy                           0.97      5000
   macro avg       0.81      0.97      0.88      5000
weighted avg       0.98      0.97      0.97      5000

[[4596  147]
 [   6  251]]


# Multi Layer Perceptron Classifier

In [212]:
# Building ML Pipeline

mlp_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', MLPClassifier(verbose=True))
    ])

In [215]:
mlp_clf.fit(X_train, y_train)



Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf', MLPClassifier())])

In [217]:
# Evaluating metrics on test data

predicted = mlp_clf.predict(X_test)

print(metrics.classification_report(predicted, y_test))

# Evaluating clf against completly new random data

predicted_new = mlp_clf.predict(X_new)

print("Metrics on new random data:")

print(metrics.classification_report(predicted_new, y_new))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7176
           1       0.99      1.00      0.99       934

    accuracy                           1.00      8110
   macro avg       0.99      1.00      1.00      8110
weighted avg       1.00      1.00      1.00      8110

Metrics on new random data:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      9366
           1       0.71      0.97      0.82       634

    accuracy                           0.97     10000
   macro avg       0.85      0.97      0.90     10000
weighted avg       0.98      0.97      0.97     10000



In [None]:
analysis = test_df[test_df["category_id"] == 1] 
analysis[100:150]

# <mark>TODO</mark>: Grid Search for optimal params