# Detecting Youtube Comment Spam

## Load Libraries

In [111]:
# import necessary libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import ComplementNB


## Load Data

In [112]:
data = pd.read_csv("Youtube01.csv") 
print(data.head(5))


                                    COMMENT_ID            AUTHOR  \
0  LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   
1  LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   
2  LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   
3          z13jhp0bxqncu512g22wvzkasxmvvzjaz04   ElNino Melendez   
4          z13fwbwp1oujthgqj04chlngpvzmtt3r3dw            GsMega   

                  DATE                                            CONTENT  \
0  2013-11-07T06:20:48  Huh, anyway check out this you[tube] channel: ...   
1  2013-11-07T12:37:15  Hey guys check out my new channel and our firs...   
2  2013-11-08T17:34:21             just for test I have to say murdev.com   
3  2013-11-09T08:28:43   me shaking my sexy ass on my channel enjoy ^_^ ﻿   
4  2013-11-10T16:05:38            watch?v=vtaRGgvGtWQ   Check this out .﻿   

   CLASS  
0      1  
1      1  
2      1  
3      1  
4      1  


## Check for null values

In [113]:
print(data.isnull().sum())

COMMENT_ID      0
AUTHOR          0
DATE          245
CONTENT         0
CLASS           0
dtype: int64


In [114]:
# We only require Content and Class Values

data = data[['CONTENT','CLASS']]

In [115]:
# Map 0 to not spam and 1 to span in class column
data["CLASS"] = data['CLASS'].map({0:'NOT A SPAM COMMENT', 1: 'SPAM COMMENT'})

## Print new sample data

In [116]:
print(data.sample(5))

                                                CONTENT               CLASS
1847                                     She is perfect  NOT A SPAM COMMENT
1454                        Is that girl is Megan fox ﻿  NOT A SPAM COMMENT
1264                 Love the way you lie - Driveshaft﻿  NOT A SPAM COMMENT
465   Katy Perry is garbage. Rihanna is the best sin...  NOT A SPAM COMMENT
360   Hey Guys this is Glamour Beauty! I just starte...        SPAM COMMENT


## Algorithm Choice
> As the output of this problem will either be 0 or 1,i.e, the problem of binary classification,
we can use the Bernoulli Naive Bayes algorithm to train the model:

## Prepare data for training and testing

In [117]:

x = np.array(data['CONTENT'])
y = np.array(data['CLASS'])

cv = CountVectorizer()
x = cv.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.2,random_state= 42)


## MODEL

In [118]:
model = ComplementNB()
model.fit(xtrain.toarray(),ytrain)

#print other metrics
ypred = model.predict(xtest.toarray())
print(classification_report(ytest,ypred))
print(confusion_matrix(ytest,ypred))

                    precision    recall  f1-score   support

NOT A SPAM COMMENT       0.95      0.83      0.89       743
      SPAM COMMENT       0.86      0.96      0.91       822

          accuracy                           0.90      1565
         macro avg       0.91      0.90      0.90      1565
      weighted avg       0.91      0.90      0.90      1565

[[617 126]
 [ 30 792]]


## Inference

In [120]:
s1 = 'Love your content'
s2 = 'Check out my website from www.myweb.com'
d1 = cv.transform([s1]).toarray()
d2 = cv.transform([s2]).toarray()
print(f"{s1} - {model.predict(d1)[0]}")
print(f"{s2} - {model.predict(d2)[0]}")

Love your content - NOT A SPAM COMMENT
Check out my website from www.myweb.com - SPAM COMMENT
