##### Set Working Directory

In [95]:
import os
# Show the directory the kernel is currently running in.
os.getcwd()

'C:\\Users\\thars\\Downloads'

In [96]:
# Work from the current user's Downloads folder. The path is built from the
# home directory instead of a hard-coded "C:\Users\<name>" prefix so the
# notebook is not tied to a single account or machine.
os.chdir(os.path.join(os.path.expanduser("~"), "Downloads"))

In [97]:
# Confirm the working directory after the os.chdir call above.
os.getcwd()

'C:\\Users\\thars\\Downloads'

##### Import Necessary Libraries

In [98]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import BernoulliNB

In [99]:
# Load the YouTube comment dataset. The file name is given relative to the
# working directory configured in the cells above, rather than as an absolute
# path that embeds one user's home folder.
data = pd.read_csv("Youtube01-Psy.csv")
# Preview five random rows to get a feel for the columns.
print(data.sample(5))

                                COMMENT_ID               AUTHOR  \
162      z13dvrmxorf0cnj0423rsfcjlxevgjwll              ZodexHD   
85   z13wzt5yezvhsboz104cjlkqalz0fpcglmk0k         Alessio Siri   
261  z13shj4wpmflidcxc04ce5f4vlqdyzjowso0k  Tornike Noniashvili   
179  z12yu11xyumitzpho04cix1y3zn3ijrpcm40k      Chinsoman Films   
100    z13juvrhisuzsfczo04cgnsxhlfdiz2rhho       Imprezzi Vidzz   

                    DATE                                            CONTENT  \
162  2014-11-06T17:50:59  look at my channel i make minecraft pe lets pl...   
85   2014-11-03T16:43:36  PSY - GANGNAM STYLE (강남스타일) M/V: http://youtu....   
261  2014-11-08T04:08:09                               subscribe my chanel﻿   
179  2014-11-07T01:22:02                            Please subscribe to me﻿   
100  2014-11-04T03:12:23  My videos are half way decent, check them out ...   

     CLASS  
162      1  
85       0  
261      1  
179      1  
100      1  


In [100]:
# (rows, columns) of the loaded dataset.
data.shape

(350, 5)

In [101]:
# Column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   COMMENT_ID  350 non-null    object
 1   AUTHOR      350 non-null    object
 2   DATE        350 non-null    object
 3   CONTENT     350 non-null    object
 4   CLASS       350 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 13.8+ KB


In [102]:
# Number of missing values in each column.
data.isna().sum()

COMMENT_ID    0
AUTHOR        0
DATE          0
CONTENT       0
CLASS         0
dtype: int64

In [103]:
# Only the CONTENT and CLASS columns are needed for the rest of the task, so
# keep just those two. The explicit .copy() makes the result an independent
# frame: the next cell assigns into its CLASS column, and writing into a bare
# column selection is what triggers pandas' SettingWithCopyWarning.
data = data[["CONTENT", "CLASS"]].copy()
print(data.sample(5))

                                               CONTENT  CLASS
204    The population of world is more than 7 billion﻿      0
79   Hi there~I'm group leader of Angel, a rookie K...      1
5    Hey, check out my new website!! This site is a...      1
157              Follow me on Twitter @mscalifornia95﻿      1
288  if i reach 100 subscribers i will go round in ...      1


In [104]:
# The CLASS column holds 0 (not spam) and 1 (spam). Swap the codes for readable
# labels so the model's predictions are self-explanatory.
#
# assign() builds a new frame instead of writing into one derived from a column
# selection (avoids SettingWithCopyWarning), and replace() leaves values that
# are already labels untouched, so re-running this cell does not turn the
# column into NaN the way a second .map() call would.
class_labels = {0: "Not Spam", 1: "Spam Comment"}
data = data.assign(CLASS=data["CLASS"].replace(class_labels))
print(data.sample(5))

                                               CONTENT         CLASS
48                      go here to check the views :3﻿      Not Spam
250  why are they 5million comments when there is o...      Not Spam
15   Hello! Do you like gaming, art videos, scienti...  Spam Comment
184                                        OPPA &lt;3﻿      Not Spam
343   Something to dance to, even if your sad JUST ...      Not Spam


##### Training a Classification Model

In [108]:
# Raw comment text and its label, as NumPy arrays.
x = np.array(data["CONTENT"])
y = np.array(data["CLASS"])

# Split the raw text BEFORE vectorising. Fitting the vectoriser on the whole
# corpus would let the vocabulary be learned from the held-out comments too,
# which leaks test information into training and inflates the score.
# test_size and random_state are unchanged, so the same rows land in each split.
xtrain_text, xtest_text, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the vocabulary from the training comments only; the test comments are
# merely transformed, so words seen only in the test set are ignored.
cv = CountVectorizer()
xtrain = cv.fit_transform(xtrain_text)
xtest = cv.transform(xtest_text)

# Bernoulli naive Bayes models word presence/absence per comment.
model = BernoulliNB()
model.fit(xtrain, ytrain)

# Accuracy on the held-out 20%. This may differ from scores recorded when the
# vocabulary was fitted on the full dataset.
print(model.score(xtest, ytest))


0.9857142857142858


##### Test the Model

In [110]:
# Now let's test the model on a comment that looks like spam (self-promotion
# with a link). The vectorised sample gets its own name so the `data` DataFrame
# is not overwritten with an array, which would break re-running earlier cells.
# predict() accepts the sparse matrix directly, so no .toarray() is needed.
sample = "check this out https//:thefarazgill.com"
sample_features = cv.transform([sample])
print(model.predict(sample_features))

['Spam Comment']


In [115]:
# A plain remark with no promotion or link, expected to be classified as not
# spam. The features are kept in their own variable so the `data` DataFrame is
# not replaced by an array; predict() accepts the sparse matrix as-is.
sample = "Lack of information!"
sample_features = cv.transform([sample])
print(model.predict(sample_features))

['Not Spam']
