<a href="https://colab.research.google.com/github/GWsandip/YT-spam-diction-/blob/main/YT_spam_diction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up environment

In [None]:
# Snorkel Tutorial: https://www.snorkel.org/use-cases/01-spam-tutorial#-snorkel-intro-tutorial-data-labeling

In [None]:
! pip install snorkel
! pip install utils    # Grab-bag of utility functions and objects

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snorkel
  Downloading snorkel-0.9.9-py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting munkres>=1.0.6
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.9.9
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting utils
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1


In [None]:
# Clone the snorkeltutorials repository (it provides the spam dataset loader
# imported in the "Loading Dataset" section) and switch the working directory into it.
! git clone https://github.com/skillcate/snorkeltutorials.git
%cd /content/snorkeltutorials

Cloning into 'snorkeltutorials'...
remote: Enumerating objects: 1366, done.[K
remote: Counting objects: 100% (307/307), done.[K
remote: Compressing objects: 100% (124/124), done.[K
remote: Total 1366 (delta 150), reused 264 (delta 134), pack-reused 1059[K
Receiving objects: 100% (1366/1366), 3.56 MiB | 4.24 MiB/s, done.
Resolving deltas: 100% (865/865), done.
/content/snorkeltutorials


# Loading Dataset

In [None]:
# Fetch the demo spam dataset through the tutorial repository's loader
from snorkeltutorials.spam.utils import load_spam_dataset

df_train, df_test = load_spam_dataset()

# Keep the test-set labels as a standalone array for the scoring cells
Y_test = df_test["label"].values

In [None]:
df_train.head(5)

Unnamed: 0,author,date,text,label,video
0,Alessandro leite,2014-11-05T22:21:36,pls http://www10.vakinha.com.br/VaquinhaE.aspx...,-1.0,1
1,Salim Tayara,2014-11-02T14:33:30,"if your like drones, plz subscribe to Kamal Ta...",-1.0,1
2,Phuc Ly,2014-01-20T15:27:47,go here to check the views :3﻿,-1.0,1
3,DropShotSk8r,2014-01-19T04:27:18,"Came here to check the views, goodbye.﻿",-1.0,1
4,css403,2014-11-07T14:25:48,"i am 2,126,492,636 viewer :D﻿",-1.0,1


In [None]:
df_train.shape

(1586, 5)

In [None]:
df_train['video'].value_counts()

4    448
3    438
1    350
2    350
Name: video, dtype: int64

In [None]:
df_test.head(5)

Unnamed: 0,author,date,text,label,video
27,‫حلم الشباب‬‎,2015-05-25T23:42:49.533000,Check out this video on YouTube:﻿,1,5
194,MOHAMED THASLEEM,2015-05-24T07:03:59.488000,super music﻿,0,5
277,AlabaGames,2015-05-22T00:31:43.922000,Subscribe my channel I RECORDING FIFA 15 GOAL...,1,5
132,Manish Ray,2015-05-23T08:55:07.512000,This song is so beauty,0,5
163,Sudheer Yadav,2015-05-28T10:28:25.133000,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,1,5


In [None]:
df_test.shape

(250, 5)

# Snorkel Weak Labelling

## Define Labels

In [None]:
# Integer codes returned by every labeling function in this notebook.
# ABSTAIN means "no vote" for a data point; HAM (legit) and SPAM are the two real classes.
ABSTAIN = -1
HAM = 0
SPAM = 1

In [None]:
# Exploring the training set for initial ideas
df_train[["author", "text", "video"]].sample(20, random_state=2)

Unnamed: 0,author,text,video
4,ambareesh nimkar,"""eye of the tiger"" ""i am the champion"" seems l...",2
87,pratik patel,"mindblowing dance.,.,.superbbb song﻿",3
14,RaMpAgE420,Check out Berzerk video on my channel ! :D,4
80,Jason Haddad,"Hey, check out my new website!! This site is a...",1
104,austin green,Eminem is my insperasen and fav﻿,4
305,M.E.S,hey guys look im aware im spamming and it piss...,4
22,John Monster,Οh my god ... Roar is the most liked video at ...,2
338,Alanoud Alsaleh,I started hating Katy Perry after finding out ...,2
336,Leonardo Baptista,http://www.avaaz.org/po/petition/Youtube_Corpo...,1
143,UKz DoleSnacher,Remove This video its wank﻿,1


## Define labeling functions

In [None]:
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.analysis import get_label_buckets

In [None]:
# LF that flags spammy comments containing the phrase "check out"
import re
@labeling_function()
def regex_check_out(x):
    """Vote SPAM when the text matches ``check.*out`` (case-insensitive); otherwise abstain."""
    if re.search(r"check.*out", x.text, flags=re.I):
        return SPAM
    return ABSTAIN

In [None]:
# Preprocessor for the sentiment LFs - relies on a third-party model (TextBlob)
from snorkel.preprocess import preprocessor
from textblob import TextBlob  # was missing: TextBlob is used below but was never imported

@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Annotate a data point with TextBlob sentiment scores.

    Sets ``x.polarity`` and ``x.subjectivity`` from ``TextBlob(x.text)`` and
    returns the same object, so labeling functions that list this
    preprocessor in ``pre=[...]`` can read those attributes.
    """
    scores = TextBlob(x.text)
    x.polarity = scores.polarity
    x.subjectivity = scores.subjectivity
    return x

In [None]:
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    """Vote HAM when the precomputed TextBlob polarity is above 0.9; otherwise abstain."""
    if x.polarity > 0.9:
        return HAM
    return ABSTAIN

In [None]:
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    """Vote HAM when the precomputed TextBlob subjectivity is at least 0.5; otherwise abstain."""
    if x.subjectivity >= 0.5:
        return HAM
    return ABSTAIN

In [None]:
from snorkel.labeling import LabelingFunction

def keyword_lookup(x, keywords, label):
    """Return ``label`` if any keyword occurs in the lower-cased comment text.

    Args:
        x: Data point exposing a ``text`` string attribute.
        keywords: Iterable of substrings to look for. They are compared
            against the lower-cased text as-is, so they should be lower-case.
        label: Label to emit when a keyword is found.

    Returns:
        ``label`` when at least one keyword is a substring of
        ``x.text.lower()``, otherwise ``ABSTAIN``.
    """
    # Lower-case the text once instead of once per keyword.
    text = x.text.lower()
    if any(word in text for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SPAM):
    """Build a keyword-matching LabelingFunction named after the first keyword.

    The returned LF calls ``keyword_lookup`` with ``keywords`` and ``label``
    supplied through its ``resources``.
    """
    lf_name = f"keyword_{keywords[0]}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)


# Spam comments talk about 'my channel', 'my video', etc.
keyword_my = make_keyword_lf(keywords=["my"])

# Spam comments ask users to subscribe to their channels.
keyword_subscribe = make_keyword_lf(keywords=["subscribe"])

# Spam comments post links to other channels.
keyword_link = make_keyword_lf(keywords=["http"])

# Spam comments make requests rather than commenting.
keyword_please = make_keyword_lf(keywords=["please", "plz"])

# Ham comments actually talk about the video's content.
keyword_song = make_keyword_lf(keywords=["song"], label=HAM)

In [None]:
@labeling_function()
def short_comment(x):
    """Vote HAM for comments with fewer than five whitespace-separated words (e.g. 'cool video!'); otherwise abstain."""
    word_count = len(x.text.split())
    if word_count < 5:
        return HAM
    return ABSTAIN

In [None]:
from snorkel.labeling.lf.nlp import nlp_labeling_function

@nlp_labeling_function()
def has_person_nlp(x):
    """Vote HAM when ``len(x.doc) < 20`` and some entity in ``x.doc.ents`` is labeled PERSON; otherwise abstain."""
    mentions_person = len(x.doc) < 20 and any(
        ent.label_ == "PERSON" for ent in x.doc.ents
    )
    return HAM if mentions_person else ABSTAIN

## Generate labels by Applying LFs

In [None]:
# Every labeling function defined above; this list is passed to both the
# applier and LFAnalysis.
lfs = [
    keyword_my,
    keyword_subscribe,
    keyword_link,
    keyword_please,
    keyword_song,
    regex_check_out,
    short_comment,
    has_person_nlp,
    textblob_polarity,
    textblob_subjectivity
]

In [None]:
# Run all LFs over the train and test DataFrames to obtain their label matrices.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

100%|██████████| 1586/1586 [00:21<00:00, 75.41it/s]
100%|██████████| 250/250 [00:03<00:00, 71.86it/s]


In [None]:
# Per-LF summary statistics on the training label matrix (polarity, coverage, overlaps, conflicts).
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_my,0,[1],0.198613,0.186003,0.110971
keyword_subscribe,1,[1],0.127364,0.107188,0.067465
keyword_http,2,[1],0.119168,0.101513,0.082598
keyword_please,3,[1],0.112232,0.10971,0.057377
keyword_song,4,[0],0.141866,0.111602,0.043506
regex_check_out,5,[1],0.233922,0.129256,0.083859
short_comment,6,[0],0.225725,0.144388,0.074401
has_person_nlp,7,[0],0.083859,0.062421,0.027743
textblob_polarity,8,[0],0.035309,0.030895,0.005044
textblob_subjectivity,9,[0],0.357503,0.261665,0.160151


# Combine LF Outputs with Label Model



### Majority-Vote-based Model

In [None]:
# Our goal is now to convert the labels from our LFs into
# a single noise-aware probabilistic (or confidence-weighted)
# label per data point

In [None]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
# Majority vote across LFs for every training row. This assignment was
# missing, so displaying `preds_train` raised NameError on a fresh kernel.
preds_train = majority_model.predict(L=L_train)
preds_train

array([ 1,  1, -1, ...,  1,  1,  1])

In [None]:
# Our LFs have varying properties and should not be treated identically.
# LFs may be correlated, resulting in certain signals being overrepresented in a majority-vote-based model.
# To handle this, we use a more sophisticated Snorkel LabelModel to combine LF outputs.

# This model will ultimately produce a single set of noise-aware training labels,
# which are probabilistic or confidence-weighted labels

### Confidence-weighted Label Model

In [None]:
from snorkel.labeling.model import LabelModel

# Two classes (HAM / SPAM); the seed is fixed so the fit is repeatable.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# Per-row class probabilities for the training set; shown as the cell output.
probs_train = label_model.predict_proba(L=L_train)
probs_train

100%|██████████| 500/500 [00:00<00:00, 838.65epoch/s]


array([[6.57400061e-01, 3.42599939e-01],
       [2.25180862e-06, 9.99997748e-01],
       [5.00000000e-01, 5.00000000e-01],
       ...,
       [2.27481634e-07, 9.99999773e-01],
       [1.08918560e-03, 9.98910814e-01],
       [1.22570277e-08, 9.99999988e-01]])

In [None]:
# Accuracy of the majority-vote baseline on the test label matrix
majority_metrics = majority_model.score(
    L=L_test, Y=Y_test, tie_break_policy="random"
)
majority_acc = majority_metrics["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# Accuracy of the fitted label model on the same test label matrix
label_model_metrics = label_model.score(
    L=L_test, Y=Y_test, tie_break_policy="random"
)
label_model_acc = label_model_metrics["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   84.4%
Label Model Accuracy:     87.6%


## Filtering out unlabeled data points

In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

# Drop training rows on which every LF abstained. The import was previously
# never called, which left `df_train_filtered` and `probs_train_filtered`
# (used by the classifier cells) undefined on a fresh kernel.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

# Training a Classifier

## Feature Representation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count features over 1- to 5-grams. The vocabulary is fitted on the filtered
# training text only, then reused to encode the test text.
vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
X_test = vectorizer.transform(df_test.text.tolist())

## Scikit-learn Classifier

In [None]:
from snorkel.utils import probs_to_preds
# Convert the label model's probabilities into hard labels to use as training targets.
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the n-gram features against the weak labels
# derived from the label model (not ground-truth labels).
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

In [None]:
# Score the classifier on the test features against the test-set labels.
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

Test Accuracy: 93.6%


# Real Predictions

In [None]:
import pandas as pd

In [None]:
# Two unseen example comments to classify
new_review = ['check out my channel. it is the best',
              'your channel is the best']
review_frame = pd.DataFrame({'review': new_review})
# Encode with the vocabulary fitted during training, then predict
df = vectorizer.transform(review_frame['review'])
results = sklearn_model.predict(df)

In [None]:
# Report each prediction, numbering the reviews from 1.
for i, item in enumerate(results, start=1):
    if item == HAM:
        print(f'Review#{i} is ham')
    else:
        print(f'Review#{i} is spam')

Review#1 is spam
Review#2 is ham
