In [2]:
import re

import numpy as np
import pandas as pd
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import labeling_function
from snorkel.labeling.lf.nlp import nlp_labeling_function
from snorkel.preprocess import preprocessor
from snorkel.preprocess.nlp import SpacyPreprocessor
from textblob import TextBlob

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Label vocabulary shared by every labeling function below:
# ABSTAIN is the "no vote" value; HAM and SPAM are the two classes.
ABSTAIN, HAM, SPAM = -1, 0, 1

@labeling_function()
def check(x):
    """Vote SPAM when the comment text contains "check" (case-insensitive), else abstain."""
    if "check" in x.text.lower():
        return SPAM
    return ABSTAIN


@labeling_function()
def check_out(x):
    """Vote SPAM when the comment text contains "check out" (case-insensitive), else abstain."""
    lowered = x.text.lower()
    if "check out" in lowered:
        return SPAM
    return ABSTAIN

@labeling_function()
def regex_check_out(x):
    """Vote SPAM when "check" is followed, anywhere later on the same line, by "out".

    The match is case-insensitive. Because `.` does not cross newlines, both
    words must appear on one line of the comment. Abstains when nothing matches.
    """
    found = re.search(r"check.*out", x.text, flags=re.I)
    if found:
        return SPAM
    return ABSTAIN

'''============================================================================================='''

@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Attach TextBlob sentiment scores to the data point.

    Sets `x.polarity` and `x.subjectivity` from TextBlob's sentiment of
    `x.text` and returns `x`, so labeling functions that list this
    preprocessor in `pre=[...]` can read both attributes.
    """
    sentiment = TextBlob(x.text).sentiment
    x.polarity = sentiment.polarity
    x.subjectivity = sentiment.subjectivity
    return x

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    """Vote HAM when the comment's polarity is strictly above 0.9, else abstain."""
    if x.polarity > 0.9:
        return HAM
    return ABSTAIN

@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    """Vote HAM when the comment's subjectivity is at least 0.5, else abstain."""
    if x.subjectivity >= 0.5:
        return HAM
    return ABSTAIN

'''============================================================================================='''

# @nlp_labeling_function()
# def has_person_nlp(x):
#     """Ham comments mention specific people and are short."""
#     if len(x.doc) < 20 and any([ent.label_ == "PERSON" for ent in x.doc.ents]):
#         return HAM
#     else:
#         return ABSTAIN

'''============================================================================================='''    

def keyword_lookup(x, keywords, label):
    """Return `label` if any keyword occurs in the comment text, else ABSTAIN.

    Args:
        x: Data point exposing a string `text` attribute.
        keywords: Iterable of keywords. They are compared against the
            lower-cased text, so they should be given in lower case.
        label: Value to return when at least one keyword is found.

    Matching is plain substring containment, not whole-word matching, so a
    keyword such as "my" also fires on "army".
    """
    # Lower-case once: the text does not change between keywords.
    text = x.text.lower()
    if any(word in text for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SPAM):
    """Build a keyword-based LabelingFunction.

    Args:
        keywords: Non-empty sequence of keywords; the first one is used to
            name the labeling function ("keyword_<first keyword>").
        label: Label the function votes for when a keyword is found
            (defaults to SPAM).

    Returns:
        A LabelingFunction that calls `keyword_lookup` with these keywords
        and this label bound as resources.
    """
    lf_name = f"keyword_{keywords[0]}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)


"""Spam comments talk about 'my channel', 'my video', etc."""
keyword_my = make_keyword_lf(keywords=["my"])

"""Spam comments ask users to subscribe to their channels."""
keyword_subscribe = make_keyword_lf(keywords=["subscribe"])

"""Spam comments post links to other channels."""
keyword_link = make_keyword_lf(keywords=["http"])

"""Spam comments make requests rather than commenting."""
keyword_please = make_keyword_lf(keywords=["please", "plz"])

"""Ham comments actually talk about the video's content."""
keyword_song = make_keyword_lf(keywords=["song"], label=HAM)

@labeling_function()
def short_comment(x):
    """Vote HAM for comments with fewer than five whitespace-separated tokens.

    Ham comments are often short, such as 'cool video!'. Longer comments
    get an abstain.
    """
    n_tokens = len(x.text.split())
    if n_tokens < 5:
        return HAM
    return ABSTAIN

# Labeling functions handed to the applier. The position of each entry is
# load-bearing: it matches the `j` index shown in the LFAnalysis summary,
# so do not reorder without regenerating the saved label matrices.
# `check` and `check_out` are defined above but intentionally not listed here.
lfs = [
    keyword_my,
    keyword_subscribe,
    keyword_link,
    keyword_please,
    keyword_song,
    regex_check_out,
    short_comment,
#     has_person_nlp,  (disabled: its definition is commented out above)
    textblob_polarity,
    textblob_subjectivity,
]

In [4]:
# Applies every labeling function in `lfs` to the rows of a pandas DataFrame.
# PandasLFApplier is imported in the notebook's import cell.
applier = PandasLFApplier(lfs=lfs)

In [5]:
DIR = 'data/YouTube-Spam-Collection-v1'


def _label_split(split):
    """Load one data split, apply the labeling functions, and save the result.

    Reads `<DIR>/<split>_data.csv`, runs `applier` over it, and writes the
    resulting label matrix to `<DIR>/<split>_L.npy`.

    Args:
        split: Split prefix used in the file names (e.g. "U", "L", "V", "test").

    Returns:
        Tuple `(df, label_matrix)` for the split.
    """
    df = pd.read_csv(f"{DIR}/{split}_data.csv")
    label_matrix = applier.apply(df=df)
    with open(f"{DIR}/{split}_L.npy", 'wb') as f:
        np.save(f, label_matrix)
    return df, label_matrix


# Same processing order as before: U, L, V, then test.
df_U, U_L = _label_split("U")
df_L, L_L = _label_split("L")
df_V, V_L = _label_split("V")
df_test, test_L = _label_split("test")

100%|██████████| 1586/1586 [00:01<00:00, 973.17it/s] 
100%|██████████| 100/100 [00:00<00:00, 8414.36it/s]
100%|██████████| 198/198 [00:00<00:00, 6772.56it/s]
100%|██████████| 250/250 [00:00<00:00, 6149.65it/s]


In [6]:
# Display the label matrix produced by `applier.apply` for the U split.
# In the output below, -1 is ABSTAIN, 0 is HAM and 1 is SPAM.
U_L

array([[-1, -1, -1, ..., -1, -1, -1],
       [ 1,  1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1,  1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ...,  0,  0,  0]])

In [8]:
# Per-LF summary (polarity, coverage, overlaps, conflicts) on the U split.
# LFAnalysis is imported in the notebook's import cell.
LFAnalysis(L=U_L, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_my,0,[1],0.198613,0.184111,0.106557
keyword_subscribe,1,[1],0.127364,0.106557,0.066204
keyword_http,2,[1],0.119168,0.09773,0.078184
keyword_please,3,[1],0.112232,0.109079,0.055485
keyword_song,4,[0],0.141866,0.108449,0.043506
regex_check_out,5,[1],0.233922,0.127364,0.080076
short_comment,6,[0],0.225725,0.137453,0.074401
textblob_polarity,7,[0],0.035309,0.030265,0.005044
textblob_subjectivity,8,[0],0.357503,0.24338,0.160151


array([1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 2, 0, 1,
       1, 2, 2, 1, 4, 1, 0, 1, 1, 1, 3, 1, 2, 1, 2, 1, 3, 2, 1, 3, 3, 2,
       3, 2, 0, 3, 4, 1, 4, 1, 1, 0, 1, 0, 0, 0, 2, 1, 1, 2, 1, 3, 1, 2,
       0, 2, 4, 3, 1, 0, 2, 1, 2, 3, 1, 1, 1, 2, 4, 1, 1, 1, 3, 3, 2, 0,
       1, 1, 1, 1, 2, 1, 1, 0, 1, 2, 2, 0])