In [12]:
import re
import pandas as pd
from textblob import TextBlob
from snorkel.preprocess import preprocessor
from snorkel.labeling import LabelingFunction
from snorkel.labeling import labeling_function
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling.lf.nlp import nlp_labeling_function

In [13]:
# Label vocabulary returned by the labeling functions in this notebook.
HAM = 0       # vote: not spam
SPAM = 1      # vote: spam
ABSTAIN = -1  # no vote

@labeling_function()
def check(x):
    """Vote SPAM when the lowercased text contains "check"; otherwise abstain."""
    if "check" in x.text.lower():
        return SPAM
    return ABSTAIN


@labeling_function()
def check_out(x):
    """Vote SPAM when the lowercased text contains "check out"; otherwise abstain."""
    if "check out" in x.text.lower():
        return SPAM
    return ABSTAIN

@labeling_function()
def regex_check_out(x):
    """Vote SPAM when "check" is followed anywhere later on the same line by "out".

    The search is case-insensitive; any run of non-newline characters
    (including none) may sit between the two words.
    """
    matched = re.search(r"check.*out", x.text, flags=re.I)
    if matched is None:
        return ABSTAIN
    return SPAM

'''============================================================================================='''

@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Annotate a data point with TextBlob sentiment scores.

    Sets ``x.polarity`` and ``x.subjectivity`` from the sentiment of
    ``x.text`` and returns the same object, so labeling functions that list
    this preprocessor in ``pre=[...]`` can read both attributes.
    """
    sentiment = TextBlob(x.text).sentiment
    x.polarity = sentiment.polarity
    x.subjectivity = sentiment.subjectivity
    return x

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    """Vote HAM when the polarity set by ``textblob_sentiment`` exceeds 0.9."""
    if x.polarity > 0.9:
        return HAM
    return ABSTAIN

@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    """Vote HAM when the subjectivity set by ``textblob_sentiment`` is at least 0.5."""
    if x.subjectivity >= 0.5:
        return HAM
    return ABSTAIN

'''============================================================================================='''

# @nlp_labeling_function()
# def has_person_nlp(x):
#     """Ham comments mention specific people and are short."""
#     if len(x.doc) < 20 and any([ent.label_ == "PERSON" for ent in x.doc.ents]):
#         return HAM
#     else:
#         return ABSTAIN

'''============================================================================================='''    

def keyword_lookup(x, keywords, label):
    """Vote ``label`` when any keyword occurs in the lowercased text of ``x``.

    Matching is a plain substring test against ``x.text.lower()``. Keywords
    are compared as given (they are not lowercased here), and a keyword that
    is part of a longer word still counts as a hit.

    Args:
        x: Data point exposing a ``text`` string attribute.
        keywords: Iterable of substrings to look for.
        label: Value returned when at least one keyword is found.

    Returns:
        ``label`` on a hit, otherwise ``ABSTAIN``.
    """
    # Lowercase once up front rather than once per keyword inside the loop.
    lowered = x.text.lower()
    if any(word in lowered for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SPAM):
    """Build a LabelingFunction that runs ``keyword_lookup`` with fixed arguments.

    Args:
        keywords: Sequence of substrings; its first element is used in the
            labeling function's name (``keyword_<first keyword>``).
        label: Value the labeling function returns on a hit (default ``SPAM``).

    Returns:
        A ``LabelingFunction`` wrapping ``keyword_lookup`` with ``keywords``
        and ``label`` supplied as resources.
    """
    lf_name = f"keyword_{keywords[0]}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)


"""Spam comments talk about 'my channel', 'my video', etc."""
keyword_my = make_keyword_lf(keywords=["my"], label=SPAM)

"""Spam comments ask users to subscribe to their channels."""
keyword_subscribe = make_keyword_lf(keywords=["subscribe"], label=SPAM)

"""Spam comments post links to other channels."""
keyword_link = make_keyword_lf(keywords=["http"], label=SPAM)

"""Spam comments make requests rather than commenting."""
keyword_please = make_keyword_lf(keywords=["please", "plz"], label=SPAM)

"""Ham comments actually talk about the video's content."""
keyword_song = make_keyword_lf(keywords=["song"], label=HAM)
# NOTE(review): the notes above are worded for video comments, while a later cell
# loads 'data/smsspamcollection/spam.csv' — confirm these heuristics suit that data.

@labeling_function()
def short_comment(x):
    """Vote HAM for texts with fewer than five whitespace-separated tokens, e.g. 'cool video!'."""
    token_count = len(x.text.split())
    if token_count < 5:
        return HAM
    return ABSTAIN

# Labeling functions handed to the applier. Order matters: the `j` index in the
# LF summary follows the position in this list.
lfs = [
    # keyword heuristics
    keyword_my, keyword_subscribe, keyword_link, keyword_please, keyword_song,
    # pattern and length heuristics
    regex_check_out, short_comment,
#     has_person_nlp,
    # sentiment heuristics
    textblob_polarity, textblob_subjectivity,
]

In [15]:
from snorkel.labeling import PandasLFApplier

# Applier that runs every labeling function in `lfs` over a pandas DataFrame.
applier = PandasLFApplier(lfs)

In [17]:
# Load the training messages, then apply every labeling function to each row.
df_train = pd.read_csv('data/smsspamcollection/spam.csv')
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

100%|██████████| 5572/5572 [00:00<00:00, 10624.64it/s]


In [18]:
# Show the label matrix produced by the applier; -1 entries are ABSTAIN votes.
L_train

array([[-1, -1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1,  0],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1,  0]])

In [19]:
from snorkel.labeling import LFAnalysis

# Per-labeling-function polarity, coverage, overlap and conflict statistics.
lf_analysis = LFAnalysis(L=L_train, lfs=lfs)
lf_analysis.lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_my,0,[1],0.114501,0.058507,0.056174
keyword_subscribe,1,[1],0.005025,0.001795,0.001795
keyword_http,2,[1],0.003948,0.002513,0.001795
keyword_please,3,[1],0.028356,0.012204,0.009871
keyword_song,4,[0],0.002692,0.001077,0.000359
regex_check_out,5,[1],0.001615,0.000718,0.000359
short_comment,6,[0],0.063532,0.024228,0.001436
textblob_polarity,7,[0],0.009512,0.007897,0.001974
textblob_subjectivity,8,[0],0.423905,0.095118,0.066224
