<a href="https://colab.research.google.com/github/AjeetSingh02/Notebooks/blob/master/Snorkel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# !pip install snorkel



In [0]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from sklearn.model_selection import train_test_split
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.preprocess import preprocessor
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling import LabelModel
from sklearn.feature_extraction.text import CountVectorizer
from snorkel.analysis import get_label_buckets
from textblob import TextBlob
import pandas as pd
import numpy as np

In [0]:
# Run this cell to load the dataset into df_train, df_dev, df_valid, df_test.
COMMENT_COLUMNS = ["CONTENT", "CLASS"]
TRAIN_FILES = [
    "/content/Youtube 01-comments Psy.csv",
    "/content/Youtube 04-comments KatyPerry.csv",
    "/content/Youtube 07-comments LMFAO.csv",
    "/content/Youtube 08-comments Eminem.csv",
]

# DataFrame.append was removed in pandas 2.0, so the per-video files are
# concatenated in a single pd.concat call (same row order, fresh 0..n-1 index).
df_train = pd.concat(
    [pd.read_csv(path, sep=",", usecols=COMMENT_COLUMNS) for path in TRAIN_FILES],
    ignore_index=True,
)

# Fixed random_state so the dev set (and every number derived from it) is
# identical on each Restart & Run All.
df_dev = df_train.sample(200, random_state=123).reset_index(drop=True)

# The training frame is used as unlabeled data: drop its gold labels.
# As before, the rows sampled into df_dev are still part of df_train.
df_train = df_train.drop(columns=["CLASS"])

# The held-out video is split 50/50 into validation and test sets.
data = pd.read_csv("/content/Youtube 09-comments Shakira.csv", sep=",", usecols=COMMENT_COLUMNS)
df_valid, df_test = train_test_split(data, test_size=0.5, random_state=42)
# Rebind instead of inplace=True to avoid mutating slices returned by the split.
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [0]:
# Gold label vectors for the splits that still carry a CLASS column.
Y_dev, Y_valid, Y_test = (
    frame.CLASS.values for frame in (df_dev, df_valid, df_test)
)

In [0]:
# Integer codes used by every labeling function: abstain, ham, spam.
ABSTAIN, HAM, SPAM = -1, 0, 1

# Share of dev comments whose gold label is SPAM, as a percentage.
dev_spam_pct = 100 * (df_dev.CLASS.values == SPAM).mean()
print(f"Dev SPAM frequency: {dev_spam_pct:.1f}%")

Dev SPAM frequency: 54.5%


In [0]:
df_dev.head()

Unnamed: 0,CONTENT,CLASS
0,Check out this video on YouTube:﻿,1
1,I'm here to check the views.. holy shit﻿,0
2,Love﻿,0
3,"2 billion views, only 2 million shares﻿",0
4,Check out our Channel for nice Beats!!﻿,1


In [0]:
# Writing LFs to identify spammy comments that use the phrase “check out” and "check"
@labeling_function()
def check(x):
    """Vote SPAM when the lowercased comment contains "check"; otherwise abstain."""
    if "check" in x.CONTENT.lower():
        return SPAM
    return ABSTAIN


@labeling_function()
def check_out(x):
    """Vote SPAM when the lowercased comment contains "check out"; otherwise abstain."""
    if "check out" in x.CONTENT.lower():
        return SPAM
    return ABSTAIN

In [0]:
# applying one or more LFs that we’ve written to a collection of data points
# The output of the apply(...) method is a label matrix
# It’s a NumPy array L with one column for each LF and one row for each data point

lfs = [check_out, check]

applier = PandasLFApplier(lfs=lfs)
# One label matrix per frame; columns follow the order of `lfs`.
L_train, L_dev = (applier.apply(df=frame) for frame in (df_train, df_dev))

100%|██████████| 1586/1586 [00:00<00:00, 22406.47it/s]
100%|██████████| 200/200 [00:00<00:00, 5904.44it/s]


In [0]:
L_train

array([[ 1,  1],
       [ 1,  1],
       [-1, -1],
       ...,
       [-1, -1],
       [-1, -1],
       [-1, -1]])

In [0]:
df_train.CONTENT[0]

'Huh, anyway check out this you[tube] channel: kobyoshi02'

In [0]:
# Coverage of these LFs 

# Fraction of training rows on which each LF did not abstain (one value per column).
lf_coverage = (L_train != ABSTAIN).mean(axis=0)
coverage_check_out, coverage_check = lf_coverage
print(f"check_out coverage: {coverage_check_out * 100:.1f}%")
print(f"check coverage: {coverage_check * 100:.1f}%")

check_out coverage: 21.4%
check coverage: 25.8%


In [0]:
# LF analyses using the LFAnalysis utility.
#
# Columns of the summary table:
#   Polarity:           the set of unique labels this LF outputs (excluding abstains)
#   Coverage:           the fraction of the dataset the LF labels
#   Overlaps:           the fraction of the dataset where this LF and at least one other LF label
#   Conflicts:          the fraction of the dataset where this LF and at least one other LF label and disagree
#   Correct:            the number of data points this LF labels correctly (if gold labels are provided)
#   Incorrect:          the number of data points this LF labels incorrectly (if gold labels are provided)
#   Empirical Accuracy: the empirical accuracy of this LF (if gold labels are provided)

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
check_out,0,[1],0.214376,0.214376,0.0
check,1,[1],0.257881,0.214376,0.0


In [0]:
#passing dev dataset as it has gold labels

LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
check_out,0,[1],0.23,0.23,0.0,46,0,1.0
check,1,[1],0.28,0.23,0.0,53,3,0.946429


In [0]:
#regular expressions to get the coverage of check plus the accuracy of check_out.

import re

@labeling_function()
def regex_check_out(x):
    """Vote SPAM when "check", then any characters, then "out" appears (case-insensitive)."""
    if re.search(r"check.*out", x.CONTENT, flags=re.IGNORECASE):
        return SPAM
    return ABSTAIN



In [0]:
lfs = [check_out, check, regex_check_out]

applier = PandasLFApplier(lfs=lfs)
# Rebuild both label matrices now that a third LF column has been added.
L_train, L_dev = (applier.apply(df=frame) for frame in (df_train, df_dev))

100%|██████████| 1586/1586 [00:00<00:00, 16597.76it/s]
100%|██████████| 200/200 [00:00<00:00, 11672.89it/s]


In [0]:
L_train

array([[ 1,  1,  1],
       [ 1,  1,  1],
       [-1, -1, -1],
       ...,
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1]])

In [0]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
check_out,0,[1],0.214376,0.214376,0.0
check,1,[1],0.257881,0.233922,0.0
regex_check_out,2,[1],0.233922,0.233922,0.0


In [0]:
LFAnalysis(L_dev, lfs).lf_summary(Y=Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
check_out,0,[1],0.23,0.23,0.0,46,0,1.0
check,1,[1],0.28,0.265,0.0,53,3,0.946429
regex_check_out,2,[1],0.265,0.265,0.0,53,0,1.0


<h1>Writing an LF that uses a third-party model</h1>

In [0]:
# We’ll start by creating a Preprocessor that runs TextBlob on our comments, then extracts 
#the polarity and subjectivity scores.

@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Attach TextBlob's polarity and subjectivity scores for x.CONTENT to x."""
    sentiment = TextBlob(x.CONTENT).sentiment
    x.polarity = sentiment.polarity
    x.subjectivity = sentiment.subjectivity
    return x

In [0]:
#We’ll tune the output of our LFs based on the TextBlob scores.

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    """Vote HAM when the TextBlob polarity exceeds 0.9; otherwise abstain."""
    if x.polarity > 0.9:
        return HAM
    return ABSTAIN

@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    """Vote HAM when the TextBlob subjectivity is at least 0.5; otherwise abstain."""
    if x.subjectivity >= 0.5:
        return HAM
    return ABSTAIN

In [0]:
#The process: first the applier tries to apply the LF "textblob_polarity" to each data point of df_train. Because
#"textblob_polarity" is declared with pre=[textblob_sentiment], the preprocessor "textblob_sentiment" runs on the data point first.
#The result is a data point with two new attributes: subjectivity and polarity. Based on those,
#the LF "textblob_polarity" either returns HAM or abstains.

lfs = [textblob_polarity, textblob_subjectivity]

applier = PandasLFApplier(lfs=lfs)
# Label matrices for the two TextBlob-based LFs only.
L_train, L_dev = (applier.apply(df=frame) for frame in (df_train, df_dev))

100%|██████████| 1586/1586 [00:01<00:00, 824.11it/s]
100%|██████████| 200/200 [00:00<00:00, 830.10it/s]


In [0]:
LFAnalysis(L_train, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
textblob_polarity,0,[0],0.035309,0.013871,0.0
textblob_subjectivity,1,[0],0.357503,0.013871,0.0


In [0]:
LFAnalysis(L_dev, lfs).lf_summary(Y=Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
textblob_polarity,0,[0],0.04,0.015,0.0,7,1,0.875
textblob_subjectivity,1,[0],0.35,0.015,0.0,41,29,0.585714


<h1>Writing More Labeling Functions</h1>

No single LF has high enough coverage to label our entire test dataset accurately. If one did, we wouldn’t need a classifier at all: we could just use that single simple heuristic to complete the task. <br><br>
We usually need to combine multiple LFs to label our dataset, both to increase the size of the generated training set (since we can’t generate training labels for data points that all LFs abstained on) and to improve the overall accuracy of the training labels we generate by factoring in multiple different signals.

In [0]:
'''Keyword LFs'''

#This is very intuitive
#Instead of writing a separate function to search for each keyword (for example, "please"), we create a template
#that takes a list of keywords and builds a labeling function for those keywords.

def keyword_lookup(x, keywords, label):
    """Return `label` if any keyword is a substring of the lowercased comment.

    Matching is plain substring containment against x.CONTENT.lower();
    ABSTAIN is returned when no keyword is found (including an empty list).
    """
    for keyword in keywords:
        if keyword in x.CONTENT.lower():
            return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SPAM):
    """Build a LabelingFunction that runs keyword_lookup with the given keywords.

    The LF is named after the first keyword ("keyword_<keywords[0]>"), and
    `keywords` and `label` are handed to keyword_lookup through `resources`.
    """
    lf_name = f"keyword_{keywords[0]}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)


# Spam comments talk about 'my channel', 'my video', etc.
keyword_my = make_keyword_lf(["my"], label=SPAM)

# Spam comments ask users to subscribe to their channels.
keyword_subscribe = make_keyword_lf(["subscribe"], label=SPAM)

# Spam comments post links to other channels.
keyword_link = make_keyword_lf(["http"], label=SPAM)

# Spam comments make requests rather than commenting.
keyword_please = make_keyword_lf(["please", "plz"], label=SPAM)

# Ham comments actually talk about the video's content.
keyword_song = make_keyword_lf(["song"], label=HAM)

In [0]:
'''Pattern-matching LFs (regular expressions)'''
#If we want a little more control over a keyword search, we can look for regular expressions instead.
#The LF we developed above (regex_check_out) is an example of this.

'Pattern-matching LFs (regular expressions)'

In [0]:
'''Heuristic LFs'''
#There may be other heuristics or “rules of thumb” that you come up with as you look at the data. 
#So long as you can express it in a function, it’s a viable LF!

@labeling_function()
def short_comment(x):
    """Vote HAM for comments of fewer than five whitespace-separated tokens (e.g. 'cool video!')."""
    n_tokens = len(x.CONTENT.split())
    if n_tokens < 5:
        return HAM
    return ABSTAIN

In [0]:
'''LFs with Complex Preprocessors'''
#We can enrich our data (providing more fields for the LFs to refer to) using Preprocessors.

#Using Spacy to add lemmas, part-of-speech (pos) tags, etc. to each token
#can also add other NLTK libraries


# Preprocessor that parses the text found in `text_field` and stores the
# enriched representation under `doc_field`, so LFs can read it as x.doc.
spacy = SpacyPreprocessor(
    text_field="CONTENT",
    doc_field="doc",
    memoize=True,
)


@labeling_function(pre=[spacy])
def has_person(x):
    """Vote HAM for short comments (fewer than 20 doc tokens) that contain a PERSON entity."""
    if len(x.doc) >= 20:
        return ABSTAIN
    mentions_person = any(ent.label_ == "PERSON" for ent in x.doc.ents)
    return HAM if mentions_person else ABSTAIN


#The commented-out code below does the same thing using Snorkel's predefined nlp_labeling_function decorator instead of the manual preprocessor above.

# from snorkel.labeling.lf.nlp import nlp_labeling_function
# @nlp_labeling_function()
# def has_person_nlp(x):
#     """Ham comments mention specific people and are short."""
#     if len(x.doc) < 20 and any([ent.label_ == "PERSON" for ent in x.doc.ents]):
#         return HAM
#     else:
#         return ABSTAIN

In [0]:
'''Third-party Model LFs'''
#We can also utilize other models, including ones trained for other tasks that are related to, but not the same as, the one we care about. 
#The TextBlob-based LFs we created above are great examples of this!

'Third-party Model LFs'

<h1>Combining Labeling Function Outputs with the Label Model</h1>

In [0]:
# Full LF set; the order here fixes the column order of every label matrix.
lfs = [
    # keyword LFs
    keyword_my, keyword_subscribe, keyword_link, keyword_please, keyword_song,
    # pattern-matching LF
    regex_check_out,
    # heuristic LF
    short_comment,
    # spaCy-based LF
    has_person,
    # TextBlob-based LFs
    textblob_polarity, textblob_subjectivity,
]

In [0]:
applier = PandasLFApplier(lfs=lfs)
# Apply the full LF set to the train, dev and validation frames, in that order.
L_train, L_dev, L_valid = (
    applier.apply(df=frame) for frame in (df_train, df_dev, df_valid)
)

100%|██████████| 1586/1586 [00:21<00:00, 74.79it/s]
100%|██████████| 200/200 [00:02<00:00, 81.60it/s]
100%|██████████| 185/185 [00:02<00:00, 68.77it/s]


In [0]:
L_train[10:20]

array([[-1,  1, -1, -1, -1, -1,  0, -1, -1, -1],
       [-1,  1, -1, -1, -1, -1, -1, -1, -1,  0],
       [-1, -1,  1, -1, -1, -1,  0, -1, -1, -1],
       [-1,  1, -1, -1, -1, -1,  0, -1, -1, -1],
       [-1, -1,  1,  1, -1, -1,  0, -1,  0,  0],
       [-1,  1, -1,  1, -1,  1, -1, -1, -1,  0],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1,  0],
       [-1, -1,  1, -1, -1, -1,  0, -1, -1, -1],
       [-1, -1,  1, -1, -1, -1,  0, -1, -1, -1],
       [-1, -1, -1, -1, -1, -1, -1, -1, -1,  0]])

In [0]:
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_my,0,[1],0.215,0.21,0.095,36,7,0.837209
keyword_subscribe,1,[1],0.15,0.115,0.045,29,1,0.966667
keyword_http,2,[1],0.115,0.095,0.075,20,3,0.869565
keyword_please,3,[1],0.135,0.135,0.035,27,0,1.0
keyword_song,4,[0],0.14,0.11,0.04,21,7,0.75
regex_check_out,5,[1],0.265,0.16,0.09,53,0,1.0
short_comment,6,[0],0.215,0.135,0.055,32,11,0.744186
has_person,7,[0],0.075,0.065,0.03,11,4,0.733333
textblob_polarity,8,[0],0.04,0.04,0.01,7,1,0.875
textblob_subjectivity,9,[0],0.35,0.275,0.16,41,29,0.585714


<h1>Now we will combine the outputs of all the labeling functions to get one label</h1>


**Our goal is to convert the labels from our LFs into a single noise-aware probabilistic (or confidence-weighted) label per data point.**<br>

A simple baseline for doing this is to take the majority vote on a per-data point basis: if more LFs voted SPAM than HAM, label it SPAM (and vice versa).

In [0]:
from snorkel.labeling import MajorityLabelVoter

# Baseline combiner: majority vote over the LF outputs in each row of the label matrix.
majority_model = MajorityLabelVoter()
# NOTE: preds_train is rebound later in the notebook by label_model.predict(L_train);
# only majority_model itself is reused (for scoring on L_valid).
preds_train = majority_model.predict(L=L_train)

In [0]:
np.unique(preds_train)

array([0, 1])

In [0]:
preds_train

array([1, 1, 0, ..., 1, 0, 0])

In [0]:
# Now we will use Snorkel's LabelModel to combine the outputs of the LFs.

# This model will ultimately produce a single set of noise-aware training labels, 
# which are probabilistic or confidence-weighted labels. We will then use these labels to train a classifier for our task.

# The LabelModel is able to learn weights for the labeling functions using only the label matrix as input.
# no gold labels are used during the training process. They are just for evaluation

In [0]:
L_train

array([[-1, -1, -1, ..., -1, -1, -1],
       [ 1,  1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1,  1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1,  0,  0]])

In [0]:
# Filtering non labelled points
# L_temp = []
# for l in L_train:
#   if not all([True if pred == -1 else False for pred in l]):
#     L_temp.append(l)
# L_train = np.array(L_temp)

In [0]:
# Fitting the model on L_train(The matrix) and getting prediction for L_valid

# cardinality is the number of classes (here two: HAM and SPAM).
label_model = LabelModel(cardinality=2, verbose=True)
# Fit on the training label matrix only; the seed is fixed for repeatable runs.
label_model.fit(
    L_train=L_train,
    n_epochs=1000,
    lr=0.001,
    log_freq=100,
    seed=123,
)

In [0]:
# Compare both label combiners against the gold validation labels.
majority_metrics = majority_model.score(L=L_valid, Y=Y_valid)
majority_acc = majority_metrics["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_metrics = label_model.score(L=L_valid, Y=Y_valid)
label_model_acc = label_model_metrics["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   84.3%
Label Model Accuracy:     87.0%


In [0]:
# Rebinds preds_train (previously the majority-vote output) to the label model's
# predictions for the training label matrix.
# NOTE(review): the surrounding narrative talks about probabilistic labels, but this
# calls predict() rather than predict_proba() — confirm hard labels are intended.
# NOTE(review): depending on the Snorkel version's tie-break policy, predict() may
# emit ABSTAIN (-1) for some rows — TODO confirm before one-hot encoding these labels.
preds_train = label_model.predict(L_train)

In [0]:
preds_train.shape

(1380,)

In [0]:
#  This labeling is typically not suitable as an inference-time model to make predictions for unseen data points, 
#  due to (among other things) some data points having all abstain labels.

#Now what we will do is this:
#For every comment, we have a label either 0 or 1. Now instead of using this 0 and 1 to classify an unseen comment as SPAM or HAM 
#what we will do is supply this comment(X) and label(Y) to a discriminative classifier to see if we can improve performance further

'''formaly'''
#we will use the output of the label model as training labels to train a discriminative classifier.
#This classifier will only need the text of the comment to make predictions, making it much more suitable for inference over unseen comments. 

'formaly'

<h1>Filtering out unlabeled data points</h1>

As we saw earlier, some of the data points in our train set received no labels from any of our LFs. 
These data points convey no supervision signal and tend to hurt performance, so we filter them out before training using a built-in utility.

In [0]:
from snorkel.labeling import filter_unlabeled_dataframe

# Keep only the training rows (and their labels) that received at least one LF vote.
# NOTE(review): `y` is preds_train, so despite its name probs_train_filtered
# presumably holds hard labels rather than probabilities — confirm.
filtered_pair = filter_unlabeled_dataframe(X=df_train, y=preds_train, L=L_train)
df_train_filtered, probs_train_filtered = filtered_pair

In [0]:
len(df_train_filtered)

1380

In [0]:
probs_train_filtered[0:18]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0])

<h1>Training a Classifier</h1>

In [0]:
# we’ll use the noisy training labels to train a classifier for our task. 
# The output of the Snorkel LabelModel is just a set of labels which can be used with most popular libraries for performing supervised learning, 
# such as TensorFlow, Keras, PyTorch, Scikit-Learn, Ludwig, and XGBoost. 
# We will be using classifiers from Keras and Scikit-Learn.

In [0]:
#Featurization

# For simplicity and speed, we use a simple “bag of n-grams” feature representation: 
#each data point is represented by a sparse count vector recording how often each word or 2-word combination occurs in the comment text.

In [0]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
# The vocabulary is learned from the filtered training comments only ...
X_train = vectorizer.fit_transform(df_train_filtered.CONTENT.tolist())

# ... and then reused unchanged for the dev, validation and test comments.
X_dev, X_valid, X_test = (
    vectorizer.transform(frame.CONTENT.tolist())
    for frame in (df_dev, df_valid, df_test)
)

***Keras Classifier with Probabilistic Labels***

In [0]:
from snorkel.analysis import metric_score
from snorkel.utils import preds_to_probs
from utils import get_keras_logreg, get_keras_early_stopping
from keras.utils import to_categorical

# Vanilla logistic regression in Keras, sized to the n-gram feature space.
n_features = X_train.shape[1]
keras_model = get_keras_logreg(input_dim=n_features)

# One-hot encode the filtered training labels and the gold validation labels.
y_train_onehot = to_categorical(probs_train_filtered)
y_valid_onehot = to_categorical(Y_valid)

keras_model.fit(
    x=X_train,
    y=y_train_onehot,
    validation_data=(X_valid, y_valid_onehot),
    callbacks=[get_keras_early_stopping()],
    epochs=20,
    verbose=0,
)

Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fb93aa03128>

In [0]:
# Per-class scores for the test comments; the arg-max column is the predicted class.
test_scores = keras_model.predict(x=X_test)
preds_test = test_scores.argmax(axis=1)
test_acc = metric_score(golds=Y_test, preds=preds_test, metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

Test Accuracy: 88.1%


<h1>Summary</h1>
In this tutorial, we accomplished the following:



1.   We introduced the concept of Labeling Functions (LFs) and demonstrated some of the forms they can take.
2.   We used the Snorkel LabelModel to automatically learn how to combine the outputs of our LFs into strong probabilistic labels.
3.   We showed that a classifier trained on a weakly supervised dataset can outperform an approach based on the LFs alone as it learns to generalize beyond the noisy heuristics we provide.