In [1]:
from textblob import TextBlob
import pandas as pd
import numpy as np
from snorkel.labeling import labeling_function

In [2]:
_data_path = "../data/training_data_snorkel_label.csv"
df = pd.read_csv(_data_path)[:]

In [3]:
df.dropna(inplace=True)

In [24]:
df.head()

Unnamed: 0,id,text,user,user_verified,user_followers_count,user_friends_count,retweet_count,fav_count,hashtags,target,snorkel_label,label,ttt
0,572332655397629952,These girls are the equivalent of the irritati...,JL_Whitaker,False,2189.0,2186.0,0.0,2,"[{'text': 'MKR', 'indices': [95, 99]}]",racism,1.0,1.0,0
1,572341498827522049,Drasko they didn't cook half a bird you idiot ...,trish2295,False,14.0,62.0,0.0,4,"[{'text': 'mkr', 'indices': [46, 50]}]",neither,0.0,1.0,0
2,572340476503724032,Hopefully someone cooks Drasko in the next ep ...,foodbling,False,3432.0,2529.0,0.0,2,"[{'text': 'MKR', 'indices': [49, 53]}]",neither,0.0,1.0,0
3,572334712804384768,of course you were born in serbia...you're as ...,lilbeastunleash,False,529.0,1848.0,0.0,0,"[{'text': 'MKR', 'indices': [71, 75]}]",racism,1.0,1.0,0
4,572342978255048705,So Drasko just said he was impressed the girls...,thefoxbandit,False,7.0,0.0,0.0,2,"[{'text': 'MKR', 'indices': [96, 100]}]",neither,0.0,1.0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12755 entries, 0 to 12757
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    12755 non-null  object 
 1   text                  12755 non-null  object 
 2   user                  12755 non-null  object 
 3   user_verified         12755 non-null  object 
 4   user_followers_count  12755 non-null  float64
 5   user_friends_count    12755 non-null  float64
 6   retweet_count         12755 non-null  float64
 7   fav_count             12755 non-null  object 
 8   hashtags              12755 non-null  object 
 9   target                12755 non-null  object 
 10  snorkel_label         12755 non-null  float64
 11  label                 12755 non-null  float64
dtypes: float64(5), object(7)
memory usage: 1.3+ MB


In [5]:
w = []
with open('../data/hatewords.txt', 'r') as f:
    for line in f.readlines():
        w.append(line.split('\t')[0][:-1])
w = list(set(w))

In [6]:
HATE = 1
NONHATE = 0
ABSTAIN = -1

In [7]:
@labeling_function()
def lf_hate_words(x):
    text = x.text.lower()
    for word in w:
        if word in text.split(): return HATE
    return NONHATE

In [8]:
@labeling_function()
def lf_fav_count(x):
    return NONHATE if int(x.fav_count) > 10 else ABSTAIN

In [9]:
@labeling_function()
def lf_user_friends_count(x):
    return NONHATE if x.user_friends_count > 10 else ABSTAIN

In [10]:
@labeling_function()
def lf_user_followers_count(x):
    return NONHATE if x.user_followers_count > 10 else ABSTAIN

In [11]:
@labeling_function()
def lf_user_verified(x):
    return NONHATE if x.user_verified else HATE

In [12]:
@labeling_function()
def lf_textblob_polarity(x):
    return NONHATE if TextBlob(x.text).sentiment.polarity > 0.3 else ABSTAIN

In [13]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

In [14]:
# Define the set of labeling functions (LFs)
lfs = [lf_hate_words, lf_fav_count, lf_user_friends_count, lf_user_followers_count, lf_user_verified, lf_textblob_polarity]

In [15]:
# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df)

  from pandas import Panel
100%|██████████| 12755/12755 [00:11<00:00, 1126.67it/s]


In [16]:
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df["ttt"] = label_model.predict(L=L_train, tie_break_policy="abstain")

  Variable._execution_engine.run_backward(


In [20]:
sum(df.ttt==df.label)

6278

In [18]:
len(df)

12755

In [19]:
len(df[df.label != ABSTAIN])

12755

In [25]:
df


Unnamed: 0,id,text,user,user_verified,user_followers_count,user_friends_count,retweet_count,fav_count,hashtags,target,snorkel_label,label,ttt
0,572332655397629952,These girls are the equivalent of the irritati...,JL_Whitaker,False,2189.0,2186.0,0.0,2,"[{'text': 'MKR', 'indices': [95, 99]}]",racism,1.0,1.0,0
1,572341498827522049,Drasko they didn't cook half a bird you idiot ...,trish2295,False,14.0,62.0,0.0,4,"[{'text': 'mkr', 'indices': [46, 50]}]",neither,0.0,1.0,0
2,572340476503724032,Hopefully someone cooks Drasko in the next ep ...,foodbling,False,3432.0,2529.0,0.0,2,"[{'text': 'MKR', 'indices': [49, 53]}]",neither,0.0,1.0,0
3,572334712804384768,of course you were born in serbia...you're as ...,lilbeastunleash,False,529.0,1848.0,0.0,0,"[{'text': 'MKR', 'indices': [71, 75]}]",racism,1.0,1.0,0
4,572342978255048705,So Drasko just said he was impressed the girls...,thefoxbandit,False,7.0,0.0,0.0,2,"[{'text': 'MKR', 'indices': [96, 100]}]",neither,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12753,595346833972408320,RT @Quinnae_Moon: That's not merely because of...,randileeharper,True,19435.0,766.0,13.0,0,[],neither,0.0,0.0,0
12754,596356013730598912,But this just goes to prove - @TychoBrahe only...,randileeharper,True,19435.0,766.0,12.0,16,[],neither,0.0,0.0,0
12755,603639577920839681,RT @olsen31shannon: Wow I love being called a ...,ambvrrr,False,3691.0,721.0,1.0,0,[],neither,0.0,1.0,0
12756,595282956970655744,RT @j_bigboote: @freebsdgirl You'd think havin...,randileeharper,True,19435.0,766.0,6.0,0,[],neither,0.0,0.0,0
