In [1]:
from textblob import TextBlob
import pandas as pd
import numpy as np
from snorkel.labeling import labeling_function

In [2]:
# Load only the columns this notebook uses from the training CSV.
# The former trailing `[:]` full slice was a no-op that merely produced an
# extra derived frame, so the result of read_csv is used directly.
_data_path = "../data/training_data.csv"
df = pd.read_csv(
    _data_path,
    usecols=[
        'id',
        'text',
        'user',
        'user_verified',
        'user_followers_count',
        'user_friends_count',
        'retweet_count',
        'fav_count',
        'hashtags',
        'target',
    ],
)

In [3]:
# Keep only the second whitespace-separated token of each raw target string.
# NOTE: this overwrites `target` in place, so the cell is not idempotent —
# re-running it operates on the already-reduced values.
df['target'] = df['target'].map(lambda raw: raw.split()[1])

In [4]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12756 entries, 0 to 12755
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    12756 non-null  int64 
 1   text                  12756 non-null  object
 2   user                  12756 non-null  object
 3   user_verified         12756 non-null  bool  
 4   user_followers_count  12756 non-null  int64 
 5   user_friends_count    12756 non-null  int64 
 6   retweet_count         12756 non-null  int64 
 7   fav_count             12756 non-null  int64 
 8   hashtags              12756 non-null  object
 9   target                12756 non-null  object
dtypes: bool(1), int64(5), object(4)
memory usage: 909.5+ KB


In [5]:
# Display the last rows of the frame as a sanity check of the loaded data.
df.tail()

Unnamed: 0,id,text,user,user_verified,user_followers_count,user_friends_count,retweet_count,fav_count,hashtags,target
12751,595346833972408320,RT @Quinnae_Moon: That's not merely because of...,randileeharper,True,19435,766,13,0,[],neither
12752,596356013730598912,But this just goes to prove - @TychoBrahe only...,randileeharper,True,19435,766,12,16,[],neither
12753,603639577920839681,RT @olsen31shannon: Wow I love being called a ...,ambvrrr,False,3691,721,1,0,[],neither
12754,595282956970655744,RT @j_bigboote: @freebsdgirl You'd think havin...,randileeharper,True,19435,766,6,0,[],neither
12755,575480549332008962,via @weaselzippers: Feminazi Blog Reminds Libe...,commonpatriot,False,15093,13525,0,0,"[{'text': 'tcot', 'indices': [131, 136]}]",sexism


In [6]:
# Distribution of `target` among rows whose user_friends_count is exactly 0.
df.loc[df['user_friends_count'] == 0, 'target'].value_counts()

none       83
sexism     26
neither    12
racism      2
both        1
Name: target, dtype: int64

In [7]:
def labe(x):
    """Collapse a target category into a binary label.

    Returns 0 when ``x`` is 'none' or 'neither', and 1 for any other value.
    """
    return 0 if x in ('none', 'neither') else 1

In [8]:
# Binary ground-truth label derived from the target category via `labe`.
df['label'] = df['target'].map(labe)

In [9]:
# Build the de-duplicated word list `w` from the first tab-separated field of
# every line in the lexicon file.
# NOTE(review): `[:-1]` drops the final character of that field. This strips
# the newline only when a line contains no tab, and it would truncate a last
# line that lacks a trailing newline — confirm against the file's format.
with open('../data/hatewords.txt', 'r') as f:
    w = list({line.split('\t')[0][:-1] for line in f})

In [10]:
# Integer votes returned by the labeling functions below.
HATE = 1      # positive class; matches the value `labe` assigns to non-benign targets
NONHATE = 0   # negative class; matches the value `labe` assigns to 'none'/'neither'
ABSTAIN = -1  # "no vote"; compared against labels when filtering rows later on

In [11]:
@labeling_function()
def lf_hate_words(x):
    """Vote HATE when any lexicon word appears as a token of the text.

    Args:
        x: A data point exposing a ``text`` string attribute.

    Returns:
        HATE if at least one entry of the module-level list ``w`` equals a
        whitespace-delimited token of the lower-cased text, otherwise NONHATE.
        This function never returns ABSTAIN.
    """
    # Tokenise once into a set instead of re-splitting the text for every
    # lexicon word; the outcome is identical to the per-word membership test.
    tokens = set(x.text.lower().split())
    # NOTE(review): only the text is lower-cased, and matching is on whole
    # tokens, so lexicon entries containing uppercase letters or spaces can
    # never match — confirm the lexicon is lowercase single words.
    return NONHATE if tokens.isdisjoint(w) else HATE

In [12]:
@labeling_function()
def lf_fav_count(x):
    """Vote NONHATE when ``x.fav_count`` exceeds 10; abstain otherwise."""
    if x.fav_count > 10:
        return NONHATE
    return ABSTAIN

In [13]:
@labeling_function()
def lf_user_friends_count(x):
    """Vote NONHATE when ``x.user_friends_count`` exceeds 10; abstain otherwise."""
    if x.user_friends_count > 10:
        return NONHATE
    return ABSTAIN

In [14]:
@labeling_function()
def lf_user_followers_count(x):
    """Vote NONHATE when ``x.user_followers_count`` exceeds 10; abstain otherwise."""
    if x.user_followers_count > 10:
        return NONHATE
    return ABSTAIN

In [15]:
@labeling_function()
def lf_user_verified(x):
    """Vote NONHATE when ``x.user_verified`` is truthy, HATE otherwise.

    NOTE(review): this function never abstains, so every row from a
    non-verified user receives a HATE vote — confirm that is intended rather
    than returning ABSTAIN for non-verified users.
    """
    if x.user_verified:
        return NONHATE
    return HATE

In [16]:
@labeling_function()
def lf_textblob_polarity(x):
    """Vote NONHATE when TextBlob's sentiment polarity of the text exceeds 0.3.

    Abstains for any polarity at or below that threshold.
    """
    polarity = TextBlob(x.text).sentiment.polarity
    if polarity > 0.3:
        return NONHATE
    return ABSTAIN

In [17]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

In [18]:
# Labeling functions (LFs) to apply, in the column order of the label matrix.
lfs = [
    lf_hate_words,
    lf_fav_count,
    lf_user_friends_count,
    lf_user_followers_count,
    lf_user_verified,
    lf_textblob_polarity,
]

In [19]:
# Run every labeling function over each row of `df` to obtain the label matrix.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)

  from pandas import Panel
100%|██████████| 12756/12756 [00:11<00:00, 1121.37it/s]


In [20]:
# Fit a two-class label model on the LF votes, then store its hard predictions;
# ties are resolved by abstaining.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(
    L_train,
    n_epochs=500,
    log_freq=50,
    seed=123,
)
df["snorkel_label"] = label_model.predict(
    L=L_train,
    tie_break_policy="abstain",
)

  Variable._execution_engine.run_backward(


In [21]:
# Label-model probability of class 1 for every row. `predict_proba` returns the
# same probability matrix that `predict(..., return_probs=True)` hands back as
# its second element, without recomputing and discarding the hard labels.
df["snorkel_prob"] = label_model.predict_proba(L=L_train)[:, 1]

In [22]:
# Drop rows on which the label model abstained. The filter must look at
# `snorkel_label`: the ground-truth `label` column only ever holds 0 or 1
# (see `labe`), so filtering on it never removed any row.
df = df[df.snorkel_label != ABSTAIN]

In [24]:
# Number of rows where the label-model prediction equals the ground-truth label.
(df['label'] == df['snorkel_label']).sum()

8584

In [25]:
# Total number of rows currently in the frame.
df.shape[0]

12756

In [26]:
# Display the first rows, now including the label, snorkel_label and snorkel_prob columns.
df.head()

Unnamed: 0,id,text,user,user_verified,user_followers_count,user_friends_count,retweet_count,fav_count,hashtags,target,label,snorkel_label,snorkel_prob
0,572332655397629952,These girls are the equivalent of the irritati...,JL_Whitaker,False,2189,2186,0,2,"[{'text': 'MKR', 'indices': [95, 99]}]",racism,1,1,0.549079
1,572341498827522049,Drasko they didn't cook half a bird you idiot ...,trish2295,False,14,62,0,4,"[{'text': 'mkr', 'indices': [46, 50]}]",neither,0,1,0.897941
2,572340476503724032,Hopefully someone cooks Drasko in the next ep ...,foodbling,False,3432,2529,0,2,"[{'text': 'MKR', 'indices': [49, 53]}]",neither,0,1,0.549079
3,572334712804384768,of course you were born in serbia...you're as ...,lilbeastunleash,False,529,1848,0,0,"[{'text': 'MKR', 'indices': [71, 75]}]",racism,1,1,0.549079
4,572342978255048705,So Drasko just said he was impressed the girls...,thefoxbandit,False,7,0,0,2,"[{'text': 'MKR', 'indices': [96, 100]}]",neither,0,1,0.597545


In [23]:
# Persist the frame, including the Snorkel columns, without writing the index.
df.to_csv(
    path_or_buf='../data/training_data_snorkel_label1.csv',
    index=False,
)