In [1]:
from textblob import TextBlob
import pandas as pd
import numpy as np
from snorkel.labeling import labeling_function

In [2]:
_data_path = "../data/unlabeled_dump.csv"
df = pd.read_csv(_data_path, usecols=['id', 'text', 'user', 'user_verified',
                                      'user_followers_count', 'user_friends_count', 'retweet_count', 'fav_count', 'hashtags'])[:]

In [3]:
df['target'] = df.target.apply(lambda x: x.split()[1])

AttributeError: 'DataFrame' object has no attribute 'target'

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df[df.user_friends_count==0]['target'].value_counts()

In [None]:
def labe(x):
    if x == 'none' or x == 'neither': return 0
    return 1

In [None]:
df['labe'] = df.target.apply(labe)

In [None]:
w = []
with open('../data/hatewords.txt', 'r') as f:
    for line in f.readlines():
        w.append(line.split('\t')[0][:-1])
w = list(set(w))

In [None]:
HATE = 1
NONHATE = 0
ABSTAIN = -1

In [None]:
@labeling_function()
def lf_hate_words(x):
    text = x.text.lower()
    for word in w:
        if word in text.split(): return HATE
    return ABSTAIN

In [None]:
@labeling_function()
def lf_fav_count(x):
    return NONHATE if x.fav_count > 20 else ABSTAIN

In [None]:
@labeling_function()
def lf_user_friends_count(x):
    return NONHATE if x.user_friends_count > 20 else ABSTAIN

In [None]:
@labeling_function()
def lf_user_followers_count(x):
    return NONHATE if x.user_followers_count > 20 else ABSTAIN

In [None]:
@labeling_function()
def lf_user_verified(x):
    return NONHATE if x.user_verified else HATE

In [None]:
@labeling_function()
def lf_textblob_polarity(x):
    return HATE if TextBlob(x.text).sentiment.polarity < 0 else ABSTAIN

In [4]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

In [5]:
df['h'] = df.apply(lf_hate_words, axis=1)

NameError: name 'lf_hate_words' is not defined

In [57]:
df.h.value_counts()

-1    48431
 1      810
Name: h, dtype: int64

In [58]:
df['p'] = df.apply(lf_textblob_polarity, axis=1)

In [59]:
df.p.value_counts()

-1    40685
 1     8556
Name: p, dtype: int64

In [60]:
# Define the set of labeling functions (LFs)
lfs = [lf_hate_words, lf_fav_count, lf_user_friends_count, lf_user_followers_count, lf_user_verified, lf_textblob_polarity]

In [61]:
# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df)

  from pandas import Panel
100%|██████████| 49241/49241 [00:49<00:00, 987.19it/s] 


In [62]:
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

In [63]:
df = df[df.label != ABSTAIN]

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49241 entries, 0 to 49240
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    49241 non-null  int64 
 1   text                  49241 non-null  object
 2   user                  49241 non-null  object
 3   user_verified         49241 non-null  bool  
 4   user_followers_count  49241 non-null  int64 
 5   user_friends_count    49241 non-null  int64 
 6   retweet_count         49241 non-null  int64 
 7   fav_count             49241 non-null  int64 
 8   hashtags              49241 non-null  object
 9   label                 49241 non-null  int64 
 10  h                     49241 non-null  int64 
 11  p                     49241 non-null  int64 
dtypes: bool(1), int64(8), object(3)
memory usage: 4.6+ MB


In [65]:
df.label.value_counts()

0    49164
1       77
Name: label, dtype: int64

In [66]:
len(df)

12756

In [68]:
df.head()

Unnamed: 0,id,text,user,user_verified,user_followers_count,user_friends_count,retweet_count,fav_count,hashtags,target,labe,label
0,572332655397629952,These girls are the equivalent of the irritati...,JL_Whitaker,False,2189,2186,0,2,"[{'text': 'MKR', 'indices': [95, 99]}]",racism,1,1
1,572341498827522049,Drasko they didn't cook half a bird you idiot ...,trish2295,False,14,62,0,4,"[{'text': 'mkr', 'indices': [46, 50]}]",neither,0,1
2,572340476503724032,Hopefully someone cooks Drasko in the next ep ...,foodbling,False,3432,2529,0,2,"[{'text': 'MKR', 'indices': [49, 53]}]",neither,0,1
3,572334712804384768,of course you were born in serbia...you're as ...,lilbeastunleash,False,529,1848,0,0,"[{'text': 'MKR', 'indices': [71, 75]}]",racism,1,1
4,572342978255048705,So Drasko just said he was impressed the girls...,thefoxbandit,False,7,0,0,2,"[{'text': 'MKR', 'indices': [96, 100]}]",neither,0,1


In [21]:
df.to_csv('../data/data_weak_label.csv', index=False)