In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from tqdm import tqdm
import re
import demoji

Load data from google sheets

In [2]:
def load_sheet(url):
    url_1 = url.replace("/edit#gid=", "/export?format=csv&gid=")
    data = pd.read_csv(url_1)
    return data


HATE_SHEET = "https://docs.google.com/spreadsheets/d/16lxEwKVA_d_g5QRFNcBTyLz_OBPPB3wZdzZu2UnvLWQ/edit#gid=0"
POS_NON_HATE_SHEET = "https://docs.google.com/spreadsheets/d/16lxEwKVA_d_g5QRFNcBTyLz_OBPPB3wZdzZu2UnvLWQ/edit#gid=1070451623"
NEU_NON_HATE_SHEET = "https://docs.google.com/spreadsheets/d/16lxEwKVA_d_g5QRFNcBTyLz_OBPPB3wZdzZu2UnvLWQ/edit#gid=497253390"

hate_df = load_sheet(HATE_SHEET)
pos_df = load_sheet(POS_NON_HATE_SHEET)
neu_df = load_sheet(NEU_NON_HATE_SHEET)
non_hate_df = pd.concat([pos_df, neu_df])

Drop useless columns

In [3]:
hate_df.drop(
    columns=[
        "Title",
        "Is Video Hateful (Yes / No)",
        "What Metadata / Information is Required?",
        "Synthetic or Original?",
        "Reviewer",
        "Additional Verification Needed (Yes / No)",
        "Reason For Additional Verficiation? (Only if YES)",
    ],
    inplace=True,
)
hate_df.rename(
    columns={
        "Link": "url",
        "Video Category": "category",
        "Comment": "comment",
        "Hate Towards Whom?": "hate_towards_whom",
    },
    inplace=True,
)

Fill columns for labels

In [4]:
hate_df["category"] = hate_df["category"].str.lower()
hate_df["label"] = "yes"
hate_df.fillna(method="ffill", inplace=True)

Repeat for non hate

In [5]:
non_hate_df.drop(
    columns=[
        "Manual Inspection",
        "Validator",
        "scores",
    ],
    inplace=True,
)
non_hate_df.rename(
    columns={
        "type": "category",
    },
    inplace=True,
)
non_hate_df["category"] = non_hate_df["category"].str.lower()
non_hate_df["hate_towards_whom"] = "None"
non_hate_df["label"] = "no"

In [6]:
df = pd.concat([hate_df, non_hate_df])
df.drop(columns=['Unnamed: 11'], inplace=True)

Split data according to groups into test and train first

In [7]:
groups = df['url']

best_state = 0
min_diff = 1000000
for random_state in tqdm(range(0, 10000)):
    gss = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=random_state)

    for train_index, test_index in gss.split(df, groups=groups):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]

    try:
        count_train = df_train['category'].value_counts().to_dict()
        count_test = df_test['category'].value_counts().to_dict()
        diff = {k : count_train[k] - count_test[k] * 7 for k in count_train}
        value_sum = sum(map(abs, diff.values()))
        if value_sum < min_diff:
            best_state = random_state
            min_diff = value_sum
    except:
        continue

100%|██████████| 10000/10000 [00:13<00:00, 741.55it/s]


In [8]:
print(best_state, min_diff)

8919 1705


In [9]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=best_state)

for train_index, test_index in gss.split(df, groups=groups):
    df_train = df.iloc[train_index]
    df_test = df.iloc[test_index]

In [10]:
df_train.shape, df_test.shape

((1599, 5), (472, 5))

In [11]:
df_train.to_csv("../data/without_aug/train.csv", index=False)
df_test.to_csv("../data/without_aug/test.csv", index=False)

In [12]:
df_test = pd.read_csv("../data/without_aug/test.csv")

Load all augmented data and train data

In [48]:
hate_orig = pd.read_csv("../data/without_aug/train.csv")
hate_aug = pd.read_csv("../data/with_aug/all_with_uuid.csv")

In [49]:
hate_orig.shape

(1599, 5)

Make another split for train and validation data

In [50]:
groups = hate_orig['url']

best_state = 0
min_diff = 1000000
for random_state in tqdm(range(0, 10000)):
    gss = GroupShuffleSplit(n_splits=1, test_size=0.35, random_state=random_state)

    for train_index, test_index in gss.split(hate_orig, groups=groups):
        hate_orig_train = hate_orig.iloc[train_index]
        hate_orig_val = hate_orig.iloc[test_index]

    try:
        count_train = hate_orig_train['category'].value_counts().to_dict()
        count_test = hate_orig_val['category'].value_counts().to_dict()
        diff = {k : count_train[k] - count_test[k] * 6.5 for k in count_train}
        value_sum = sum(map(abs, diff.values()))
        if value_sum < min_diff:
            best_state = random_state
            min_diff = value_sum
    except:
        continue



100%|██████████| 10000/10000 [00:10<00:00, 935.06it/s]


In [51]:
print(best_state, min_diff)

1073 1521.0


split eval and train data

In [52]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.35, random_state=best_state)

for train_index, test_index in gss.split(hate_orig, groups=groups):
    hate_orig_train = hate_orig.iloc[train_index]
    hate_orig_val = hate_orig.iloc[test_index]

In [53]:
hate_orig_train.shape, hate_orig_val.shape

((1183, 5), (416, 5))

train aug data only from urls which are not in val and test data

In [54]:
urls_in_val = list(set(hate_orig_val['url'].to_list()))
urls_in_test = list(set(df_test['url'].to_list()))
len(urls_in_val), len(urls_in_test)

(99, 121)

In [55]:
hate_aug.drop(columns=['Unnamed: 0'], inplace=True)

In [56]:
df_test.shape, hate_orig_val.shape, hate_aug.shape

((472, 5), (416, 5), (9762, 6))

clean up data

In [62]:
def check(text):
    if text in ['Individual', 'Organisation', 'Location', 'Community', 'None']:
        return True
    return False

def clean(text):
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    text = re.sub(r",", " ", text)
    text = ",".join([cat for cat in sorted(list(set(text.split()))) if check(cat)])
    return text

def process_text(text):
    if text != text:
        return ''
    new_text = []

    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        t = re.sub(r'http\S+', '', t)
        new_text.append(t)
    new_text = " ".join(new_text)
    new_text = demoji.replace_with_desc(new_text, sep=' ')
    new_text = re.sub('\\s+', ' ', new_text)
    return new_text

In [63]:
for id, row in hate_orig_val.iterrows():
    hate_orig_val.loc[id, 'comment'] = process_text(row['comment'])
    hate_orig_val.loc[id, 'hate_towards_whom'] = clean(row['hate_towards_whom'])
hate_orig_val.drop_duplicates(inplace=True)

for id, row in hate_aug.iterrows():
    hate_aug.loc[id, 'comment'] = process_text(row['comment'])
    hate_aug.loc[id, 'hate_towards_whom'] = clean(row['hate_towards_whom'])
hate_aug.drop_duplicates(inplace=True)

for id, row in df_test.iterrows():
    df_test.loc[id, 'comment'] = process_text(row['comment'])
    df_test.loc[id, 'hate_towards_whom'] = clean(row['hate_towards_whom'])
df_test.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_orig_val.loc[id, 'comment'] = process_text(row['comment'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_orig_val.loc[id, 'hate_towards_whom'] = clean(row['hate_towards_whom'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_orig_val.drop_duplicates(inplace=True)


In [64]:
df_test_with_uuid = pd.merge(df_test, hate_aug, on=['url', 'comment', 'category', 'hate_towards_whom', 'label'], how='left')
df_eval_with_uuid = pd.merge(hate_orig_val, hate_aug, on=['url', 'comment', 'category', 'hate_towards_whom', 'label'], how='left')

In [66]:
print(hate_aug.shape)
df_train = hate_aug[~hate_aug['url'].isin(urls_in_val)]
df_train = df_train[~df_train['url'].isin(urls_in_test)]
print(df_train.shape)

(9762, 6)
(5643, 6)


In [67]:
df_test_with_uuid.shape, df_eval_with_uuid.shape, df_train.shape

((472, 6), (416, 6), (5643, 6))

In [73]:
df_eval_with_uuid.to_csv('../data/with_aug_ttv/eval.csv', index=False)
df_train.to_csv('../data/with_aug_ttv/train.csv', index=False)
df_test_with_uuid.to_csv("../data/with_aug_ttv/test.csv", index=False)

## Merge all augmented data and clean hate towards whom label for training, validation and test data (Different task)

In [3]:
test_df = pd.read_csv('../data/with_aug/test_aug.csv')
train_df = pd.read_csv('../data/with_aug/train_aug.csv')
all_df = pd.concat([train_df, test_df])

In [4]:
prev_label = ''
for id, row in train_df.iterrows():
    if str(row['hate_towards_whom']) == 'nan':
        train_df.iloc[id]['hate_towards_whom'] = prev_label
    else:
        prev_label = row['hate_towards_whom']

In [7]:
all_df['hate_towards_whom'] = all_df['hate_towards_whom'].apply(lambda x: clean(x))
all_df['comment'] = all_df['comment'].apply(lambda x: process_text(x))
all_df.drop_duplicates(inplace=True)

all_df.to_csv('../data/with_aug/all.csv', index=False)
print(all_df['hate_towards_whom'].value_counts())

None                       4595
Individual                 2414
Community                  1631
Location                    379
Organisation                326
Community,Individual        151
Community,Location          100
Community,Organisation       90
Individual,Organisation      39
Individual,Location          20
Location,Organisation        17
Name: hate_towards_whom, dtype: int64


In [8]:
all_df[all_df['url'] == 'https://www.youtube.com/watch?v=QFOpN957p3s']

Unnamed: 0,url,category,comment,hate_towards_whom,label
1487,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,because they are bad at geography. And if they...,Community,yes
1488,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,if they don't know where the aliens came from ...,Community,yes
1489,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,if they don't know where the aliens come from ...,Community,yes
1490,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,if they don't know where the aliens come from ...,Community,yes
1491,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,"Because they are bad in geography, and if the...",Community,yes
1492,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,0:50 that bozzo is wearing a US TShirt as well...,Individual,yes
1493,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,0:50 that bozzo also wears an American t-shirt...,Individual,yes
1494,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,"1:20 thats the future generation, look at his ...",Community,yes
1495,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,"1:20 as the future generation, look at its exp...",Community,yes
1496,https://www.youtube.com/watch?v=QFOpN957p3s,entertainment,That last white dude was seriously smoking som...,Community,yes


In [9]:
hate_aug[hate_aug['url'] == 'https://www.youtube.com/watch?v=QFOpN957p3s']

NameError: name 'hate_aug' is not defined

In [10]:
all_df.to_csv('../data/with_aug/all.csv', index=False)