In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from tqdm import tqdm
import re
import demoji

Load data from google sheets

In [102]:
def load_sheet(url):
    url_1 = url.replace("/edit#gid=", "/export?format=csv&gid=")
    data = pd.read_csv(url_1)
    return data


HATE_SHEET = "https://docs.google.com/spreadsheets/d/16lxEwKVA_d_g5QRFNcBTyLz_OBPPB3wZdzZu2UnvLWQ/edit#gid=0"
POS_NON_HATE_SHEET = "https://docs.google.com/spreadsheets/d/16lxEwKVA_d_g5QRFNcBTyLz_OBPPB3wZdzZu2UnvLWQ/edit#gid=1070451623"
NEU_NON_HATE_SHEET = "https://docs.google.com/spreadsheets/d/16lxEwKVA_d_g5QRFNcBTyLz_OBPPB3wZdzZu2UnvLWQ/edit#gid=497253390"

hate_df = load_sheet(HATE_SHEET)
pos_df = load_sheet(POS_NON_HATE_SHEET)
neu_df = load_sheet(NEU_NON_HATE_SHEET)
non_hate_df = pd.concat([pos_df, neu_df])

Drop useless columns

In [103]:
hate_df.drop(
    columns=[
        "Title",
        "Is Video Hateful (Yes / No)",
        "What Metadata / Information is Required?",
        "Synthetic or Original?",
        "Reviewer",
        "Additional Verification Needed (Yes / No)",
        "Reason For Additional Verficiation? (Only if YES)",
    ],
    inplace=True,
)
hate_df.rename(
    columns={
        "Link": "url",
        "Video Category": "category",
        "Comment": "comment",
        "Hate Towards Whom?": "hate_towards_whom",
    },
    inplace=True,
)

Fill columns for labels

In [104]:
hate_df["category"] = hate_df["category"].str.lower()
hate_df["label"] = "yes"
hate_df.fillna(method="ffill", inplace=True)

Repeat for non hate

In [105]:
non_hate_df.drop(
    columns=[
        "Manual Inspection",
        "Validator",
        "scores",
    ],
    inplace=True,
)
non_hate_df.rename(
    columns={
        "type": "category",
    },
    inplace=True,
)
non_hate_df["category"] = non_hate_df["category"].str.lower()
non_hate_df["hate_towards_whom"] = "None"
non_hate_df["label"] = "no"

In [106]:
df = pd.concat([hate_df, non_hate_df])
df.drop(columns=['Unnamed: 11'], inplace=True)

Split data according to groups into test and train first

In [107]:
groups = df['url']

best_state = 0
min_diff = 1000000
for random_state in tqdm(range(0, 10000)):
    gss = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=random_state)

    for train_index, test_index in gss.split(df, groups=groups):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]

    try:
        count_train = df_train['category'].value_counts().to_dict()
        count_test = df_test['category'].value_counts().to_dict()
        diff = {k : count_train[k] - count_test[k] * 7 for k in count_train}
        value_sum = sum(map(abs, diff.values()))
        if value_sum < min_diff:
            best_state = random_state
            min_diff = value_sum
    except:
        continue

100%|██████████| 10000/10000 [00:22<00:00, 447.38it/s]


In [108]:
print(best_state, min_diff)

8919 1705


In [109]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=best_state)

for train_index, test_index in gss.split(df, groups=groups):
    df_train = df.iloc[train_index]
    df_test = df.iloc[test_index]

In [110]:
df_train.shape, df_test.shape

((1599, 5), (472, 5))

In [111]:
df_train.to_csv("../data/without_aug/train.csv", index=False)
df_test.to_csv("../data/without_aug/test.csv", index=False)

Load all augmented data and train data

In [112]:
hate_orig = pd.read_csv("../data/without_aug/train.csv")
hate_aug = pd.read_csv("../data/with_aug/all.csv")

In [113]:
hate_orig.shape

(1599, 5)

Make another split for train and validation data

In [114]:
groups = hate_orig['url']

best_state = 0
min_diff = 1000000
for random_state in tqdm(range(0, 10000)):
    gss = GroupShuffleSplit(n_splits=1, test_size=0.35, random_state=random_state)

    for train_index, test_index in gss.split(hate_orig, groups=groups):
        hate_orig_train = hate_orig.iloc[train_index]
        hate_orig_val = hate_orig.iloc[test_index]

    try:
        count_train = hate_orig_train['category'].value_counts().to_dict()
        count_test = hate_orig_val['category'].value_counts().to_dict()
        diff = {k : count_train[k] - count_test[k] * 6.5 for k in count_train}
        value_sum = sum(map(abs, diff.values()))
        if value_sum < min_diff:
            best_state = random_state
            min_diff = value_sum
    except:
        continue



100%|██████████| 10000/10000 [00:17<00:00, 562.86it/s]


In [115]:
print(best_state, min_diff)

1073 1521.0


split eval and train data

In [116]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.35, random_state=best_state)

for train_index, test_index in gss.split(hate_orig, groups=groups):
    hate_orig_train = hate_orig.iloc[train_index]
    hate_orig_val = hate_orig.iloc[test_index]

In [117]:
hate_orig_train.shape, hate_orig_val.shape

((1183, 5), (416, 5))

train aug data only from urls which are not in val and test data

In [118]:
urls_in_val = list(set(hate_orig_val['url'].to_list()))
urls_in_test = list(set(df_test['url'].to_list()))
len(urls_in_val), len(urls_in_test)

(99, 121)

In [119]:
print(hate_aug.shape)
hate_aug = hate_aug[~hate_aug['url'].isin(urls_in_val)]
hate_aug = hate_aug[~hate_aug['url'].isin(urls_in_test)]
print(hate_aug.shape)

(4948, 5)
(2866, 5)


clean up data

In [120]:
def check(text):
    if text in ['Individual', 'Organisation', 'Location', 'Community', 'None']:
        return True
    return False

def clean(text):
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    text = re.sub(r",", "", text)
    text = ",".join([cat for cat in sorted(list(set(text.split()))) if check(cat)])
    return text

def process_text(text):
    if text != text:
        return ''
    new_text = []

    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        t = re.sub(r'http\S+', '', t)
        new_text.append(t)
    new_text = " ".join(new_text)
    new_text = demoji.replace_with_desc(new_text, sep=' ')
    new_text = re.sub('\\s+', ' ', new_text)
    return new_text

In [121]:
hate_orig_val['hate_towards_whom'] = hate_orig_val['hate_towards_whom'].apply(lambda x: clean(x))
hate_orig_val['comment'] = hate_orig_val['comment'].apply(lambda x: process_text(x))
hate_orig_val.drop_duplicates(inplace=True)

hate_aug['hate_towards_whom'] = hate_aug['hate_towards_whom'].apply(lambda x: clean(x))
hate_aug['comment'] = hate_aug['comment'].apply(lambda x: process_text(x))
hate_aug.drop_duplicates(inplace=True)

df_test['hate_towards_whom'] = df_test['hate_towards_whom'].apply(lambda x: clean(x))
df_test['comment'] = df_test['comment'].apply(lambda x: process_text(x))
df_test.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_orig_val['hate_towards_whom'] = hate_orig_val['hate_towards_whom'].apply(lambda x: clean(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_orig_val['comment'] = hate_orig_val['comment'].apply(lambda x: process_text(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_orig_val.drop_duplicates(inplace=True)
A value is trying to be set on a co

In [122]:
hate_orig_val.to_csv('../data/with_aug_ttv/eval.csv', index=False)
hate_aug.to_csv('../data/with_aug_ttv/train.csv', index=False)
df_test.to_csv("../data/with_aug_ttv/test.csv", index=False)

In [96]:
hate_aug

Unnamed: 0,url,category,comment,hate_towards_whom,label
10,https://www.bitchute.com/video/UNB7iw1M7Uco/,fashion & beauty,you cannot mix Wild Chimps into the general po...,Community,yes
11,https://www.bitchute.com/video/UNB7iw1M7Uco/,fashion & beauty,there is no way to mix wild chimps with the ge...,Community,yes
12,https://www.bitchute.com/video/UNB7iw1M7Uco/,fashion & beauty,there's no way to mix wild chimps with the gen...,Community,yes
13,https://www.bitchute.com/video/UNB7iw1M7Uco/,fashion & beauty,you can't mix wild chimps with the general pop...,Community,yes
14,https://www.bitchute.com/video/UNB7iw1M7Uco/,fashion & beauty,keep white and black separate,Community,yes


## Merge all augmented data and clean hate towards whom label for training, validation and test data (Different task)

In [97]:
test_df = pd.read_csv('../data/with_aug/test_aug.csv')
train_df = pd.read_csv('../data/with_aug/train_aug.csv')
eval_df = pd.read_csv('../data/with_aug_ttv/eval.csv')
all_df = pd.concat([train_df, test_df])

In [98]:
prev_label = ''
for id, row in train_df.iterrows():
    if str(row['hate_towards_whom']) == 'nan':
        train_df.iloc[id]['hate_towards_whom'] = prev_label
    else:
        prev_label = row['hate_towards_whom']

In [65]:
all_df['hate_towards_whom'] = all_df['hate_towards_whom'].apply(lambda x: clean(x))
all_df['comment'] = all_df['comment'].apply(lambda x: process_text(x))
all_df.drop_duplicates(inplace=True)

all_df.to_csv('../data/with_aug/all.csv', index=False)
print(all_df['hate_towards_whom'].value_counts())

None                       2354
Individual                 1205
Community                   827
Location                    201
Organisation                157
Community,Individual         81
Community,Location           47
Community,Organisation       41
Individual,Organisation      17
Location,Organisation         9
Individual,Location           9
Name: hate_towards_whom, dtype: int64
