In [10]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline


from app.packages.preprocessing.cleaning import *

from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
text_col = "text"
selected_col = ["text", "sexist_binary", "idx"]
concatenate = False
url_label = "[URL]"
usr_label = "[USERNAME]"
target = "sexist_binary"


CLEANING CSVS

### Balanced With Punctuation

In [30]:
raw_balanced = pd.read_csv("data/raw_data/merged_df_en_balanced.csv")
raw_balanced = raw_balanced.rename(columns = {'Unnamed: 0':'idx'})
train_df_raw, test_df_raw = train_test_split(raw_balanced, test_size=0.2, stratify=raw_balanced['paper'], random_state=42)

train_df_raw, val_df_raw = train_test_split(train_df_raw, test_size=0.1, stratify=train_df_raw['paper'], random_state=42)


In [31]:
train_bal_punc = all_in_one(train_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,True,True])
val_bal_punc = all_in_one(val_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,True,True])
test_bal_punc = all_in_one(test_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,True,True])


Amount of duplicates on : text
False    19994
True       108
Name: count, dtype: int64
drop_duplicates_from_one_col executed in 0.1 seconds, using up to 32.74MB of RAM
Amount of NaN on : text             1
sexist_binary    0
idx              0
dtype: int64
drop_na executed in 0.01 seconds, using up to 1.27MB of RAM
urls_remover executed in 0.45 seconds, using up to 1.51MB of RAM
username_remover executed in 0.26 seconds, using up to 1.36MB of RAM
emoji_replacer executed in 18.43 seconds, using up to 5.19MB of RAM
hashtag_adapter executed in 0.2 seconds, using up to 1.44MB of RAM
remove_punctuation executed in 0.32 seconds, using up to 4.83MB of RAM
remove_accents executed in 0.25 seconds, using up to 1.31MB of RAM
remove_numbers executed in 0.6 seconds, using up to 4.78MB of RAM
strip executed in 0.01 seconds, using up to 1.36MB of RAM
✅ All in One is done
Amount of duplicates on : text
False    2234
Name: count, dtype: int64
drop_duplicates_from_one_col executed in 0.0 seconds, using 

In [32]:
ds = [[train_bal_punc, "train_bal_punc"], [val_bal_punc, "val_bal_punc"],[test_bal_punc, "test_bal_punc"]]

for i in ds:
  i[0].to_csv(f"data/raw_data/{i[1]}.csv")


### Remove Ethos/Workplace + Balance + Clean

In [33]:
raw_balanced = pd.read_csv("data/raw_data/merged_df_en_balanced.csv")
raw_balanced = raw_balanced.rename(columns = {'Unnamed: 0':'idx'})

bal_four_raw = raw_balanced[raw_balanced['paper'] != 'workplace']
bal_four_raw = raw_balanced[raw_balanced['paper'] != 'ethos']

train_df_raw, test_df_raw = train_test_split(bal_four_raw, test_size=0.2, stratify=bal_four_raw['paper'], random_state=42)

train_df_raw, val_df_raw = train_test_split(train_df_raw, test_size=0.1, stratify=train_df_raw['paper'], random_state=42)


In [34]:
train_b_cl_4 = all_in_one(train_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,False,True])
val_b_cl_4 = all_in_one(val_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,False,True])
test_b_cl_4 = all_in_one(test_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,False,True])


Amount of duplicates on : text
False    19930
True       112
Name: count, dtype: int64
drop_duplicates_from_one_col executed in 0.02 seconds, using up to 1.7MB of RAM
Amount of NaN on : text             1
sexist_binary    0
idx              0
dtype: int64
drop_na executed in 0.01 seconds, using up to 1.26MB of RAM
urls_remover executed in 0.32 seconds, using up to 1.5MB of RAM
username_remover executed in 0.17 seconds, using up to 1.37MB of RAM
emoji_replacer executed in 12.13 seconds, using up to 5.18MB of RAM
hashtag_adapter executed in 0.13 seconds, using up to 1.44MB of RAM
remove_punctuation executed in 0.22 seconds, using up to 4.82MB of RAM
remove_accents executed in 0.18 seconds, using up to 1.3MB of RAM
strip executed in 0.01 seconds, using up to 1.29MB of RAM
✅ All in One is done
Amount of duplicates on : text
False    2223
True        4
Name: count, dtype: int64
drop_duplicates_from_one_col executed in 0.0 seconds, using up to 0.2MB of RAM
Amount of NaN on : text            

In [35]:
ds = [[train_b_cl_4, "train_b_cl_4"], [val_b_cl_4, "val_b_cl_4"],[test_b_cl_4, "test_b_cl_4"]]

for i in ds:
  i[0].to_csv(f"data/raw_data/{i[1]}.csv")


### Unbalanced with Punctuation

In [23]:
train_unb_clean = all_in_one(train_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,False,True])
val_unb_clean = all_in_one(val_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,False,True])
test_unb_clean = all_in_one(test_df_raw, text_col, selected_col, concatenate=False, func_to_exec=[True, True,True,True,True,True,True,False, True,False,True])


Amount of duplicates on : text
False    37351
True       269
Name: count, dtype: int64
drop_duplicates_from_one_col executed in 0.03 seconds, using up to 3.18MB of RAM
Amount of NaN on : text             1
sexist_binary    0
idx              0
dtype: int64
drop_na executed in 0.02 seconds, using up to 2.36MB of RAM
urls_remover executed in 0.84 seconds, using up to 2.74MB of RAM
username_remover executed in 0.4 seconds, using up to 2.38MB of RAM
emoji_replacer executed in 34.9 seconds, using up to 9.55MB of RAM
hashtag_adapter executed in 0.4 seconds, using up to 2.75MB of RAM
remove_punctuation executed in 0.62 seconds, using up to 8.92MB of RAM
remove_accents executed in 0.58 seconds, using up to 2.35MB of RAM
strip executed in 0.03 seconds, using up to 2.43MB of RAM
✅ All in One is done
Amount of duplicates on : text
False    4177
True        4
Name: count, dtype: int64
drop_duplicates_from_one_col executed in 0.01 seconds, using up to 0.36MB of RAM
Amount of NaN on : text          

In [24]:
ds = [[train_unb_clean, "train_unb_clean"], [val_unb_clean, "val_unb_clean"],[test_unb_clean, "test_unb_clean"]]

for i in ds:
  i[0].to_csv(f"data/raw_data/{i[1]}.csv")


In [25]:
train_unb_clean['sexist_binary'].value_counts()


sexist_binary
0    29480
1     7870
Name: count, dtype: int64

["For a housewife that is good, they're usually too delicate to ride motorbikes",
 "Personally I think mothers can't fishing because it's not in their genes"]