In [327]:
import os

# If the current working directory path contains the substring "notebooks",
# move one directory up using the IPython `%cd` magic.
# NOTE(review): presumably this makes the `src.` imports below resolvable from
# the project root — confirm. The check is a substring match on the whole path,
# so any ancestor directory containing "notebooks" also triggers it.
if "notebooks" in os.getcwd():
    %cd ..
from pathlib import Path

import matplotlib.pyplot as plt
import math
import pandas as pd
import numpy as np
import seaborn as sns
from src.utils.defines import INTERIM_DATA_DIR ,AUGMENTED_DATA_DIR, PROCESSED_DATA_DIR, RAW_DATA_DIR
import matplotlib.patches as mpatches

In [2]:
# Read the raw unlabelled Reddit and Gab CSV files from the raw-data directory.
unlabeled_reddit = pd.read_csv(Path(RAW_DATA_DIR) / "reddit_1M_unlabelled.csv")
unlabeled_GAB = pd.read_csv(Path(RAW_DATA_DIR) / "gab_1M_unlabelled.csv")

In [6]:
# Give every row a string key: its index label followed by a source tag
# (e.g. "0reddit", "0GAB"). The frames are modified in place.
for frame, source_tag in ((unlabeled_reddit, 'reddit'), (unlabeled_GAB, 'GAB')):
    frame['key'] = frame.index.astype(str) + source_tag

In [11]:
# Maximum number of rows per chunk.
n = 200000


def _split_into_chunks(frame, chunk_size):
    """Split ``frame`` into consecutive row slices of at most ``chunk_size`` rows.

    The number of slices is derived from the length of ``frame`` itself, so no
    rows are dropped and no empty trailing slices are produced. An empty frame
    yields an empty list.
    """
    piece_count = math.ceil(len(frame) / chunk_size)
    return [
        frame.iloc[start * chunk_size:(start + 1) * chunk_size]
        for start in range(piece_count)
    ]


# Kept for backward compatibility: number of pieces for the Reddit frame.
num_pieces = math.ceil(len(unlabeled_reddit) / n)

# Split each dataframe using its OWN length. Previously the Reddit piece count
# was reused for the Gab frame, which would silently drop Gab rows if it was
# longer, or create empty chunks if it was shorter.
dfs_unlabeled_reddit = _split_into_chunks(unlabeled_reddit, n)
dfs_unlabeled_GAB = _split_into_chunks(unlabeled_GAB, n)

In [13]:
# Write every chunk to its own CSV (relative path, i.e. the current working
# directory), omitting the index column.
for i, chunk in enumerate(dfs_unlabeled_reddit):
    reddit_name = f'df_{i}reddit.csv'
    chunk.to_csv(reddit_name, index=False)

for i, chunk in enumerate(dfs_unlabeled_GAB):
    gab_name = f'df_{i}GAB.csv'
    chunk.to_csv(gab_name, index=False)

In [152]:
######################################################################################################
# REFORMAT OUTPUT FROM UNLABELED - task a
# Read the task-a probability file and the unlabelled data file from the
# processed-data directory.
# NOTE(review): the variable is named `unlabeled_data_10k` but the file loaded
# is "unlabeled_data_350k.csv".
task_a_unlabeled = pd.read_csv(Path(PROCESSED_DATA_DIR) / "probs_task_a_350k.csv")
unlabeled_data_10k = pd.read_csv(Path(PROCESSED_DATA_DIR) / "unlabeled_data_350k.csv")

In [153]:
# Build the labelled frame: join probabilities and texts on their index, then
# derive the label and text columns and discard the raw ones.
#   * '0_x' / '1' come from `task_a_unlabeled`, '0_y' from `unlabeled_data_10k`
#     (pandas' default merge suffixes for the overlapping column name '0').
#   * target_a is the string '0' when '0_x' is strictly greater than '1',
#     otherwise '1' (ties therefore map to '1').
merged_df = (
    task_a_unlabeled
    .merge(unlabeled_data_10k, left_index=True, right_index=True, how='inner')
    .assign(
        target_a=lambda joined: np.where(joined['0_x'] > joined['1'], '0', '1'),
        text=lambda joined: joined['0_y'],
    )
    .drop(columns=['0_x', '1', '0_y'])
)

merged_df

Unnamed: 0,target_a,text
0,0,What Ripple does is extremely difficult to dec...
1,0,"You probably won't answer, but roughly how old..."
2,0,[USER] That really is what it seems like.
3,0,This time on Pimptard-Wifestitute: Back in 199...
4,0,"Sometimes loading a page is a dramatic, Herucl..."
...,...,...
349995,0,It's OK to be mongrel.
349996,0,[USER] Ohhh so the Red Cross actually document...
349997,0,John Lennon was a hypocritical sack of shit.
349998,0,Remember this? German girl asks where are all ...


In [158]:
# Keep only the rows whose target_a label is the string '1' and save them
# (label column included) to a CSV in the current working directory.
sexist_new = merged_df[merged_df['target_a'].eq('1')]
sexist_new.to_csv(f'GAB_task_b.csv', index=False)

# Save the same rows again without the label column.
# NOTE(review): this is written with a bare relative filename, while a later
# cell reads "unlabeled_task_b_GAB.csv" from PROCESSED_DATA_DIR — confirm the
# file is moved there (or that both locations coincide) before re-running.
sexist_new = sexist_new.drop(columns=['target_a'])
sexist_new.to_csv("unlabeled_task_b_GAB.csv", index=False)
######################################################################################################

In [None]:
# Preview the first rows of the merged frame.
merged_df.head()

In [None]:
# Preview the first rows of the merged frame (identical to the preceding cell).
merged_df.head()

In [154]:
# One-column frame (column 0) listing the row positions whose target_a is '1'.
pd.DataFrame(np.flatnonzero((merged_df.target_a == '1').to_numpy()))

Unnamed: 0,0
0,23
1,106
2,199
3,230
4,358
...,...
6536,349487
6537,349645
6538,349672
6539,349806


In [157]:
# Look up the value of column '0' for index label 349672.
unlabeled_data_10k.loc[349672, '0']

"Feminine NOT Feminist, I won't surrender my power over men without a fight. My philosophy on why not to dress like a slut."

In [208]:
######################################################################################################
# REFORMAT OUTPUT FROM UNLABELED - task b
# Read the task-b probability file and the matching unlabelled texts from the
# processed-data directory.
# NOTE(review): an earlier cell writes "unlabeled_task_b_GAB.csv" with a bare
# relative filename — confirm that copy is the one found here.
task_b_labeled = pd.read_csv(Path(PROCESSED_DATA_DIR) / "probs_task_b.csv")
unlabeled_task_b_GAB = pd.read_csv(Path(PROCESSED_DATA_DIR) / "unlabeled_task_b_GAB.csv")

In [273]:
# Rows whose largest value exceeds this threshold are kept as pseudo-labels.
confidence_threshold = 0.90
test = np.array(task_b_labeled)

# Per-row maximum; rows strictly above the threshold are "confident".
top_prob = test.max(axis=1)
most_confident_indices = np.flatnonzero(top_prob > confidence_threshold)

# The predicted class is the column position of the per-row maximum.
most_confident_preds = pd.DataFrame(
    {'target_b': test.argmax(axis=1)[most_confident_indices]}
)

In [274]:
# Pick the texts at the confident row positions and renumber them from 0.
# NOTE(review): assumes rows of `unlabeled_task_b_GAB` are in the same order as
# the rows of the probability file — confirm.
text_task_b = unlabeled_task_b_GAB.take(most_confident_indices).reset_index(drop=True)

In [275]:
# Place the selected texts and their predicted labels side by side.
final_task_b = pd.concat((text_task_b, most_confident_preds), axis="columns")
final_task_b

Unnamed: 0,text,target_b
0,I probably wouldn't brag about that because us...,3
1,Now I feel like we should just rib that whore....,0
2,Not everyone can be brave and talk shit online...,2
3,"And if you don't want to hear racist crap, SEL...",2
4,Junior high level 'thinking you're prescient' ...,2
...,...,...
1678,Read this and realize women are kind of stupid...,1
1679,"You know, maybe being an unattractive virgin i...",1
1680,You're right I don't understand nor care to. H...,2
1681,Moochelle-ms don't plan ahead. When 85y.o. mar...,2


In [302]:
# Fill the columns the pseudo-labelled frame lacks with placeholders, put the
# columns in the order rewire_id, text, target_a, target_b, target_c, and save.
# NOTE(review): rewire_id, target_a and target_c all receive the SAME unseeded
# random floats, so they are neither meaningful labels nor reproducible —
# confirm that downstream code ignores them.
dummy_values = np.random.rand(len(final_task_b))
df = (
    final_task_b
    .copy()
    .assign(**dict.fromkeys(('target_a', 'target_c', 'rewire_id'), pd.Series(dummy_values)))
    .reindex(columns=['rewire_id', 'text', 'target_a', 'target_b', 'target_c'])
)
df.to_csv("task_b_GAB_aug.csv", index=False)

In [310]:
# Display the frame currently bound to `df`.
df

Unnamed: 0,rewire_id,text,target_a,target_b,target_c
0,0.530177,I probably wouldn't brag about that because us...,0.530177,3,0.530177
1,0.849003,Now I feel like we should just rib that whore....,0.849003,0,0.849003
2,0.339105,Not everyone can be brave and talk shit online...,0.339105,2,0.339105
3,0.560390,"And if you don't want to hear racist crap, SEL...",0.560390,2,0.560390
4,0.692593,Junior high level 'thinking you're prescient' ...,0.692593,2,0.692593
...,...,...,...,...,...
1678,0.254854,Read this and realize women are kind of stupid...,0.254854,1,0.254854
1679,0.709994,"You know, maybe being an unattractive virgin i...",0.709994,1,0.709994
1680,0.775813,You're right I don't understand nor care to. H...,0.775813,2,0.775813
1681,0.628161,Moochelle-ms don't plan ahead. When 85y.o. mar...,0.628161,2,0.628161


In [311]:
# Read the synonym-replacement augmented training file from the augmented-data directory.
train_aug_synonym1 = pd.read_csv(
    Path(AUGMENTED_DATA_DIR) / "train_augmented_synonym_replacement_emb_augmax_1.csv"
)

Unnamed: 0,rewire_id,text,target_a,target_b,target_c
0,0.530177,I probably wouldn't brag about that because us...,0.530177,3,0.530177
1,0.849003,Now I feel like we should just rib that whore....,0.849003,0,0.849003
2,0.339105,Not everyone can be brave and talk shit online...,0.339105,2,0.339105
3,0.56039,"And if you don't want to hear racist crap, SEL...",0.560390,2,0.560390
4,0.692593,Junior high level 'thinking you're prescient' ...,0.692593,2,0.692593
...,...,...,...,...,...
13995,sexism2022_english-12034,I blew a line at a record like that. Girl on t...,0.000000,-1,-1.000000
13996,sexism2022_english-5228,girls really get fucked around every weekend?,0.000000,-1,-1.000000
13997,sexism2022_english-10140,The hatred for moslems are Reasonable and Just...,0.000000,-1,-1.000000
13998,sexism2022_english-9726,Now this is a woman actually gets it. 👆,0.000000,-1,-1.000000


In [335]:
from src.data.text_processing import TextPreprocessor

# Preprocessor constructed with preprocessing_mode='none'.
text_preprocessor = TextPreprocessor(preprocessing_mode='none')

# Stack the interim training file on top of the augmented rows, run the text
# column through the preprocessor, and convert the result to a NumPy array.
df = pd.read_csv(Path(INTERIM_DATA_DIR) / "train.csv")
test = (
    pd.concat([df, train_aug_synonym1])
    .assign(text=lambda stacked: text_preprocessor.transform_series(stacked['text']))
    .to_numpy()
)

In [339]:
# Display the object currently bound to `test`.
test

array([['sexism2022_english-16993', "Then, she's a keeper. 😉", 0, -1, -1],
       ['sexism2022_english-13149',
        'This is like the Metallica video where the poor mutilated bastard was saying "Please kill me" over and over again, only with emojis instead of Morse code.',
        0, -1, -1],
       ['sexism2022_english-13021', 'woman?', 0, -1, -1],
       ...,
       ['sexism2022_english-10140',
        "The hatred for moslems are Reasonable and Justified. They openly call for our subjugation, the rapes of our women and children, our massacres, and our death. I'm not going to coexist with them.",
        0, -1, -1],
       ['sexism2022_english-9726',
        'Now this is a woman actually gets it. 👆', 0, -1, -1],
       ['sexism2022_english-13365',
        '“American Idol” finalist [USER] said nothing is going to stop her from performing at The [USER] on #IndependenceDay, calls the chance a once-in-a-lifetime opportunity. [URL] [URL]',
        0, -1, -1]], dtype=object)

In [338]:
# Show the column at position 3 of the `test` array.
# NOTE(review): presumably `target_b`, given the column order
# rewire_id, text, target_a, target_b, target_c shown for these frames — confirm.
test[:, 3]

array([-1, -1, -1, ..., -1, -1, -1], dtype=object)

In [306]:
# Read "task_b_GAB_aug.csv" from the augmented-data directory.
# NOTE(review): the cell that produces this file writes it with a bare relative
# filename — confirm the file is moved into AUGMENTED_DATA_DIR before this runs.
test = pd.read_csv(Path(AUGMENTED_DATA_DIR) / "task_b_GAB_aug.csv")

In [307]:
# Display the object currently bound to `test`.
test

Unnamed: 0,rewire_id,text,target_a,target_b,target_c
0,0.530177,I probably wouldn't brag about that because us...,0.530177,3,0.530177
1,0.849003,Now I feel like we should just rib that whore....,0.849003,0,0.849003
2,0.339105,Not everyone can be brave and talk shit online...,0.339105,2,0.339105
3,0.560390,"And if you don't want to hear racist crap, SEL...",0.560390,2,0.560390
4,0.692593,Junior high level 'thinking you're prescient' ...,0.692593,2,0.692593
...,...,...,...,...,...
1678,0.254854,Read this and realize women are kind of stupid...,0.254854,1,0.254854
1679,0.709994,"You know, maybe being an unattractive virgin i...",0.709994,1,0.709994
1680,0.775813,You're right I don't understand nor care to. H...,0.775813,2,0.775813
1681,0.628161,Moochelle-ms don't plan ahead. When 85y.o. mar...,0.628161,2,0.628161
