In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [21]:
df_dict = dict()

PATH = "/mounts/data/proj/faeze/data_efficient_hate/datasets/main/"

for file in sorted(os.listdir(PATH+'0_raw')):
    if 'measure' in file:
        continue
    if "ipynb" not in file:
        if file.endswith(".csv"):
            print(re.sub('\.csv$', '', file))
            df_dict[re.sub('\.csv$', '', file)] = pd.read_csv(f"{PATH}/0_raw/{file}", on_bad_lines='skip')
        else:
            print(re.sub('\.tsv$', '', file))
            df_dict[re.sub('\.tsv$', '', file)] = pd.read_csv(f"{PATH}/0_raw/{file}", on_bad_lines='skip', delimiter='\t')

bas19_es
dyn21_en
for19_pt
fou18_en
gahd24_de
has19_hi
has20_hi
has21_hi
implicit_en
ken20_en
ous19_ar
ous19_fr
san20_it
xdomain_en
xdomain_tr
xplain_en


## Reformat columns
Need separate logic for different datasets. 1 is for hateful, 0 for non-hateful.

In [22]:
df_dict["implicit_en"]['class'].replace({"implicit_hate":1, "explicit_hate":1, "not_hate":0}, inplace=True)
df_dict["implicit_en"].columns = ['text', 'label']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dict["implicit_en"]['class'].replace({"implicit_hate":1, "explicit_hate":1, "not_hate":0}, inplace=True)
  df_dict["implicit_en"]['class'].replace({"implicit_hate":1, "explicit_hate":1, "not_hate":0}, inplace=True)


In [23]:
df_dict["implicit_en"].label.value_counts()

label
0    13291
1     8189
Name: count, dtype: int64

In [24]:
# Dynabench 2021 / English
df_dict["dyn21_en"].label.replace({"hate":1, "nothate":0}, inplace=True)

# Founta 2018 / English
df_dict["fou18_en"].label.replace({'hateful': 1, "abusive": 0, "normal": 0, "spam": 0}, inplace = True)

# Kennedy 2020 / English
df_dict["ken20_en"].rename(columns={"label_hate_maj": "label"}, inplace=True)

# Fortuna 2019 / Portuguese
df_dict["for19_pt"].rename(columns={"hatespeech_comb": "label"}, inplace=True)

# Basile 2019 / Spanish
df_dict["bas19_es"].rename(columns={"HS": "label"}, inplace=True)

# Sanguinetti 2020 / Italian
df_dict["san20_it"].rename(columns={"hs": "label"}, inplace=True)

# Ousidhoum 2019 / Arabic & French
for d in ["ous19_ar", "ous19_fr"]:
    df_dict[d]["label"] = df_dict[d].sentiment.apply(lambda x: 1 if "hateful" in x else 0)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    df_dict[d]["text"] = df_dict[d].tweet.apply(lambda x: x.replace("@url", "http"))

# HASOC 19, 20 and 21 / Hindi
for d in ["has19_hi", "has20_hi", "has21_hi"]:
    df_dict[d]["label"] = df_dict[d].task_2.apply(lambda x: 1 if x=="HATE" else 0)

# drop redundant columns
for dataset in df_dict:
    if "split" in df_dict[dataset].columns:
        df_dict[dataset] = df_dict[dataset][["text", "label", "split"]]
    else:
        df_dict[dataset] = df_dict[dataset][["text", "label"]]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dict["dyn21_en"].label.replace({"hate":1, "nothate":0}, inplace=True)
  df_dict["dyn21_en"].label.replace({"hate":1, "nothate":0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dict["fou18_en"].label.replace({'hateful': 1, "abusive": 0, "normal": 0, "spam": 

## Clean text

In [25]:
def clean_text(text):
    text = unescape(text)
    text = re.sub(r"@[A-Za-z0-9_-]+",'@user',text) # format expected by XLM-T
    text = re.sub(r"http\S+",'http',text) # format expected by XLM-T
    text = re.sub(r"\n",' ',text)
    text = re.sub(r"\t",' ',text)
    text = text.replace("[URL]", "http") # format expected by XLM-T
    text = text.strip()
    return text

for dataset in df_dict:
    df_dict[dataset]["text"] = df_dict[dataset].text.apply(lambda x: clean_text(x))

In [26]:
# boost Kennedy 2020 / English to have 50% hate (up from ca. 30%)
df_dict["ken20_en"] = pd.concat([df_dict["ken20_en"][df_dict["ken20_en"].label==1], df_dict["ken20_en"][df_dict["ken20_en"].label==0].sample(11596, random_state=123)]).sample(frac=1, random_state=123)

# boost Founta 2018 / English to have 22% hate, which is max possible (up from ca. 5%)
df_dict["fou18_en"] = pd.concat([df_dict["fou18_en"][df_dict["fou18_en"].label==1], df_dict["fou18_en"][df_dict["fou18_en"].label==0].sample(17600, random_state=123)]).sample(frac=1, random_state=123)


In [27]:
df_dict["fou18_en"]["text"]

75975                                          RT @user: WE FUCKING DID IT!!!!!!!!!!!!!!!! ALL LOADED UP TO GO HOME TO FREEDOM!!!!!! http
28386                 @user Although one might argue that an attachment to free-floating commercialism may not be entirely unrelate… http
96446                             #ImpeachTrump #Trump #RussiaGate #Resist RT @user: Islamic State says U.S. 'being run by an idiot' http
54218                  @user The official gov stats show that the median earnings of FT female workers is 77 percent of FT male wor… http
98868                                                                               This shit way too crazy ayy! You do not amaze me ayy!
                                                                       ...                                                               
42289                                     My Twitter is worth $2,811.90!. Get your Twitter Value FREE at http #free #tools #freefollowers
41282          Takaharu for exampl

In [28]:
texts_list = []
texts_set = set()
for dataset in df_dict:
    for i in df_dict[dataset]["text"]:
        texts_list.append(i)
        if i in texts_set:
            print(dataset, i)
        texts_set.add(i)

len(texts_list), len(texts_set)

bas19_es @user @user Cállate perra
bas19_es @user @user Callate puta
bas19_es @user @user Cállate puta
bas19_es @user @user @user Cállate perra
bas19_es @user @user @user Cállate perra
bas19_es @user @user Cállate perra
bas19_es @user @user Cállate puta
bas19_es @user @user Callate hijo de perra
bas19_es @user @user Cállate perra
bas19_es @user @user @user @user Callate puta
bas19_es @user @user Cállate puta
bas19_es @user Cállate puta
bas19_es #NOME?
bas19_es @user Callate puta
bas19_es @user @user Callate puta
bas19_es @user Callate hija de puta
bas19_es @user Callate puta
bas19_es @user Cállate zorra
bas19_es @user cállate perra
bas19_es @user callate puta
bas19_es @user callate puta
bas19_es @user @user Cállate zorra
bas19_es @user Cállate perra
bas19_es @user Cállate perra
bas19_es @user Cállate zorra
bas19_es @user Cállate puta
bas19_es @user Cállate puta
bas19_es @user @user Cállate puta barata
bas19_es @user Cállate puta
bas19_es @user @user Callate puta
bas19_es @user Cállate 

(276931, 273794)

## Show descriptive stats

In [10]:
def descriptive_stats(df):
    n_total = df.shape[0]
    n_hate = df.label.sum()
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, n_hate/n_total))
    return df.label.sum()/len(df), len(df)

for dataset in df_dict:
    print(dataset.upper())
    descriptive_stats(df_dict[dataset])
    display(df_dict[dataset].sample(5))
    print()

BAS19_ES
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
184,// creo que algum vecino esta escuchando reggaeton pero no lo tengo claro porque suena como si coges una música árabe y le metes autotune rollo robótico,0
3772,Otro montaje sobre los refugiados ahogándose en el mar http,0
1949,"Arabe es claveee, dio me vuelvo locaa me encanta! @user",0
1916,"""El año pasado me teñí de rubia para ir a la CDMX y no indignarme porque me dijeran """"güera"""". http",0
1676,"a VER SI ES VERDAD Y, dejando de ser un bulo, se impone un poco de cordura en Europa . http",0



DYN21_EN
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label,split
15009,I whish I had been there only to enjoy the smell of the burning LGBT flag!!!,1,train
201,the NHS is rubbish,0,dev
23117,I actually believe religion is about controlling women. Men can gain a personal servant and they believe they are entitled to that.,0,dev
2520,I want to fucking kill myself because I live next to a tranny,1,train
17242,"The immigration rules violate personal liberty, insult your intelligence, and are symbol of ignorance and tyranny. I have not spoken to one chinese person since the start of this plandemic and have tried to encourage others to be brave enough not to comply. Unfortunately, most people are cowards. I quit my job and stopped doing business at a number of places which either insulted me by asking me to not voice my opinion one or in a few cases denied me service. Can someone explain why it is so hard to get the average person to grow some balls?""",1,train



FOR19_PT
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
188,A @user fazer greve...não trabalhar..É um alívio para o BRASIL... já que ela só faz atrapalhar o país mesmo. Devo _ http,1
4538,"RT @user: queria comer polvo, mas o da copa gerou apego",0
4595,"RT @user: Com este champô não ganhas caspa, não ganhas oleosidade, não ganhas pontas espigadas, não ganhas nada! http _",0
1068,25 de julho: Dia Internacional da Mulher Negra Latino Americana e Caribenha... Então hoje é o dia só da mulher branca? #DiadaMulher,1
868,Chupa nazistinha! Vai daí @user http,1



FOU18_EN
22565 entries, of which 4965 (22.00%) are hateful.


Unnamed: 0,text,label
5444,Spoiler alert:Charlotte's going to be a 5 time SD Women's Champion by the end of the year http,0
81139,This week White Supremacists broke into this Iranian refugee's Oregon home & destroyed it. Total silence from the… http,1
66135,I'm shocked. It was impossible to predict the guy that said he had proof Obama was born in Kenya would turn out to be a pathological liar.,0
20818,"Descending into uncertainty is a healthy journey today, even i... More for Gemini http",0
89869,Going to the career fair made me realize how much I really don't know what I want to do the rest of my life 😳,0



GAHD24_DE
10996 entries, of which 4666 (42.43%) are hateful.


Unnamed: 0,text,label,split
4773,"Dieser Sommer ist nur ein Vorgeschmack darauf, was uns in Zukunft öfter und intensiver erwartet.",0,train
3491,Bünzlis essen nur Rösti mit Bauernspeck. #fact,0,train
9742,"Wenn sich herausstellen würde, dass mein Sohn rechtsradikal ist, würde ich mich so schämen, dass ich es niemals akzeptieren könnte",0,test
6045,"Sobald Sie zulassen, dass andere Kulturen Sie überlegen und ihre Gesetze und Bräuche auf Sie erzwingen, gibt es ein Problem.",0,train
1564,„UNAPOLOGETISCH WEISSE SEGREGATION MIT REPRÄSENTATION!“ Das Motto vieler Rassisten auf dieser Welt.,0,test



HAS19_HI
5983 entries, of which 746 (12.47%) are hateful.


Unnamed: 0,text,label,split
5348,जम्मू कश्मीर में किसी को डरने की जरूरत नहीं है क्योंकि वह सारे पत्थरबाज दिल्ली में आ गए हैं और दिल्ली के मंदिरों में तोड़फोड़ चालू है सरकार को विश्वास जीतना है मुसलमानों का तो खुद सिर पर पत्थर खाए ना कि हिंदुओं की आस्था और मंदिरों पर चोट होने दे,1,test
1149,"5-6 जिहादी मिलकर 1 हिन्दू को मार दिए हिन्दू लड़के की गलती सिर्फ इतनी थी कि वो नशे की हालत में किसी और घर मे घुस गया और लड़की के पुकारने पर डर के भाग गया। मामला हिन्दू लड़के की मौत का है इसलिए न्यूज़ में नही आई, लड़का मुस्लिम होता तो होड़ मच जाती न्यूज़ चैनलस पर TRP के लिए।",1,train
1478,कोई गुनाह नहीं है भाई...इन हरामी सूअरों को पेलते रहो और जो चूतिये इनके बचाव में बोले उसे भी रेल दोअब मिंनट आरजू करने का समय खत्म हुआ,0,train
3899,@user आपका जिस्म निचोडने का मोका तो दो..DM मे एक रिप्लाय देदो फिर देखो लुंड और चुत की टक्कर surgical strike कर देंगे चुत पर,0,train
3355,"दिल्ली के 7 MP इनके हैं, दिल्ली पुलिस इनकी है। केंद्र में सरकार इनकी बैठी है। अब अगर वहाँ सुअर 'अल्लाह हू अकबर' बोलकर एक मंदिर में तोड़फोड़ करते है। कहीं मीडिया में कोई खबर नही। ये कौन सा सबका साथ, सबका विकास है ?? #मुस्लिम",1,train



HAS20_HI
4232 entries, of which 347 (8.20%) are hateful.


Unnamed: 0,text,label,split
1290,मोदीजी के डर से चीन के राष्ट्रपति ने RSS की सदस्यता ग्रहण कर दी। मोदी हैं तो मुमकिन है। सुना है 23 मई को @user @user @user भी RSS जॉइन करेंगे। 😊😊 http,0,train
1905,@user @user देश की जनता और देश की अदालत यह जान चुकी है कि कौन चोर है और कौन चौकीदार है जो चोरी कर रहा है यह सब जो भी है वह आप ही के पार्टी के लोग हैं,0,train
1622,RT @user: फांसी के फंदे को भी ऐसे पापियों से घिन आती होगी इनको चौराहे पर कोङे बरसा के छोड़ देना चाहिए मरने के लिए ताकि जैसे जैसे…,0,train
1552,RT @user: मैं अपने छोटे मुख से कैसे करूँ तेरा गुणगान माँ तेरे प्यार आगे फीका सा लगता जहान #HappyMothersDay गुरू माँ नसीब कौर…,0,train
623,"RT @user: चलो चले माँ!सपनों के गाँव में, काँटों से दूर कहीं, फूलों की छाँव में, चलो चले माँ... हो रहे इशारे रेशमी घटाओं में, चलो चले…",0,train



HAS21_HI
4594 entries, of which 566 (12.32%) are hateful.


Unnamed: 0,text,label,split
118,@user @user ONCE IN A CENTURY @user @user #मनहुस_मोदी का साया मेरे देश के ऊपर से हट जाएगा आर्मी सारा काम संभाल रही है दवाई ऑक्सीजन आदि भी तेरे राजनीति से मुक्त हो जायेगी सभी के अपने अकालमृत्यु मर रहे है #ResignModi मानवता केलिऐ #Resign_PM_Modi देशभक्त हो तो,0,train
3502,@user elas falando: fo ekcekankoefnfsseseesromfln я сукаमैं एक कुतिया हूँ,0,train
2810,चुप रहो विपक्ष राष्ट्रवाद चरम पर है। http,0,train
214,@user दही तो तुम्हारे मुंह मे भी जमी हुई थी 😠😠 #JusticeForShahabuddin,0,train
399,@user @user @user @user @user ONCE IN A CENTURY @user @user #मनहुस_मोदी का साया मेरे देश के ऊपर से हट जाएगा आर्मी सारा काम संभाल रही है दवाई ऑक्सीजन आदि भी तेरे राजनीति से मुक्त हो जायेगी सभी के अपने अकालमृत्यु मर रहे है #ResignModi मानवता केलिऐ #Resign_PM_Modi देशभक्त हो तो,0,train



IMPLICIT_EN
21480 entries, of which 8189 (38.12%) are hateful.


Unnamed: 0,text,label
17897,i also respect people like hunter wallace and matt heimbach . stand up guys .,0
19446,"sub-saharan africans, the majority of american black lineage, are subhuman. the genetic data is clear. no matter what the cucks wish.",1
12638,rajan's handling of foreign exchange rates completely nullified the gains of fall in #crudeoil prices big national loss,0
13290,he is always talking about killing white people but he couldn't kill couldn't kill a fly !,0
14577,the alt right should be tearing down american history with the left because once history is erased the new can be born .,0



KEN20_EN
23192 entries, of which 11596 (50.00%) are hateful.


Unnamed: 0,text,label
14741,"Let's just forget the two mass shootings within 24 hours both done by white supremacists that didn't have a voice or platform before the fat, useless, racist traitor in the Whitehouse gave them one.",1
8933,Fuck feminists they should all die,0
26799,can u losers just get laid already so u can stop tweeting graphic details ab ur sexual desires,0
4560,"> I thought I was being a devout Hellenic using my wits to properly respect the gods. If there's one fucking thing Hellenic gods cannot *abide*, it's a mortal who thinks they's smarter than they are. They'll get that clever son of a bitch every time. Source: literally every Greek myth.",1
29988,@user Girl no ain't nobody boutta b sucking dick all night 😂😂😂😂 I just wanna lay on my nigga chest and hit some j's dang 😭😭😭😭,0



MEASURE_EN
22026 entries, of which 10334 (46.92%) are hateful.


Unnamed: 0,text,label,split
9025,">Jerking off to any cocks is gay. Jerking off to black cocks is mega cuck and mega gay Okay we've established how to define gay and mega gay, but how do we define fully automated luxury gay space communism?",0,train
9508,That bitch is responsible for the deaths of American citizens. She should be in Guantanamo along with the fucking Kenyan.,1,train
21664,this nigga a anime character URL,1,test
18805,Lol at this homo projecting gay shit onto normal art.,0,test
17048,Well we all know what happens when Germans take things seriously.....,0,validation



OUS19_AR
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
38,والله هندوس ع وش اللي جابوهم السبت,0
1639,بس ي بايره,0
2718,@user وبالنسبة للأطفال اللي بتقتلوهم يا خنزير,0
1190,وفي كساد البنات عن الزواج ..البنت بايرة.,0
2181,@user @user على اساس انتي بروفيسورة بعلم شراب بول البعير الطازج ومتفتهمين كلامي,0



OUS19_FR
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
1701,ptdr il la découpé dirait les renois au foot staprem http,0
600,les rebeus dans les brocantes vous êtes pas possible,0
1879,@user @user @user @user @user comment débattre entre gauchiste sans aucune contradiction,0
742,c’est un gauchiste normal qu’il soit con http,0
1616,@user t’es attardé tu m’as unfollow,0



SAN20_IT
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label,split
5158,"@user Tanto a te che ti frega...In Svizzera,fuori dalla UE,i migranti clandestini non arrivano.Non devi vivere con un C.I.E. vicino casa,con all'interno 70 risorse,non devi subire il furto di 3 biciclette,e 2 furti in casa.Comodo farsi bella,con il culo degli altri.IPOCRITA...",1,train
5214,"@user I BUONISTI IPOCRITI ACCOLGANO NELLE LORO CASE LE FAMIGLIE STRANIERE, NON NELLE SCUOLE CON I SOLDI DEGLI ITALIANI, MENTRE L'ISLAM DISTRUGGE LE CHIESE E NON PERMETTE LA LORO COSTRUZIONIE, PERCIO' TUTTI A CASA VOSTRA....",1,train
119,"Milano, polizia arresta ladri rom e sgombera il loro camper: la gente applaude http",0,train
3746,@user possibile che non possiamo cacciare questi maledetti rom dall'italia? Non ci rimane che votare lega nord.,1,train
2317,E di nuovo va a trovare i musulmani... E di nuovo cn i migranti e di nuovo...E ancora.#Francesco 👎,0,train



XDOMAIN_EN
56100 entries, of which 10890 (19.41%) are hateful.


Unnamed: 0,text,label,split
26496,"Anytime I remember Gareth Bale is stuck on the Madrid bench I feel real pain man, was so so good and is now the real madrid equivalent of Ozil ffs http",0,train
45571,HAHAHAHAHA Turks are so mad i love to see it 🤣🤣🤣🤣🤣 they were saying they will destroy Latvia 🤣🤣🤣,1,test
17313,Joe Biden under probe in Ukraine for alleged link to top prosecutor’s 2016 ouster: report http #FoxNews,0,train
47499,YEP OFF TWITTER UNTIL 7 THIS BOUT TO GET OUTTA HAND IMMA EAT THIS STUFFED CRUST PIZZA AND NAP THEN HOPEFULLY THIS GENDER WAR WILL HAVE DIED DOWN,0,test
36387,Just tryna be thicc like Nic #body #bodybuilding #bigbutt #LGBTQ http,0,validation



XDOMAIN_TR
45159 entries, of which 19269 (42.67%) are hateful.


Unnamed: 0,text,label,split
18156,"Mazinde bin şike yatar, geber fenerbahçe. Ne mutlu seni yenene, geber fenerbahçe #KadıkoeydeKaraGece",1,train
38754,"Bugün çok acaip haberler okudum, Adam imam nikahlı karısının kapısına bomba düzeneği kurmuş 😳 Başka bir adam başka bir hanımı 1570 farklı numaradan taciz etmiş. 😳",0,test
42311,LGBT pisliği yüzünden Gökkuşağından nefret ettim,1,test
43910,Ruslar dünyaya duyurdu! Bayraktarlara karşı bir süper silah... @user http,0,test
25912,KAZA DEĞİL CİNAYET! İstanbul’da 3 farklı geriye düşmesine rağmen muhteşem bir geri dönüş sergileyerek skoru dengeleyen Bursaspor’da coşku hakemin hatalı kararları nedeniyle camiada yaşanan tepkinin gölgesinde kaldı. http http,1,train



XPLAIN_EN
13749 entries, of which 7814 (56.83%) are hateful.


Unnamed: 0,text,label,split
9345,<user> <user> we are iraqi refugees we ask you to process our delayed files please standwithiraqirefugees,1,train
10010,the below countries adamantly refused syrian refugees and their reasons were exactly the same they are way too dangerous to allow in libtards are putting themselves especially their children in extreme danger by supporting them,1,train
13304,<user> do not understand why in america it all about black or white there are yellow mongoloids and brown orientals people as well most suppressed ones never get to raise their voice v unbeatable loosing in agt is an example to show how racially biased americans are,1,test
687,i am an ashkenazi jew,1,train
1539,<user> gay asf of u to say this kisses u,1,train





In [82]:
new_dict = {}
# new_dict["xdomain_tr"] = df_dict["xdomain_tr"]
# new_dict["xdomain_en"] = df_dict["xdomain_en"]
# new_dict["gahd24_de"] = df_dict["gahd24_de"]
new_dict["xplain_en"] = df_dict["xplain_en"]
# new_dict["measure_en"] = df_dict["measure_en"]
df_dict = new_dict

In [83]:
df_dict.keys()

dict_keys(['xplain_en'])

In [84]:
df = new_dict['xplain_en']

In [85]:
df.split.value_counts()

split
train         10999
test           1376
validation     1374
Name: count, dtype: int64

In [89]:
train_df = df[df['split'] == 'train']
dev_df = df[df['split'] == 'validation']
test_df = df[df['split'] == 'test']
# Adjust dev and test sizes
dev_to_test = dev_df.sample(n=2000 - len(test_df), random_state=123)  # Sample 354 from dev to add to test
remaining_dev = dev_df.drop(dev_to_test.index)

# Create new test set by combining existing test and sampled dev
new_test_df = pd.concat([test_df, dev_to_test])

# Split the remaining dev into dev (500 instances) and extra_dev
new_dev_df = remaining_dev.sample(n=500, random_state=123)
extra_dev_df = remaining_dev.drop(new_dev_df.index)

train_df = train_df.drop(columns=['split'])
new_dev_df = new_dev_df.drop(columns=['split'])
new_test_df = new_test_df.drop(columns=['split'])
extra_dev_df = extra_dev_df.drop(columns=['split'])

In [101]:
path = '/mounts/work/faeze/data_efficient_hate/datasets/main/1_clean/xplain_en/'
df_dict['xplain_en'] = train_df
# Save the adjusted splits to separate files
train_df.to_csv(path+f'train_{train_df.shape[0]}.csv', index=False)
new_dev_df.to_csv(path + f'dev_{new_dev_df.shape[0]}.csv', index=False)
new_test_df.to_csv(path + f'test_{new_test_df.shape[0]}.csv', index=False)
extra_dev_df.to_csv(path + f'extra_dev_{extra_dev_df.shape[0]}.csv', index=False)


In [102]:
train_df.shape, new_dev_df.shape, extra_dev_df.shape, new_test_df.shape

((10999, 2), (500, 2), (250, 2), (2000, 2))

(7701, 2)

In [7]:
# set aside 2k from each dataset for testing and 500 for dev
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small
# and for HASOC 20 and 21 in Hindi, where test splits are given

TEST_SIZE = 2000
DEV_SIZE = 500

for dataset in df_dict:
    if "ous19_fr" in dataset:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = 2000, random_state=123)
        devset, testset = train_test_split(devtest, test_size = 1500, random_state=123)
        devset.to_csv(PATH + f"/1_clean/{dataset}/dev_500.csv", index=False)
        testset.to_csv(PATH + f"/1_clean/{dataset}/test_1500.csv", index=False)
    elif "ous19_ar" in dataset:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = 1300, random_state=123)
        devset, testset = train_test_split(devtest, test_size = 1000, random_state=123)
        devset.to_csv(PATH + f"/1_clean/{dataset}/dev_300.csv", index=False)
        testset.to_csv(PATH + f"/1_clean/{dataset}/test_1000.csv", index=False)
    elif "has19_hi" in dataset or "has20_hi" in dataset: # use provided test sets
        df_dict[dataset][df_dict[dataset]["split"]=="test"].to_csv(PATH + f"/1_clean/{dataset}/test_{len(df_dict[dataset][df_dict[dataset]['split']=='test'])}.csv", index=False)
        df_dict[dataset], devset = train_test_split(df_dict[dataset][df_dict[dataset]["split"]=="train"], test_size = 500, random_state=123)
        devset.to_csv(PATH + f"/1_clean/{dataset}/dev_500.csv", index=False)
    else:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = TEST_SIZE+DEV_SIZE, random_state=123)
        devset, testset = train_test_split(devtest, test_size = TEST_SIZE, random_state=123)
        devset.to_csv(PATH + f"/1_clean/{dataset}/dev_{DEV_SIZE}.csv", index=False)
        testset.to_csv(PATH + f"/1_clean/{dataset}/test_{TEST_SIZE}.csv", index=False)
        
# export all data that is not test or dev, so we can use it for full sample training
for dataset in df_dict:
    df_dict[dataset].to_csv(PATH + f"/1_clean/{dataset}/train_{len(df_dict[dataset])}.csv", index=False)

In [8]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset.upper())
    for n in N_RANGE:
        
        # save all splits for English test sets
        if n<len(df_dict[dataset]) and ("_en" in dataset):
            print(f"  saving n = {n} training set")
            for random_state in range(1, SEEDS+1):
                df_dict[dataset].sample(n, random_state = random_state).to_csv(PATH + f"/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)
        
        # save splits up to 2k for other datasets
        elif n<len(df_dict[dataset]) and n<=2000: 
            print(f"  saving n = {n} training set")
            for random_state in range(1, SEEDS+1):  
                df_dict[dataset].sample(n, random_state = random_state).to_csv(PATH + f"/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)
    
    print()

IMPLICIT_EN
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 3000 training set
  saving n = 4000 training set
  saving n = 5000 training set
  saving n = 10000 training set



In [9]:
df_dict.keys()

dict_keys(['implicit_en'])