## Data Pre-processing steps

In [1]:
# import the necessary libraries
import json
import pandas as pd
import emoji
import re
from sklearn.model_selection import train_test_split

### preprocessing of the Chinese dataset (SWSR)

In [2]:
# Load the dataset
with open('/Users/zhangshanshan/Desktop/thesis/dataset/SRSW/SexComment.csv', 'r') as zh_dataset_file:
    zh_dataset = pd.read_csv(zh_dataset_file)

In [3]:
zh_dataset

Unnamed: 0,index,weibo_id,comment_text,x,location,like,date,label,category,target
0,0,J1zF7smOR,她们不结婚不要孩子，却觉得我们这种结婚有孩子的是婚驴,女,广东,14,12/05/2020 15:04,1,SCB,G
1,1,GvXf0ymdJ,妈的什么时候有人来性侵我一下啊 不要封口费的那种,男,其他,36,23/08/2018 13:10,1,SO,I
2,2,J2XeR8rPn,我觉得这就是对一类人所有的特点进行形容吧，“爹味”也是贬义词，为什么不觉得有性别歧视意味呢？,女,浙江,0,21/05/2020 16:12,1,MA,G
3,3,IxpfTCMAo,姐姐你不要急，关注久的都知道她骂的一切归根结底都是渴婚的，她并不骂学艺术的，她骂的是不肯像培...,女,江苏,1,31/03/2020 13:57,1,MA,I
4,4,J4B0Mps5Q,"给他们争取权利的不是女权是腐女，你该不会说腐女都女权吧？连女同性恋都不是全部女权，腐女？,男...",女,其他,0,##########,1,MA,G
...,...,...,...,...,...,...,...,...,...,...
8964,8964,GxUGteFvS,之前一直是五个男生那种，你们批评我吧,男,广东,0,05/09/2018 12:04,0,,
8965,8965,Fq0uZhLsx,中国的小孩也可以随母亲姓啊 我一个朋友就是随母亲姓的,男,其他,0,12/10/2017 11:50,0,,
8966,8966,H5OE5y1ij,所有为韦涛卖命而去攻击别人的炮灰都快点死.......咒语灵灵灵。。。。。。,男,广东,2,05/12/2018 17:50,0,,
8967,8967,J2cWRaHx1,本质是阶级矛盾……比如你是个资本家，别人抱你大腿的时候就不会管你是男还是女,男,上海,1,16/05/2020 17:56,0,,


In [4]:
# define a function to replace emoji with textual description
def replace_emoji_with_text(text):
    return emoji.demojize(text)

In [5]:
# apply the function to the weibo in the dataset
zh_dataset['comment_text'] = zh_dataset['comment_text'].apply(replace_emoji_with_text)

In [6]:
# examples
zh_dataset['comment_text'].head(10)

0                           她们不结婚不要孩子，却觉得我们这种结婚有孩子的是婚驴
1                            妈的什么时候有人来性侵我一下啊  不要封口费的那种
2       我觉得这就是对一类人所有的特点进行形容吧，“爹味”也是贬义词，为什么不觉得有性别歧视意味呢？
3    姐姐你不要急，关注久的都知道她骂的一切归根结底都是渴婚的，她并不骂学艺术的，她骂的是不肯像培...
4    给他们争取权利的不是女权是腐女，你该不会说腐女都女权吧？连女同性恋都不是全部女权，腐女？,男...
5      此类人虽说我平时少见，但今天这位一谈feminist就女拳的同学还是令我想隔着屏幕对他重拳出击
6    人夫妻相处之道我们也不知道，可能就抓拍了这一小段有什么原因也不一定，看着是不好看，不过上升到...
7    你拉鸡八倒吧，法律还说人人平等呢   ， 现实 真的 人人平等了吗？ 真是 搞笑的很。  现...
8    说了这么多你不就是在用metoo一个特例来给你骡子洗白么 还顺便把metoo骂得这么狗屁不是...
9    WA话说的没错啊…当年颁奖季:teacup_without_handle:就是站在metoo这边的
Name: comment_text, dtype: object

In [7]:
# define a function to remove all the links in the posts
def remove_urls (text):
    text = re.sub(r'(https|http|www)?:\/\/\S+|www\.\S+', 'link', text, flags=re.MULTILINE)
    return(text)

In [8]:
# apply the function to the dataset
zh_dataset['comment_text'] = zh_dataset['comment_text'].apply(remove_urls)

In [9]:
# split the training and test subset
train_zh, test_zh = train_test_split(zh_dataset, test_size=0.2, random_state=42)

In [10]:
# save them in csv format
train_zh.to_csv('train_zh_dataset.csv', index=False)
test_zh.to_csv('test_zh_dataset.csv', index=False)

### Pre-processing steps of the English dataset (EXIST 2024)

In [11]:
# Load the JSON data of the orginal tranining set
with open('/Users/zhangshanshan/Desktop/thesis/dataset/EXIST 2024 Lab/EXIST 2024 Tweets Dataset/training/EXIST2024_training.json', 'r') as train_file:
    EXIST_training_data = json.load(train_file)

In [12]:
EXIST_train_dataset = pd.DataFrame(EXIST_training_data)

In [14]:
# Load the JSON data of the orginal validation set
with open('/Users/zhangshanshan/Desktop/thesis/dataset/EXIST 2024 Lab/EXIST 2024 Tweets Dataset/dev/EXIST2024_dev.json', 'r') as dev_file:
    EXIST_dev_data = json.load(dev_file)

In [15]:
EXIST_dev_dataset = pd.DataFrame(EXIST_dev_data)

In [16]:
EXIST_train_dataset.head()

Unnamed: 0,100001,100002,100003,100004,100005,100006,100007,100008,100009,100010,...,203251,203252,203253,203254,203255,203256,203257,203258,203259,203260
id_EXIST,100001,100002,100003,100004,100005,100006,100007,100008,100009,100010,...,203251,203252,203253,203254,203255,203256,203257,203258,203259,203260
lang,es,es,es,es,es,es,es,es,es,es,...,en,en,en,en,en,en,en,en,en,en
tweet,"@TheChiflis Ignora al otro, es un capullo.El p...",@ultimonomada_ Si comicsgate se parece en algo...,"@Steven2897 Lee sobre Gamergate, y como eso ha...",@Lunariita7 Un retraso social bastante lamenta...,@novadragon21 @icep4ck @TvDannyZ Entonces como...,@yonkykong Aaah sí. Andrew Dobson. El que se d...,@glutamatom @JoaquinAdolfoC Estaba del lado de...,@BestKabest Esta gringa sigue llorando por el ...,.¿Conoces la #DECORACION #estilo #GAMER para #...,CES 2022 ASUS ROG Rise of Gamers Evento de lan...,...,"""you look like a whore"" I'm literally wearing ...",“You look like a whore” if you think I’m cute ...,Who fucking lied to you? You look like a whore...,@ShefVaidya Ma'am if I say that you look like ...,I forgot I have a m*d that changes the drachen...,idk why y’all bitches think having half your a...,This has been a part of an experiment with @Wo...,"""Take me already"" ""Not yet. You gotta be ready...",@clintneedcoffee why do you look like a whore?...,ik when mandy says “you look like a whore” i l...
number_annotators,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
annotators,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[Annotator_7, Annotator_8, Annotator_9, Annota...","[Annotator_7, Annotator_8, Annotator_9, Annota...","[Annotator_13, Annotator_14, Annotator_15, Ann...","[Annotator_19, Annotator_20, Annotator_21, Ann...","[Annotator_25, Annotator_26, Annotator_27, Ann...","[Annotator_25, Annotator_26, Annotator_27, Ann...","[Annotator_25, Annotator_26, Annotator_27, Ann...","[Annotator_31, Annotator_32, Annotator_33, Ann...","[Annotator_37, Annotator_38, Annotator_39, Ann...",...,"[Annotator_473, Annotator_474, Annotator_475, ...","[Annotator_617, Annotator_618, Annotator_619, ...","[Annotator_617, Annotator_618, Annotator_619, ...","[Annotator_668, Annotator_669, Annotator_670, ...","[Annotator_674, Annotator_675, Annotator_676, ...","[Annotator_478, Annotator_479, Annotator_480, ...","[Annotator_668, Annotator_669, Annotator_670, ...","[Annotator_467, Annotator_468, Annotator_469, ...","[Annotator_674, Annotator_675, Annotator_676, ...","[Annotator_473, Annotator_474, Annotator_475, ..."


In [17]:
EXIST_train_dataset_transposed = EXIST_train_dataset.T

In [18]:
EXIST_train_dataset_transposed

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task1,labels_task2,labels_task3,split
100001,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, High sc...","[Italy, Mexico, United States, Spain, Spain, C...","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES
100002,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES
100003,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES
100004,100004,es,@Lunariita7 Un retraso social bastante lamenta...,6,"[Annotator_13, Annotator_14, Annotator_15, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Chile, Spain, Spain, Portugal, Spain]","[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",TRAIN_ES
100005,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Afghanistan, United States, Italy, Po...","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203256,203256,en,idk why y’all bitches think having half your a...,6,"[Annotator_478, Annotator_479, Annotator_480, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[White or Caucasian, Black or African American...","[High school degree or equivalent, Bachelor’s ...","[Hungary, South Africa, Chile, Portugal, Unite...","[YES, YES, YES, YES, YES, YES]","[JUDGEMENTAL, DIRECT, DIRECT, DIRECT, JUDGEMEN...","[[OBJECTIFICATION], [STEREOTYPING-DOMINANCE, S...",TRAIN_EN
203257,203257,en,This has been a part of an experiment with @Wo...,6,"[Annotator_668, Annotator_669, Annotator_670, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[Hispano or Latino, other, White or Caucasian,...","[High school degree or equivalent, Master’s de...","[Mexico, Algeria, Portugal, Spain, United King...","[YES, YES, YES, YES, YES, YES]","[JUDGEMENTAL, REPORTED, JUDGEMENTAL, DIRECT, J...","[[OBJECTIFICATION], [OBJECTIFICATION], [OBJECT...",TRAIN_EN
203258,203258,en,"""Take me already"" ""Not yet. You gotta be ready...",6,"[Annotator_467, Annotator_468, Annotator_469, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[White or Caucasian, White or Caucasian, White...","[High school degree or equivalent, Bachelor’s ...","[Poland, Poland, Portugal, Canada, United King...","[NO, YES, NO, YES, YES, YES]","[-, DIRECT, -, DIRECT, DIRECT, JUDGEMENTAL]","[[-], [OBJECTIFICATION], [-], [SEXUAL-VIOLENCE...",TRAIN_EN
203259,203259,en,@clintneedcoffee why do you look like a whore?...,6,"[Annotator_674, Annotator_675, Annotator_676, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[Black or African American, Black or African A...","[High school degree or equivalent, Bachelor’s ...","[South Africa, South Africa, Portugal, Portuga...","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, JUDGEMENTAL, ...","[[OBJECTIFICATION, SEXUAL-VIOLENCE, MISOGYNY-N...",TRAIN_EN


In [19]:
EXIST_dev_dataset_transposed = EXIST_dev_dataset.T

In [20]:
EXIST_dev_dataset_transposed

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task1,labels_task2,labels_task3,split
300001,300001,es,@Fichinescu La comunidad gamer es un antro de ...,6,"[Annotator_726, Annotator_727, Annotator_357, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Chile, Serbia, Portugal, Mexico, Spain]","[NO, YES, YES, NO, YES, NO]","[-, JUDGEMENTAL, JUDGEMENTAL, -, REPORTED, -]","[[-], [MISOGYNY-NON-SEXUAL-VIOLENCE], [MISOGYN...",DEV_ES
300002,300002,es,@anacaotica88 @MordorLivin No me acuerdo de lo...,6,"[Annotator_731, Annotator_732, Annotator_315, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Spain, Chile, United Kingdom, Chile, Chile, S...","[YES, YES, NO, YES, YES, YES]","[JUDGEMENTAL, REPORTED, -, JUDGEMENTAL, JUDGEM...","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_ES
300003,300003,es,@cosmicJunkBot lo digo cada pocos dias y lo re...,6,"[Annotator_735, Annotator_736, Annotator_345, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, White or Caucasian, White...","[Bachelor’s degree, Master’s degree, Master’s ...","[Italy, Spain, Germany, Portugal, Spain, Spain]","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",DEV_ES
300004,300004,es,Also mientras les decia eso la señalaba y deci...,6,"[Annotator_259, Annotator_739, Annotator_291, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, High sc...","[Mexico, Mexico, Portugal, Mexico, Mexico, Spain]","[NO, YES, YES, YES, YES, YES]","[-, REPORTED, REPORTED, REPORTED, JUDGEMENTAL,...","[[-], [SEXUAL-VIOLENCE], [SEXUAL-VIOLENCE], [S...",DEV_ES
300005,300005,es,"And all people killed, attacked, harassed by ...",6,"[Annotator_731, Annotator_732, Annotator_315, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Spain, Chile, United Kingdom, Chile, Chile, S...","[NO, YES, NO, NO, NO, NO]","[-, DIRECT, -, -, -, -]","[[-], [STEREOTYPING-DOMINANCE], [-], [-], [-],...",DEV_ES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400485,400485,en,@YesReallyAngel “Don’t wear a black bra with a...,6,"[Annotator_801, Annotator_182, Annotator_802, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, other, ...","[Greece, Mexico, Canada, Spain, Poland, Israel]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, JUDGEMENTAL, DIRECT, DIRECT, ...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION, SEX...",DEV_EN
400486,400486,en,""" get changed , you look like a prostitute . ""...",6,"[Annotator_801, Annotator_182, Annotator_802, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, other, ...","[Greece, Mexico, Canada, Spain, Poland, Israel]","[YES, YES, YES, YES, NO, YES]","[DIRECT, REPORTED, REPORTED, REPORTED, -, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_EN
400487,400487,en,made this top and my mom gave me the “you look...,6,"[Annotator_795, Annotator_796, Annotator_797, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Portugal, Mexico, United Kingdom, Zimbabwe, P...","[YES, YES, YES, YES, YES, NO]","[JUDGEMENTAL, REPORTED, REPORTED, REPORTED, RE...","[[OBJECTIFICATION], [OBJECTIFICATION], [OBJECT...",DEV_EN
400488,400488,en,@DawnAnd91320913 I haven't seen anything that ...,6,"[Annotator_776, Annotator_777, Annotator_195, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Multiracial, White or Caucasian, White or Cau...","[High school degree or equivalent, High school...","[South Africa, Finland, Portugal, Poland, Ital...","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_EN


In [21]:
# load the gold label for the training set
with open('/Users/zhangshanshan/Desktop/thesis/dataset/EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task1_gold_hard.json', 'r') as gold_training_file:
    gold_training_data = json.load(gold_training_file)

In [24]:
gold_training_dataset = pd.DataFrame(gold_training_data)
gold_training_dataset['value'] = gold_training_dataset['value'].replace({'YES': 1, 'NO': 0})

In [25]:
# the value here represents the majority votes amnong the six annotator
gold_training_dataset.head(10)

Unnamed: 0,test_case,id,value
0,EXIST2024,100001,1
1,EXIST2024,100002,0
2,EXIST2024,100003,0
3,EXIST2024,100005,1
4,EXIST2024,100006,0
5,EXIST2024,100008,1
6,EXIST2024,100009,0
7,EXIST2024,100010,0
8,EXIST2024,100011,0
9,EXIST2024,100012,0


In [26]:
# use left join to match the gold value to the original training set
training_merged = pd.merge(EXIST_train_dataset_transposed, gold_training_dataset, how='left', left_on='id_EXIST', right_on='id')

In [27]:
# NaN appear in the value represents there is no majority votes, which will be removed from the dataset
training_merged

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task1,labels_task2,labels_task3,split,test_case,id,value
0,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, High sc...","[Italy, Mexico, United States, Spain, Spain, C...","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES,EXIST2024,100001,1.0
1,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES,EXIST2024,100002,0.0
2,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES,EXIST2024,100003,0.0
3,100004,es,@Lunariita7 Un retraso social bastante lamenta...,6,"[Annotator_13, Annotator_14, Annotator_15, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Chile, Spain, Spain, Portugal, Spain]","[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",TRAIN_ES,,,
4,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Afghanistan, United States, Italy, Po...","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES,EXIST2024,100005,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6915,203256,en,idk why y’all bitches think having half your a...,6,"[Annotator_478, Annotator_479, Annotator_480, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[White or Caucasian, Black or African American...","[High school degree or equivalent, Bachelor’s ...","[Hungary, South Africa, Chile, Portugal, Unite...","[YES, YES, YES, YES, YES, YES]","[JUDGEMENTAL, DIRECT, DIRECT, DIRECT, JUDGEMEN...","[[OBJECTIFICATION], [STEREOTYPING-DOMINANCE, S...",TRAIN_EN,EXIST2024,203256,1.0
6916,203257,en,This has been a part of an experiment with @Wo...,6,"[Annotator_668, Annotator_669, Annotator_670, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[Hispano or Latino, other, White or Caucasian,...","[High school degree or equivalent, Master’s de...","[Mexico, Algeria, Portugal, Spain, United King...","[YES, YES, YES, YES, YES, YES]","[JUDGEMENTAL, REPORTED, JUDGEMENTAL, DIRECT, J...","[[OBJECTIFICATION], [OBJECTIFICATION], [OBJECT...",TRAIN_EN,EXIST2024,203257,1.0
6917,203258,en,"""Take me already"" ""Not yet. You gotta be ready...",6,"[Annotator_467, Annotator_468, Annotator_469, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[White or Caucasian, White or Caucasian, White...","[High school degree or equivalent, Bachelor’s ...","[Poland, Poland, Portugal, Canada, United King...","[NO, YES, NO, YES, YES, YES]","[-, DIRECT, -, DIRECT, DIRECT, JUDGEMENTAL]","[[-], [OBJECTIFICATION], [-], [SEXUAL-VIOLENCE...",TRAIN_EN,EXIST2024,203258,1.0
6918,203259,en,@clintneedcoffee why do you look like a whore?...,6,"[Annotator_674, Annotator_675, Annotator_676, ...","[F, F, M, M, M, F]","[18-22, 23-45, 18-22, 23-45, 46+, 46+]","[Black or African American, Black or African A...","[High school degree or equivalent, Bachelor’s ...","[South Africa, South Africa, Portugal, Portuga...","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, JUDGEMENTAL, ...","[[OBJECTIFICATION, SEXUAL-VIOLENCE, MISOGYNY-N...",TRAIN_EN,EXIST2024,203259,1.0


In [28]:
# load the gold label for the training set
with open('/Users/zhangshanshan/Desktop/thesis/dataset/EXIST 2024 Lab/evaluation/golds/EXIST2024_dev_task1_gold_hard.json', 'r') as gold_dev_file:
    gold_dev_data = json.load(gold_dev_file)

In [31]:
gold_dev_dataset = pd.DataFrame(gold_dev_data)
gold_dev_dataset['value'] = gold_dev_dataset['value'].replace({'YES': 1, 'NO': 0})

In [32]:
gold_dev_dataset.head(10)

Unnamed: 0,test_case,id,value
0,EXIST2024,300002,1
1,EXIST2024,300003,0
2,EXIST2024,300004,1
3,EXIST2024,300005,0
4,EXIST2024,300006,0
5,EXIST2024,300007,1
6,EXIST2024,300008,1
7,EXIST2024,300009,0
8,EXIST2024,300010,1
9,EXIST2024,300013,1


In [33]:
dev_merged = pd.merge(EXIST_dev_dataset_transposed, gold_dev_dataset, how='left', left_on='id_EXIST', right_on='id')

In [34]:
dev_merged

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task1,labels_task2,labels_task3,split,test_case,id,value
0,300001,es,@Fichinescu La comunidad gamer es un antro de ...,6,"[Annotator_726, Annotator_727, Annotator_357, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Chile, Serbia, Portugal, Mexico, Spain]","[NO, YES, YES, NO, YES, NO]","[-, JUDGEMENTAL, JUDGEMENTAL, -, REPORTED, -]","[[-], [MISOGYNY-NON-SEXUAL-VIOLENCE], [MISOGYN...",DEV_ES,,,
1,300002,es,@anacaotica88 @MordorLivin No me acuerdo de lo...,6,"[Annotator_731, Annotator_732, Annotator_315, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Spain, Chile, United Kingdom, Chile, Chile, S...","[YES, YES, NO, YES, YES, YES]","[JUDGEMENTAL, REPORTED, -, JUDGEMENTAL, JUDGEM...","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_ES,EXIST2024,300002,1.0
2,300003,es,@cosmicJunkBot lo digo cada pocos dias y lo re...,6,"[Annotator_735, Annotator_736, Annotator_345, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, White or Caucasian, White...","[Bachelor’s degree, Master’s degree, Master’s ...","[Italy, Spain, Germany, Portugal, Spain, Spain]","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",DEV_ES,EXIST2024,300003,0.0
3,300004,es,Also mientras les decia eso la señalaba y deci...,6,"[Annotator_259, Annotator_739, Annotator_291, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, High sc...","[Mexico, Mexico, Portugal, Mexico, Mexico, Spain]","[NO, YES, YES, YES, YES, YES]","[-, REPORTED, REPORTED, REPORTED, JUDGEMENTAL,...","[[-], [SEXUAL-VIOLENCE], [SEXUAL-VIOLENCE], [S...",DEV_ES,EXIST2024,300004,1.0
4,300005,es,"And all people killed, attacked, harassed by ...",6,"[Annotator_731, Annotator_732, Annotator_315, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Spain, Chile, United Kingdom, Chile, Chile, S...","[NO, YES, NO, NO, NO, NO]","[-, DIRECT, -, -, -, -]","[[-], [STEREOTYPING-DOMINANCE], [-], [-], [-],...",DEV_ES,EXIST2024,300005,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,400485,en,@YesReallyAngel “Don’t wear a black bra with a...,6,"[Annotator_801, Annotator_182, Annotator_802, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, other, ...","[Greece, Mexico, Canada, Spain, Poland, Israel]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, JUDGEMENTAL, DIRECT, DIRECT, ...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION, SEX...",DEV_EN,EXIST2024,400485,1.0
1034,400486,en,""" get changed , you look like a prostitute . ""...",6,"[Annotator_801, Annotator_182, Annotator_802, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, other, ...","[Greece, Mexico, Canada, Spain, Poland, Israel]","[YES, YES, YES, YES, NO, YES]","[DIRECT, REPORTED, REPORTED, REPORTED, -, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_EN,EXIST2024,400486,1.0
1035,400487,en,made this top and my mom gave me the “you look...,6,"[Annotator_795, Annotator_796, Annotator_797, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Portugal, Mexico, United Kingdom, Zimbabwe, P...","[YES, YES, YES, YES, YES, NO]","[JUDGEMENTAL, REPORTED, REPORTED, REPORTED, RE...","[[OBJECTIFICATION], [OBJECTIFICATION], [OBJECT...",DEV_EN,EXIST2024,400487,1.0
1036,400488,en,@DawnAnd91320913 I haven't seen anything that ...,6,"[Annotator_776, Annotator_777, Annotator_195, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Multiracial, White or Caucasian, White or Cau...","[High school degree or equivalent, High school...","[South Africa, Finland, Portugal, Poland, Ital...","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_EN,EXIST2024,400488,1.0


In [35]:
new_dataset = pd.concat([training_merged, dev_merged], ignore_index=True)

In [36]:
new_dataset.shape

(7958, 17)

In [37]:
new_dataset.head(20)

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task1,labels_task2,labels_task3,split,test_case,id,value
0,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, High sc...","[Italy, Mexico, United States, Spain, Spain, C...","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES,EXIST2024,100001.0,1.0
1,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES,EXIST2024,100002.0,0.0
2,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES,EXIST2024,100003.0,0.0
3,100004,es,@Lunariita7 Un retraso social bastante lamenta...,6,"[Annotator_13, Annotator_14, Annotator_15, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Chile, Spain, Spain, Portugal, Spain]","[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",TRAIN_ES,,,
4,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Afghanistan, United States, Italy, Po...","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES,EXIST2024,100005.0,1.0
5,100006,es,@yonkykong Aaah sí. Andrew Dobson. El que se d...,6,"[Annotator_25, Annotator_26, Annotator_27, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Multiracial, Hispano or Latino, White or Cauc...","[Bachelor’s degree, Bachelor’s degree, Master’...","[United Kingdom, Mexico, Poland, Spain, Portug...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES,EXIST2024,100006.0,0.0
6,100007,es,@glutamatom @JoaquinAdolfoC Estaba del lado de...,6,"[Annotator_25, Annotator_26, Annotator_27, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Multiracial, Hispano or Latino, White or Cauc...","[Bachelor’s degree, Bachelor’s degree, Master’...","[United Kingdom, Mexico, Poland, Spain, Portug...","[NO, YES, YES, NO, NO, YES]","[-, UNKNOWN, DIRECT, -, -, DIRECT]","[[-], [UNKNOWN], [MISOGYNY-NON-SEXUAL-VIOLENCE...",TRAIN_ES,,,
7,100008,es,@BestKabest Esta gringa sigue llorando por el ...,6,"[Annotator_25, Annotator_26, Annotator_27, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Multiracial, Hispano or Latino, White or Cauc...","[Bachelor’s degree, Bachelor’s degree, Master’...","[United Kingdom, Mexico, Poland, Spain, Portug...","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, JUDGEMENTAL, DIRECT, ...","[[IDEOLOGICAL-INEQUALITY], [STEREOTYPING-DOMIN...",TRAIN_ES,EXIST2024,100008.0,1.0
8,100009,es,.¿Conoces la #DECORACION #estilo #GAMER para #...,6,"[Annotator_31, Annotator_32, Annotator_33, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Mexico, United Kingdom, Spain, Chile,...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES,EXIST2024,100009.0,0.0
9,100010,es,CES 2022 ASUS ROG Rise of Gamers Evento de lan...,6,"[Annotator_37, Annotator_38, Annotator_39, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Hispano...","[Bachelor’s degree, Bachelor’s degree, Master’...","[United States, Mexico, Venezuela, Spain, Spai...","[NO, NO, YES, NO, NO, NO]","[-, -, DIRECT, -, -, -]","[[-], [-], [STEREOTYPING-DOMINANCE], [-], [-],...",TRAIN_ES,EXIST2024,100010.0,0.0


In [38]:
# remove the rows whose value is NaN
new_dataset_cleaned = new_dataset.dropna(subset=['value'])

In [39]:
new_dataset_cleaned.shape

(6998, 17)

In [41]:
# lowercase the tweets
new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].str.lower()


In [42]:
# use the previously defined function to remove the urls in the tweets 
def remove_urls_exist (text):
    text = re.sub(r'(https|http|www)?:\/\/\S+|www\.\S+', '#URL', text, flags=re.MULTILINE)
    return text.strip() 

In [43]:
# apply the function to the dataset
new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].apply(remove_urls_exist)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].apply(remove_urls_exist)


In [44]:
# use previously defined function to replace emoji with textual representations
new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].apply(replace_emoji_with_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].apply(replace_emoji_with_text)


In [45]:
# defina a function to remove the usernames starting with @
def replace_usernames(text):
    return re.sub(r'@\w+', '@username', text)

In [46]:
# apply the function to all tweets 
new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].apply(replace_usernames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataset_cleaned['tweet'] = new_dataset_cleaned['tweet'].apply(replace_usernames)


In [47]:
new_dataset_cleaned

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task1,labels_task2,labels_task3,split,test_case,id,value
0,100001,es,"@username ignora al otro, es un capullo.el pro...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, High sc...","[Italy, Mexico, United States, Spain, Spain, C...","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES,EXIST2024,100001,1.0
1,100002,es,@username si comicsgate se parece en algo a ga...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES,EXIST2024,100002,0.0
2,100003,es,"@username lee sobre gamergate, y como eso ha c...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, Hispano or Latino,...","[High school degree or equivalent, Bachelor’s ...","[United Kingdom, Mexico, United States, Portug...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES,EXIST2024,100003,0.0
4,100005,es,@username @username @username entonces como as...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Afghanistan, United States, Italy, Po...","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES,EXIST2024,100005,1.0
5,100006,es,@username aaah sí. andrew dobson. el que se de...,6,"[Annotator_25, Annotator_26, Annotator_27, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Multiracial, Hispano or Latino, White or Cauc...","[Bachelor’s degree, Bachelor’s degree, Master’...","[United Kingdom, Mexico, Poland, Spain, Portug...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES,EXIST2024,100006,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7953,400485,en,@username “don’t wear a black bra with a white...,6,"[Annotator_801, Annotator_182, Annotator_802, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, other, ...","[Greece, Mexico, Canada, Spain, Poland, Israel]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, JUDGEMENTAL, DIRECT, DIRECT, ...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION, SEX...",DEV_EN,EXIST2024,400485,1.0
7954,400486,en,""" get changed , you look like a prostitute . ""...",6,"[Annotator_801, Annotator_182, Annotator_802, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[Bachelor’s degree, Bachelor’s degree, other, ...","[Greece, Mexico, Canada, Spain, Poland, Israel]","[YES, YES, YES, YES, NO, YES]","[DIRECT, REPORTED, REPORTED, REPORTED, -, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_EN,EXIST2024,400486,1.0
7955,400487,en,made this top and my mom gave me the “you look...,6,"[Annotator_795, Annotator_796, Annotator_797, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[White or Caucasian, Hispano or Latino, White ...","[High school degree or equivalent, Bachelor’s ...","[Portugal, Mexico, United Kingdom, Zimbabwe, P...","[YES, YES, YES, YES, YES, NO]","[JUDGEMENTAL, REPORTED, REPORTED, REPORTED, RE...","[[OBJECTIFICATION], [OBJECTIFICATION], [OBJECT...",DEV_EN,EXIST2024,400487,1.0
7956,400488,en,@username i haven't seen anything that makes y...,6,"[Annotator_776, Annotator_777, Annotator_195, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 18-22, 23-45, 46+]","[Multiracial, White or Caucasian, White or Cau...","[High school degree or equivalent, High school...","[South Africa, Finland, Portugal, Poland, Ital...","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",DEV_EN,EXIST2024,400488,1.0


In [48]:
# split the dataset into two subsets according to their language
es_dataset = new_dataset_cleaned[new_dataset_cleaned['lang'] == 'es']
en_dataset = new_dataset_cleaned[new_dataset_cleaned['lang'] == 'en']

In [49]:
es_dataset.shape

(3684, 17)

In [50]:
en_dataset.shape

(3314, 17)

In [51]:
# split the training and test subset - spanish dataset
train_es, test_es = train_test_split(es_dataset, test_size=0.2, random_state=42)

In [52]:
# save them in csv format - spanish dataset
train_es.to_csv('train_es_dataset.csv', index=False)
test_es.to_csv('test_es_dataset.csv', index=False)

In [53]:
# split the training and test subset - english dataset
train_en, test_en = train_test_split(en_dataset, test_size=0.2, random_state=42)

In [54]:
# save them in csv format - english dataset
train_en.to_csv('train_en_dataset.csv', index=False)
test_en.to_csv('test_en_dataset.csv', index=False)