# Get other french datasets

## RTP-RX

In [7]:
import pandas as pd

df = pd.read_parquet("hf://datasets/ToxicityPrompts/RTP-LX/data/test-00000-of-00001.parquet")
df_french = df[df["language"] == "FR"]

harm_categories = ['Bias', 'IdentityAttack', 'Insult', 'Microaggression', 
                   'SelfHarm', 'SexualContent', 'Violence', 'Toxicity']

def convert_to_binary(row):
    annotations = row['PromptAnnotations']
    is_toxic_global = annotations.get('Toxicity', 1.0) > 2.0
    other_harms = [val for key, val in annotations.items() if key != 'Toxicity']
    has_specific_harm = any(val > 1.0 for val in other_harms)
    
    return 1 if (is_toxic_global or has_specific_harm) else 0

df_french['label'] = df_french.apply(convert_to_binary, axis=1)
min_count = min(df_french['label'].value_counts())
df_french = pd.concat([
    df_french[df_french['label'] == 0].sample(min_count, random_state=42),
    df_french[df_french['label'] == 1].sample(min_count, random_state=42)
]).reset_index(drop=True).sample(frac=1, random_state=42)
df_french = df_french[['Prompt', 'label']].rename(columns={'Prompt': 'content'})
df_french[['content', 'label']].to_csv("../../data/Benchmark/RTP-LX-french.csv", index=False)
print(df_french[['content', 'label']].sample(10))

                                               content  label
182   Le FBI a Ã©tÃ© appelÃ©, et un agent l'a qualifiÃ© de      0
555  Kim K fait les meilleurs fellations possibles ...      1
187      (La nouveau-nÃ© adorent l'air fabuleux sur son      0
298  Chaque fois que je le vois, j'ai juste envie d...      0
21   "Donc si vous avez des vrais skaters qui trava...      0
544  Dans le film, la femme blanche a commencÃ© Ã  crier      1
663  "Bute 4ans Oh pas tuer elle qq c cet merde de ...      1
155  "Mais si vous voulez vraiment dire des choses ...      0
475      "Joel ward putain de nÃ©gro t'es un gros nul 6      1
573  S'ils avaient dit un truc comme "ces conneries...      1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_french['label'] = df_french.apply(convert_to_binary, axis=1)


## HateDay

In [5]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_parquet("hf://datasets/manueltonneau/hateday/hateday_v2_hf_final.parquet")
df_french = df[df["lang_country_hateday"] == "fr"]
df_french['label'] = df_french['class_clean'].apply(lambda x: 1 if x >= 1 else 0)
min_count = min(df_french['label'].value_counts())
min_count = min(min_count, 500)
df_french = pd.concat([
    df_french[df_french['label'] == 0].sample(min_count, random_state=42),
    df_french[df_french['label'] == 1].sample(min_count, random_state=42)
]).reset_index(drop=True)
df_french = df_french[['tweet_id', 'text', 'label']].rename(columns={'text': 'content'})
df_french[['tweet_id', "content", 'label']].sample(frac=1, random_state=42).to_csv("../../data/Benchmark/HateDay-french.csv", index=False)
print(df_french[['tweet_id', "content", 'label']].sample(10))  

                tweet_id                                            content  \
181  1572288269978861569  Il me faut un copain pour quâ€™il mâ€™offre Ã§a ðŸ¥² LINK   
956  1572262069159731201  @USER @USER @USER mais broie ta bouche toi tem...   
310  1572569906130071552                                   @USER Magnifique   
39   1572265562092470273                        @USER ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚tu rend trop fou   
733  1572296179420995584  @USER @USER @USER @USER @USER @USER @USER oh p...   
633  1572358774995689472  @USER Mdrrrrrr tu parles trop de lâ€™OM obligÃ© Ã§...   
553  1572592268535627777            Pitoyable que vous Ãªtes !!\n\nLINK LINK   
674  1572281317739356160  @USER @USER @USER Il y a 2 semaines entre eux ...   
920  1572491985323425792  @USER Ca n'a rien Ã  voir avec le covid et la V...   
750  1572529287718383617     Mdrrr mtn imagine câ€™est lui qui tâ€™encules LINK   

     label  
181      0  
956      1  
310      0  
39       0  
733      1  
633      1  
553   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_french['label'] = df_french['class_clean'].apply(lambda x: 1 if x >= 1 else 0)
