In [1]:
pip install transformers datasets evaluate pysentimiento

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pysentimiento
  Downloading pysentimiento-0.7.3-py3-none-any.whl.metadata (7.7 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pysentimiento-0.7.3-py3-none-any.whl (39 kB)
Installing collected packages: evaluate, pysentimiento
Successfully installed evaluate-0.4.2 pysentimiento-0.7.3
Note: you may need to restart the kernel to use updated packages.


In [18]:
import re
from pysentimiento.preprocessing import preprocess_tweet


def simple_preprocess(text):
    """
    Mask Twitter user handles and web links in a single tweet.

    Handles (``@`` followed by word characters) are replaced with ``@USER``
    first; afterwards every ``http://`` or ``https://`` link is replaced
    with ``HTTPURL``.

    Parameters:
        text (str): The tweet text to clean.

    Returns:
        str: The tweet with handles and links replaced by placeholders.
    """
    handles_masked = re.sub(r"@\w+", "@USER", text)
    return re.sub(r"https?:\/\/[\w\.\/\?\=\d%_:/-]+", "HTTPURL", handles_masked)

def tweet_preprocessor_spanish(text):
    """
    Run pysentimiento's ``preprocess_tweet`` on one tweet in Spanish mode.

    The actual normalisation lives in pysentimiento; per the original
    author's notes it is expected to:
    1. replace user handles with a placeholder token
    2. shorten character repetitions to 3 (the ``shorten`` argument)
    3. convert emojis to a textual form
    4. normalise laughter expressions
    5. handle hashtags -- remove the #
    """
    options = {"lang": "es", "shorten": 3}
    return preprocess_tweet(text, **options)

def tweet_preprocessor_english(text):
    """
    Run pysentimiento's ``preprocess_tweet`` on one tweet in English mode.

    The actual normalisation lives in pysentimiento; per the original
    author's notes it is expected to:
    1. replace user handles with a placeholder token
    2. shorten character repetitions to 3 (the ``shorten`` argument)
    3. convert emojis to a textual form
    4. normalise laughter expressions
    5. handle hashtags -- remove the #
    """
    options = {"lang": "en", "shorten": 3}
    return preprocess_tweet(text, **options)

In [19]:
import numpy as np
import pandas as pd
Train = pd.read_csv('/kaggle/input/existv1/balanced_train.csv')
Val = pd.read_csv('/kaggle/input/existv1/balanced_val.csv')

In [20]:
Train = Train[['tweet', 'task2_hard_labels']]
Val = Val[['tweet', 'task2_hard_labels']]

In [21]:
# Keep only rows whose hard label is not 'UND'. The explicit .copy() detaches
# each filtered frame from its parent, so the column assignments in later cells
# write to an independent frame and do not raise SettingWithCopyWarning.
Train = Train[Train['task2_hard_labels'] != 'UND'].copy()
Val = Val[Val['task2_hard_labels'] != 'UND'].copy()

In [22]:
# Clean the tweet column of both splits: handle/URL masking first, then the
# Spanish and the English pysentimiento passes, in that order.
for frame in (Train, Val):
    for step in (simple_preprocess, tweet_preprocessor_spanish, tweet_preprocessor_english):
        frame['tweet'] = frame['tweet'].apply(step)

In [23]:
# Rename the target column to 'label'. Rebinding the result instead of using
# inplace=True keeps the cell free of hidden in-place mutation.
Train = Train.rename(columns={'task2_hard_labels': 'label'})
Val = Val.rename(columns={'task2_hard_labels': 'label'})

In [24]:
Train

Unnamed: 0,tweet,label
0,El primer tuit que escribí esta mañana fue en ...,REPORTED
1,@USER TiranicidioYa @USER AlertaNews24 Hey más...,JUDGEMENTAL
2,Can men stop threatening to spank women wtf. G...,REPORTED
3,/ annabeth está para romper el estereotipo de ...,JUDGEMENTAL
4,@USER proctor_jason Funny enough the free woma...,REPORTED
...,...,...
15444,Cómo quieren que mejoremos cómo sociedad si no...,-
15445,"emoji tijeras emoji ""Decidieron arbitrariament...",-
15446,Witch raises wind to break up enemys lumber po...,-
15447,@USER TheEAC37 @USER MyGrindelwald @USER Disne...,JUDGEMENTAL


In [25]:
def factorize_column(df, column_name):
    """
    Encode one column of a DataFrame as integer codes via ``pd.factorize``.

    The input frame is left untouched; the encoding is applied to a copy, so
    re-running the call on the original frame gives the same result.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to be factorized.
        column_name (str): The name of the column to factorize.

    Returns:
        pd.DataFrame: A copy of ``df`` with the specified column replaced by its codes.
        pd.Index: The distinct labels; position ``i`` is the label encoded as code ``i``.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    # Codes are assigned in order of first appearance of each label
    codes, uniques = pd.factorize(df[column_name])

    # Write the codes into a copy so the caller's frame is not mutated
    encoded = df.copy()
    encoded[column_name] = codes

    return encoded, uniques

In [26]:
Train, unique = factorize_column(Train, 'label')
unique

Index(['REPORTED', 'JUDGEMENTAL', 'DIRECT', '-'], dtype='object')

In [27]:
Train.head(5)

Unnamed: 0,tweet,label
0,El primer tuit que escribí esta mañana fue en ...,0
1,@USER TiranicidioYa @USER AlertaNews24 Hey más...,1
2,Can men stop threatening to spank women wtf. G...,0
3,/ annabeth está para romper el estereotipo de ...,1
4,@USER proctor_jason Funny enough the free woma...,0


In [28]:
# Two-way lookup between integer class ids and label names, following the
# order of `unique` (position i is the label encoded as i).
id2label = dict(enumerate(unique))
label2id = {name: code for code, name in id2label.items()}

In [30]:
def factorize_column_custom(df, column_name, label2id):
    """
    Encode one column of a DataFrame with an existing label -> id mapping.

    Unlike a fresh factorization, this reuses ``label2id`` so that another
    split gets exactly the same codes. Values missing from ``label2id`` are
    encoded as -1. The result column is always integer typed (a plain
    ``map(...).fillna(-1)`` would silently turn it into floats as soon as one
    label is unknown). The input frame is left untouched.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to be factorized.
        column_name (str): The name of the column to factorize.
        label2id (dict): Dictionary mapping labels to integer IDs.

    Returns:
        pd.DataFrame: A copy of ``df`` with the specified column encoded.
        pd.Index: The labels of ``label2id`` ordered by their ID.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    # Map on a copy; unknown categories become -1, then force an integer dtype
    encoded = df.copy()
    encoded[column_name] = (
        encoded[column_name].map(label2id).fillna(-1).astype("int64")
    )

    # Labels ordered by their ID value
    labels_index = pd.Index(sorted(label2id, key=label2id.get), dtype='object')

    return encoded, labels_index

In [31]:
Val, vunique = factorize_column_custom(Val, 'label', label2id)
vunique

Index(['REPORTED', 'JUDGEMENTAL', 'DIRECT', '-'], dtype='object')

In [32]:
Val.head(5)

Unnamed: 0,tweet,label
0,Acompáñenme por este viaje de olvido de tu coa...,1
1,El estigma es algo que nos atraviesa a todas l...,0
2,Most married women don't know the price of alc...,2
3,𝐇𝐚𝐩𝐩𝐲 𝐖𝐨𝐦𝐞𝐧'𝐬 𝐃𝐚𝐲 to all the lovely ladies em...,3
4,@USER @USER FP trabajando por un futuro de igu...,3


In [16]:
from datasets import Dataset, DatasetDict

# Wrap the pandas frames as Hugging Face datasets. preserve_index=False stops
# a non-default pandas index (left behind by row filtering) from being stored
# as an extra column in the dataset.
train_dataset = Dataset.from_pandas(Train, preserve_index=False)
val_dataset = Dataset.from_pandas(Val, preserve_index=False)

# Group both splits so they can be tokenized with a single map call
dataset_dict = DatasetDict({
    "train": train_dataset,
    "val": val_dataset
})


dataset_dict["train"][0]

{'tweet': 'El primer tuit que escribí esta mañana fue en apoyo a @USER vtrivella. Vi el video donde ella denuncia el acoso, la instigación al suicidio y la campaña de odio de Marco Michetti. No puedo entender que algo tan grave no sea tendencia. Usemos Twitter también para lo importante.',
 'label': 0}

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sdadas/xlm-roberta-large-twitter")

tokenizer_config.json:   0%|          | 0.00/469 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [18]:
def preprocess_function(examples):
    """
    Tokenize the ``tweet`` field of a batch with the module-level tokenizer.

    Truncation is enabled; padding is not applied here.
    """
    tweets = examples["tweet"]
    return tokenizer(tweets, truncation=True)

In [19]:
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/15449 [00:00<?, ? examples/s]

Map:   0%|          | 0/2727 [00:00<?, ? examples/s]

In [20]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [22]:
import numpy as np


def compute_metrics(eval_pred):
    """
    Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into (scores, reference labels); the predicted class
    is the arg-max of the scores along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [23]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "sdadas/xlm-roberta-large-twitter", 
    num_labels=4, 
    id2label=id2label, 
    label2id=label2id
)

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [24]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the checkpoint with the lowest eval_loss when training finishes.
training_args = TrainingArguments(
    output_dir="/kaggle/working/sexist_hard1",  # Directory for model outputs
    learning_rate=0.00000527,                  # Learning rate
    per_device_train_batch_size=4,             # Batch size for training
    per_device_eval_batch_size=4,              # Batch size for evaluation
    num_train_epochs=3,                        # Number of training epochs
    weight_decay=0.000636,                     # Weight decay for regularization
    evaluation_strategy="epoch",               # Evaluate at the end of each epoch
    save_strategy="epoch",                     # Save at the end of each epoch
    # NOTE(review): combined with load_best_model_at_end, the best checkpoint and
    # the most recent one may both be retained -- confirm against the transformers docs.
    save_total_limit=1,                        # Cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,               # Load the best model at the end of training
    metric_for_best_model='eval_loss',              # Decide the best model based on loss
    push_to_hub=False                          # Do not push to Hugging Face's Model Hub
)

In [25]:
# Assemble the Trainer from the model, the tokenized splits, the padding
# collator and the accuracy metric, then run fine-tuning.
# NOTE(review): the recorded run stopped to ask for a wandb API key; if that
# logging is not wanted, it presumably has to be disabled via the training
# arguments or environment -- confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'loss': 1.1131, 'grad_norm': 24.51630973815918, 'learning_rate': 5.042629217361291e-06, 'epoch': 0.13}
{'loss': 0.7812, 'grad_norm': 39.242801666259766, 'learning_rate': 4.815258434722582e-06, 'epoch': 0.26}
{'loss': 0.6996, 'grad_norm': 33.833946228027344, 'learning_rate': 4.587887652083873e-06, 'epoch': 0.39}
{'loss': 0.6294, 'grad_norm': 72.40947723388672, 'learning_rate': 4.360516869445164e-06, 'epoch': 0.52}
{'loss': 0.5758, 'grad_norm': 125.31611633300781, 'learning_rate': 4.133146086806455e-06, 'epoch': 0.65}
{'loss': 0.537, 'grad_norm': 3.348951578140259, 'learning_rate': 3.905775304167746e-06, 'epoch': 0.78}
{'loss': 0.4724, 'grad_norm': 3.873220920562744, 'learning_rate': 3.6784045215290364e-06, 'epoch': 0.91}
{'eval_loss': 0.45175138115882874, 'eval_accuracy': 0.8892555922258892, 'eval_runtime': 30.2413, 'eval_samples_per_second': 90.175, 'eval_steps_per_second': 22.552, 'epoch': 1.0}
{'loss': 0.3932, 'grad_norm': 108.73336791992188, 'learning_rate': 3.451033738890327e-06, 

TrainOutput(global_step=11589, training_loss=0.3386301543778655, metrics={'train_runtime': 2839.3869, 'train_samples_per_second': 16.323, 'train_steps_per_second': 4.082, 'train_loss': 0.3386301543778655, 'epoch': 3.0})

In [35]:
Train.loc[1650]

tweet    @USER Esos insectos ¿serán las mismísimas "zor...
label                                                    2
Name: 1650, dtype: object

In [10]:
from transformers import pipeline

# Load the fine-tuned model from disk for single-text inference.
# NOTE(review): checkpoint-11589 matches the final global_step of the recorded
# training run, i.e. the last checkpoint. With load_best_model_at_end=True the
# best model by eval_loss may be a different checkpoint -- confirm this is the
# intended one.
classifier = pipeline("sentiment-analysis", model="/kaggle/working/sexist_hard1/checkpoint-11589")

In [11]:
def infer(txt):
    """Run the module-level hard-label classifier on ``txt`` and return its raw output."""
    prediction = classifier(txt)
    return prediction

In [50]:
# Spot-check validation rows 300-349: print the predicted label next to the gold one.
for row in range(300, 350):
    tweet_text = Val['tweet'].iloc[row]
    expected = id2label[Val['label'].iloc[row]]
    top_prediction = infer(tweet_text)[0]
    print("id:{}, predicted={}, actual={}".format(row, top_prediction['label'], expected))

id:300, predicted=DIRECT, actual=DIRECT
id:301, predicted=REPORTED, actual=REPORTED
id:302, predicted=DIRECT, actual=DIRECT
id:303, predicted=DIRECT, actual=DIRECT
id:304, predicted=JUDGEMENTAL, actual=JUDGEMENTAL
id:305, predicted=DIRECT, actual=DIRECT
id:306, predicted=JUDGEMENTAL, actual=JUDGEMENTAL
id:307, predicted=DIRECT, actual=DIRECT
id:308, predicted=REPORTED, actual=REPORTED
id:309, predicted=REPORTED, actual=REPORTED
id:310, predicted=REPORTED, actual=REPORTED
id:311, predicted=JUDGEMENTAL, actual=JUDGEMENTAL
id:312, predicted=-, actual=-
id:313, predicted=-, actual=-
id:314, predicted=-, actual=-
id:315, predicted=DIRECT, actual=DIRECT
id:316, predicted=-, actual=-
id:317, predicted=-, actual=-
id:318, predicted=DIRECT, actual=DIRECT
id:319, predicted=DIRECT, actual=DIRECT
id:320, predicted=JUDGEMENTAL, actual=JUDGEMENTAL
id:321, predicted=DIRECT, actual=DIRECT
id:322, predicted=-, actual=-
id:323, predicted=-, actual=-
id:324, predicted=REPORTED, actual=REPORTED
id:325, pr

In [6]:
import json

# read JSON file
def read_json(url):
    """
    Load a JSON file whose top level is an object of records into a DataFrame.

    The top-level keys are discarded; each value becomes one row. The
    ``id_EXIST`` column is converted to strings.

    Parameters:
        url (str): Path of the JSON file to read.

    Returns:
        pd.DataFrame: One row per top-level value, with ``id_EXIST`` as ``str``.
    """
    # Be explicit about UTF-8 so non-ASCII tweet text does not depend on the
    # platform's default encoding.
    with open(url, encoding="utf-8") as f:
        records = json.load(f)

    data = pd.DataFrame(list(records.values()))
    data["id_EXIST"] = data["id_EXIST"].astype(str)
    return data

In [7]:
test = read_json('/kaggle/input/existv1/test/EXIST2023_test_clean.json')
test

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split
0,500001,es,@Eurogamer_es Todo gamergate desde el desarrol...,6,"[Annotator_810, Annotator_811, Annotator_812, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, White ...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Italy, United States, Portugal...",TEST_ES
1,500002,es,"@ArCaNgEl__23 @Benzenazi Hombre, no es compara...",6,"[Annotator_780, Annotator_816, Annotator_817, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Black o...","[High school degree or equivalent, Bachelor’s ...","[Chile, Mexico, United States, Mexico, Mexico,...",TEST_ES
2,500003,es,yo buscando las empresas metidas en el gamerga...,6,"[Annotator_821, Annotator_822, Annotator_823, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Asian, ...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Mexico, VietNam, United States, Mexic...",TEST_ES
3,500004,es,"@jordirico Primero fue internet, luego el game...",6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES
4,500005,es,@AlonsoQuijano12 Yo estuve metido en el gamerg...,6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES
...,...,...,...,...,...,...,...,...,...,...,...
2071,600974,en,@AllyMae99 This straight up sounds like “you l...,6,"[Annotator_942, Annotator_943, Annotator_351, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Master’s de...","[South Africa, Spain, Portugal, United States,...",TEST_EN
2072,600975,en,Nathaniel is trying to help me with a new fake...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN
2073,600976,en,walkin back from the gym &amp; an older lady s...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN
2074,600977,en,You look like a whore of Babylon bc that’s the...,6,"[Annotator_1009, Annotator_1010, Annotator_101...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, Multi...","[High school degree or equivalent, Bachelor’s ...","[Poland, Portugal, United Kingdom, Greece, Gre...",TEST_EN


In [8]:
# Apply the same three cleaning passes, in the same order, to the test tweets.
for step in (simple_preprocess, tweet_preprocessor_spanish, tweet_preprocessor_english):
    test['tweet'] = test['tweet'].apply(step)

In [12]:
test['hard'] = test['tweet'].apply(infer)

In [14]:
test.to_csv("finalhard.csv",index=False)

In [15]:
from transformers import pipeline

# Second pipeline over the same checkpoint that returns a score for every class
# instead of only the top one.
# NOTE(review): return_all_scores is reportedly deprecated in newer transformers
# releases in favour of top_k; the replacement may change the output nesting
# that later cells index with [0] -- confirm before switching.
classifiersoft = pipeline("sentiment-analysis", model="/kaggle/working/sexist_hard1/checkpoint-11589", return_all_scores=True)



In [35]:
def infersoft(txt):
    """
    Return the per-class scores for ``txt`` from the module-level soft classifier.

    The classifier output is passed through unchanged (no element is extracted
    here); in the recorded run it is a nested list of ``{'label', 'score'}`` dicts.
    """
    return classifiersoft(txt)

In [36]:
# Spot-check validation rows 300-349: print the full score list next to the gold label.
for row in range(300, 350):
    tweet_text = Val['tweet'].iloc[row]
    expected = id2label[Val['label'].iloc[row]]
    all_scores = infersoft(tweet_text)
    print("id:{}, predicted={}, actual={}".format(row, all_scores, expected))

id:300, predicted=[[{'label': 'REPORTED', 'score': 4.947224624629598e-06}, {'label': 'JUDGEMENTAL', 'score': 3.968785676988773e-06}, {'label': 'DIRECT', 'score': 0.9998898506164551}, {'label': '-', 'score': 0.00010116654448211193}]], actual=DIRECT
id:301, predicted=[[{'label': 'REPORTED', 'score': 0.9999741315841675}, {'label': 'JUDGEMENTAL', 'score': 2.726472303038463e-06}, {'label': 'DIRECT', 'score': 1.8457869373378344e-05}, {'label': '-', 'score': 4.6022892092878465e-06}]], actual=REPORTED
id:302, predicted=[[{'label': 'REPORTED', 'score': 9.892833077174146e-06}, {'label': 'JUDGEMENTAL', 'score': 3.2445561828353675e-06}, {'label': 'DIRECT', 'score': 0.9994915723800659}, {'label': '-', 'score': 0.000495360407512635}]], actual=DIRECT
id:303, predicted=[[{'label': 'REPORTED', 'score': 4.7783669288037345e-06}, {'label': 'JUDGEMENTAL', 'score': 6.806276815041201e-06}, {'label': 'DIRECT', 'score': 0.9999539852142334}, {'label': '-', 'score': 3.442712113610469e-05}]], actual=DIRECT
id:304

In [37]:
test['soft'] = test['tweet'].apply(infersoft)

In [39]:
test['soft'].iloc[0]

[[{'label': 'REPORTED', 'score': 8.335934580827598e-06},
  {'label': 'JUDGEMENTAL', 'score': 1.4897758774168324e-05},
  {'label': 'DIRECT', 'score': 3.2016891054809093e-05},
  {'label': '-', 'score': 0.9999446868896484}]]

In [41]:
test.iloc[0]

id_EXIST                                                              500001
lang                                                                      es
tweet                      @USER Todo gamergate desde el desarrollo hasta...
number_annotators                                                          6
annotators                 [Annotator_810, Annotator_811, Annotator_812, ...
gender_annotators                                         [F, F, F, M, M, M]
age_annotators                        [18-22, 23-45, 46+, 46+, 23-45, 18-22]
ethnicities_annotators     [Hispano or Latino, White or Caucasian, White ...
study_levels_annotators    [High school degree or equivalent, Master’s de...
countries_annotators       [Mexico, Spain, Italy, United States, Portugal...
split                                                                TEST_ES
hard                           [{'label': '-', 'score': 0.9999446868896484}]
soft                       [[{'label': 'REPORTED', 'score': 8.33593458082...

In [43]:
def convert_label(hard_values):
    """
    Turn one hard-prediction result into its submission label.

    Reads the ``'label'`` of the first entry of ``hard_values`` (the list must
    not be empty) and maps the class named ``'-'`` to ``'NO'``; any other label
    is returned unchanged.
    """
    top_label = hard_values[0]['label']
    return 'NO' if top_label == '-' else top_label

In [44]:
# Derive the submission label for every test tweet from its hard prediction
test['value'] = test['hard'].apply(convert_label)

# Constant test-case tag carried by every submission record.
# NOTE(review): the tag is 'EXIST2024' while the test file loaded earlier is
# named EXIST2023_test_clean.json -- confirm this is intended.
test['test_case'] = 'EXIST2024'

# Keep only the submission fields and expose the tweet id under the name 'id'
submission_columns = ['id_EXIST', 'value', 'test_case']
final_df = test[submission_columns].rename(columns={'id_EXIST': 'id'})

# One dictionary per row, ready for JSON serialisation
result_dicts = final_df.to_dict(orient='records')

final_df

Unnamed: 0,id,value,test_case
0,500001,NO,EXIST2024
1,500002,NO,EXIST2024
2,500003,NO,EXIST2024
3,500004,JUDGEMENTAL,EXIST2024
4,500005,NO,EXIST2024
...,...,...,...
2071,600974,DIRECT,EXIST2024
2072,600975,REPORTED,EXIST2024
2073,600976,REPORTED,EXIST2024
2074,600977,DIRECT,EXIST2024


In [46]:
import json

# Serialise the hard-label records (`result_dicts`) to the submission file
file_path = '/kaggle/working/task2_hard_UNED_1.json'

# indent=4 only affects readability of the written file
serialised = json.dumps(result_dicts, indent=4)
with open(file_path, 'w') as out_handle:
    out_handle.write(serialised)

print(f"Data saved to {file_path}")


Data saved to /kaggle/working/task2_hard_UNED_1.json


In [51]:
def process_soft_entry(soft_values, id_value):
    """
    Build one soft-label submission record from a classifier score list.

    Parameters:
        soft_values (list): Nested list whose first element is a list of
            ``{'label': str, 'score': float}`` dicts, one per class.
        id_value: Identifier stored under ``"id"`` in the record.

    Returns:
        dict: ``{"id": ..., "value": {...}, "test_case": "EXIST2024"}`` where
        ``value`` maps 'NO', 'DIRECT', 'REPORTED' and 'JUDGEMENTAL' (in that
        order) to their scores. The class named '-' is reported as 'NO'.
        A class absent from ``soft_values`` keeps the value ``None``; labels
        outside those four are ignored.
    """
    # Fixed key order of the output record
    value = {'NO': None, 'DIRECT': None, 'REPORTED': None, 'JUDGEMENTAL': None}

    for item in soft_values[0]:  # Access the first element since it's a nested list
        label = 'NO' if item['label'] == '-' else item['label']
        if label in value:
            # Keep the score at full float precision. Formatting it to a
            # 20-decimal string and parsing it back only discards digits, and
            # the JSON writer chooses the textual float representation anyway.
            value[label] = float(item['score'])

    return {
        "id": id_value,
        "value": value,
        "test_case": "EXIST2024"
    }


In [52]:
# Build one soft-label record per test row from its score list and tweet id
result_dicts = test.apply(
    lambda row: process_soft_entry(row['soft'], row['id_EXIST']),
    axis=1,
)

# Plain list of dictionaries for JSON serialisation
result_list = list(result_dicts)

# Write the records to the soft-label submission file
file_path = '/kaggle/working/task2_soft_UNED_3.json'
serialised = json.dumps(result_list, indent=4)
with open(file_path, 'w') as out_handle:
    out_handle.write(serialised)

print(f"Data saved to {file_path}")

Data saved to /kaggle/working/task2_soft_UNED_3.json


In [55]:
def processsoft(soft_values):
    """
    Rewrite a classifier score list so the class named '-' is reported as 'NO'.

    Parameters:
        soft_values (list): Nested list whose first element is a list of
            ``{'label': str, 'score': float}`` dicts.

    Returns:
        list: A new nested list of the same shape, holding fresh dicts with the
        relabelled class name and the score as a float at full precision.
        Calling the function on its own output gives the same result again.
    """
    # Scores are kept as-is: formatting to a 20-decimal string and parsing it
    # back only discards digits and does not change how floats are displayed.
    updated_entries = [
        {
            'label': 'NO' if entry['label'] == '-' else entry['label'],
            'score': float(entry['score']),
        }
        for entry in soft_values[0]  # first element holds the per-class dicts
    ]
    return [updated_entries]

# Apply the function to each 'soft' entry
test['soft'] = test['soft'].apply(processsoft)

# Now, let's print the first entry to see the changes
print(test['soft'].iloc[0])


[[{'label': 'REPORTED', 'score': 8.3359345808276e-06}, {'label': 'JUDGEMENTAL', 'score': 1.489775877416832e-05}, {'label': 'DIRECT', 'score': 3.201689105480909e-05}, {'label': 'NO', 'score': 0.9999446868896484}]]


In [53]:
test['soft'].iloc[0]

[[{'label': 'REPORTED', 'score': 8.335934580827598e-06},
  {'label': 'JUDGEMENTAL', 'score': 1.4897758774168324e-05},
  {'label': 'DIRECT', 'score': 3.2016891054809093e-05},
  {'label': '-', 'score': 0.9999446868896484}]]