## Preprocesamiento de los datos

In [1]:
import re
import json
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
import numpy as np
import pandas as pd

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

df_train = pd.read_csv('data/train.csv')

def convertir_a_lista(texto):
    try:
        # Attempt to parse the string as JSON
        return json.loads(texto)
    except json.JSONDecodeError:
        # If it fails, return the text wrapped in a list
        return [texto]  # Wrap the text as a list to maintain consistency
    
STOP_WORDS = set(stopwords.words('english'))

extra_stopwords = {'use', 'used', 'data', 'help', 'need', 'time', 'may', 'one', 'would', 'could', 'like', 'also', 'using', 'make', 'please', 'i', 'write'}

# Combine NLTK stopwords with the additional unwanted words
STOPWORDS = STOP_WORDS.union(extra_stopwords)

def limpiar_texto(texto: str) -> str:
    if not texto:
        return ""
    
    # Vectorized operations for text cleaning
    texto = texto.lower()
    texto = re.sub(r"http\S+|www\S+|https\S+|[#@']", "", texto, flags=re.MULTILINE)  # Remove URLs, hashtags, etc.
    texto = texto.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII
    texto = texto.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    # Tokenization and stopword removal using the precomputed stopwords
    tokens = [word for word in word_tokenize(texto) if word not in STOP_WORDS]

    texto_limpio = ' '.join(tokens)
    return texto_limpio

def procesar_respuesta(responses: pd.Series) -> pd.Series:
    # Convert LaTeX to text and then clean the list without apply
    responses = responses.str.replace(r'\\(begin|end)\{itemize\}', '', regex=True)\
                         .str.replace(r'\\item\s*', '\n', regex=True)\
                         .str.replace(r'\\[a-zA-Z]+\{[^}]*\}', '', regex=True)\
                         .str.replace(r'\s+', ' ', regex=True).str.strip()
    
    # Convert JSON strings to lists with error handling
    responses = responses.map(convertir_a_lista)
    
    # Clean the text inside the lists using numpy's vectorize to avoid apply
    responses = responses.map(lambda lista: np.vectorize(limpiar_texto)(lista) if isinstance(lista, list) else lista)
    
    return responses

def limpiar_df(df: pd.DataFrame) -> pd.DataFrame:
    # Vectorized operations for cleaning prompts and responses
    df['prompt'] = df['prompt'].apply(ast.literal_eval)
    df['prompt_limpio'] = df['prompt'].apply(lambda lista: [limpiar_texto(texto) for texto in lista])
    
    # Convert LaTeX to text and clean responses directly
    df['response_a_limpio'] = procesar_respuesta(df['response_a'])
    df['response_b_limpio'] = procesar_respuesta(df['response_b'])

    return df

df_train = limpiar_df(df_train)

import matplotlib.pyplot as plt
# Enable inline plotting
%matplotlib inline
# Cruce de preferencias con los modelos
cross_a = pd.crosstab(df_train['model_a'], df_train['winner_model_a'])
cross_b = pd.crosstab(df_train['model_b'], df_train['winner_model_b'])
# Concatenar los modelos A y B para evitar duplicados en los empates
all_models_tie = pd.concat([df_train[['model_a', 'winner_tie']],
                            df_train[['model_b', 'winner_tie']].rename(columns={'model_b': 'model_a'})])

# Contar los empates por modelo
tie_counts = all_models_tie[all_models_tie['winner_tie'] == 1]['model_a'].value_counts()

a_winners = df_train[['model_a', 'winner_model_a', 'winner_model_b', 'winner_tie']]
b_winners = df_train[['model_b', 'winner_model_a', 'winner_model_b', 'winner_tie']]

a_winners.rename(columns={'model_a': 'model', 'winner_model_a': 'win', 'winner_model_b': 'loss', 'winner_tie': 'tie'}, inplace=True)
b_winners.rename(columns={'model_b': 'model', 'winner_model_a': 'loss', 'winner_model_b': 'win', 'winner_tie': 'tie'}, inplace=True)

winners_df = pd.concat([a_winners, b_winners], ignore_index=True)

model_occurrences = winners_df['model'].value_counts()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Majix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Majix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Majix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a_winners.rename(columns={'model_a': 'model', 'winner_model_a': 'win', 'winner_model_b': 'loss', 'winner_tie': 'tie'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [2]:
import re
import json
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load data
df_train = pd.read_csv('data/train.csv')

# Function to safely convert JSON-like strings to lists
def convertir_a_lista(texto):
    if isinstance(texto, str):
        texto = texto.replace(r'\/', '/')
        try:
            return json.loads(texto)
        except json.JSONDecodeError:
            return [texto]
    elif isinstance(texto, (int, float)):
        return [str(texto)]
    elif isinstance(texto, list):
        return texto
    else:
        return [texto]

# Preprocess columns with potential JSON-like strings
df_train['prompt'] = df_train['prompt'].map(convertir_a_lista)
df_train['response_a'] = df_train['response_a'].map(convertir_a_lista)
df_train['response_b'] = df_train['response_b'].map(convertir_a_lista)

# Define stopwords
STOP_WORDS = set(stopwords.words('english'))
extra_stopwords = {'use', 'used', 'data', 'help', 'need', 'time', 'may', 'one', 'would', 'could', 'like', 'also', 'using', 'make', 'please', 'i', 'write'}
STOPWORDS = STOP_WORDS.union(extra_stopwords)

# Clean text function
def limpiar_texto(texto) -> str:
    # Convert any non-string input to a string
    if not isinstance(texto, str):
        texto = str(texto)
    
    # Lowercase and clean the text
    texto = texto.lower()
    texto = re.sub(r"http\S+|www\S+|https\S+|[#@']", "", texto, flags=re.MULTILINE)  # Remove URLs, hashtags, etc.
    texto = texto.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
    texto = texto.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    # Tokenization and stopword removal
    tokens = [word for word in word_tokenize(texto) if word not in STOPWORDS]
    return ' '.join(tokens)

# Process response columns, converting LaTeX and JSON strings to clean lists
def procesar_respuesta(responses: pd.Series) -> pd.Series:
    # Clean the text by removing LaTeX markers and extra whitespace
    responses = responses.str.replace(r'\\(begin|end)\{itemize\}', '', regex=True)\
                         .str.replace(r'\\item\s*', '\n', regex=True)\
                         .str.replace(r'\\[a-zA-Z]+\{[^}]*\}', '', regex=True)\
                         .str.replace(r'\s+', ' ', regex=True).str.strip()
    
    # Convert JSON strings to lists
    responses = responses.map(convertir_a_lista)
    
    # Apply `limpiar_texto` to each item in the list if the list is not empty
    responses = responses.map(lambda lista: np.vectorize(limpiar_texto)(lista) if isinstance(lista, list) and len(lista) > 0 else lista)
    
    return responses


# Flatten prompts and responses into individual rows
def flatten_prompts(df: pd.DataFrame) -> pd.DataFrame:
    flattened_data = []
    
    for _, row in df.iterrows():
        prompts = row['prompt']
        responses_a = row['response_a']
        responses_b = row['response_b']
        
        num_entries = min(len(prompts), len(responses_a), len(responses_b))
        
        for i in range(num_entries):
            flattened_data.append({
                'id': row['id'],
                'model_a': row['model_a'],
                'model_b': row['model_b'],
                'prompt': prompts[i],
                'response_a': responses_a[i],
                'response_b': responses_b[i],
                'winner_model_a': row['winner_model_a'],
                'winner_model_b': row['winner_model_b'],
                'winner_tie': row['winner_tie']
            })

    return pd.DataFrame(flattened_data)

# Apply flattening function
df_train = flatten_prompts(df_train)

# Clean flattened DataFrame
def limpiar_df(df: pd.DataFrame) -> pd.DataFrame:
    df['prompt'] = df['prompt'].apply(limpiar_texto)
    df['response_a'] = procesar_respuesta(df['response_a'])
    df['response_b'] = procesar_respuesta(df['response_b'])
    return df

df_train = limpiar_df(df_train)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Majix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Majix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df_train.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,morally right try certain percentage females m...,[question whether morally right aim certain pe...,[ai dont personal beliefs opinions however tel...,1,0,0
1,30192,gpt-4-1106-preview,gpt-4-0613,ok pineapple belong pizza relax give fun answer,[ah ageold culinary conundrum divided nations ...,[ai dont eat dont personal preferences world p...,1,0,0
2,53567,koala-13b,gpt-4-0613,difference marriage license marriage certificate,[marriage license legal document allows couple...,[marriage license marriage certificate two dif...,0,1,0
3,53567,koala-13b,gpt-4-0613,get quick possible california,[want get marriage license marriage certificat...,[california general steps obtain marriage lice...,0,1,0
4,53567,koala-13b,gpt-4-0613,minimal get 1 day 10 days,[minimal get marriage license marriage certifi...,[marriage license california issued immediatel...,0,1,0


## Construcción del modelo

In [7]:
%pip install tensorflow 
%pip install keras
%pip install keras_nlp

import os
os.environ["KERAS_BACKEND"] = "jax"

import keras
import tensorflow as tf
from tensorflow import keras
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import keras_nlp
import numpy as np
from sklearn.model_selection import train_test_split

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Derive target column based on `winner_model_a`, `winner_model_b`, and `winner_tie`
def create_target(df):
    conditions = [
        (df["winner_model_a"] == 1),  # Model A chosen
        (df["winner_model_b"] == 1),  # Model B chosen
        (df["winner_tie"] == 1)       # Tie
    ]
    choices = [0, 1, 2]  # 0: Model A, 1: Model B, 2: Tie
    df["target"] = np.select(conditions, choices, default=-1)
    return df

# Apply the target creation
df_train = create_target(df_train)

# Prepare input and output for the split
X = df_train[['prompt', 'response_a', 'response_b']]
y = df_train['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to preprocess text with sentiment
def preprocess_with_sentiment(text):
    sentiment = analyzer.polarity_scores(text)["compound"]
    return {"text": text, "sentiment": tf.constant([[sentiment]], dtype=tf.float32)}

# Function to prepare the model input for a batch of data
def prepare_model_input(df):
    texts, sentiments = [], []
    for _, row in df.iterrows():
        # Ensure prompt is a string and handle NaN values by replacing with an empty string
        prompt = str(row["prompt"]) if pd.notnull(row["prompt"]) else ""
        
        # Flatten response_a and response_b if they are lists
        response_a_text = " ".join(row["response_a"]) if isinstance(row["response_a"], list) else str(row["response_a"])
        response_b_text = " ".join(row["response_b"]) if isinstance(row["response_b"], list) else str(row["response_b"])
        
        # Handle NaN values
        response_a_text = response_a_text if response_a_text != 'nan' else ""
        response_b_text = response_b_text if response_b_text != 'nan' else ""
        
        # Concatenate prompt with responses
        response_a = preprocess_with_sentiment(prompt + " " + response_a_text)
        response_b = preprocess_with_sentiment(prompt + " " + response_b_text)
        
        # Append prompt + responses and their sentiments
        texts.append([response_a["text"], response_b["text"]])
        sentiments.append([response_a["sentiment"], response_b["sentiment"]])
        
    return {"text": texts, "sentiment": tf.concat(sentiments, axis=1)}

# Prepare training and testing data
train_inputs = prepare_model_input(X_train)
test_inputs = prepare_model_input(X_test)


# Convert labels to categorical format
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=3)

# Model building with keras_nlp
classifier = keras_nlp.models.DebertaV3Classifier.from_preset(
    "deberta_v3_base_en",
    num_classes=3,
)

# Define custom inputs
input_layers = {
    "text": keras.Input(shape=(2,), dtype=tf.string, name="text"),
    "sentiment": keras.Input(shape=(2, 1), dtype=tf.float32, name="sentiment")
}

# Extract embeddings from DebertaV3Classifier
deberta_embeddings = classifier.backbone(input_layers["text"])

# Global average pooling for embeddings
pooled_embeddings = tf.reduce_mean(deberta_embeddings, axis=1)

# Concatenate sentiment scores with embeddings
combined = tf.concat([pooled_embeddings, tf.squeeze(input_layers["sentiment"], axis=-1)], axis=-1)

# Final classification layer
output = keras.layers.Dense(3, activation="softmax", name="classifier")(combined)

# Define and compile model
model = keras.Model(inputs=input_layers, outputs=output)
model.compile(
    optimizer=keras.optimizers.Adam(5e-6),
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.02),
    metrics=[keras.metrics.CategoricalAccuracy(name="accuracy")]
)

# Model summary
print(model.summary())

# Training the model
model.fit(
    train_inputs,
    y_train_cat,
    validation_data=(test_inputs, y_test_cat),
    batch_size=8,
    epochs=3
)


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Majix\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Majix\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\Majix\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


ImportError: cannot import name 'models' from 'keras_nlp' (C:\Users\Majix\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras_nlp\__init__.py)