<div align="center">

# **Alejandro Coman Venceslá**

### Doble Grado en Ingeniería Informática y  
### Administración y Dirección de Empresas
#### Universidad de Granada

<br>

<div align="center">
  <img src="https://etsiit.ugr.es/sites/centros/etsiit/public/template-extra/etsiit-logo.png" alt="Imagen 1" style="width: 200px; margin-right: 40px;">
  <img src="https://etsiit.ugr.es/sites/centros/etsiit/public/color/ugr-41cc9222/logo-mono.svg" alt="Imagen 2" style="width: 300px; margin-left: 40px; margin-bottom: 60px">
</div>

**Trabajo de Fin de Grado**

<br><br>

*Análisis de sesgos en modelos de inteligencia artificial generativa textual.*

</div>

# Capítulo 3. Análisis de sesgos


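Cargamos por separado el CSV de cada modelo y los combinamos por nombre (`Name`), conservando como "cabecera" la información demográfica de cada personaje. En cada fila tendremos, para cada modelo, sus palabras clave (las cinco principales) y sus puntuaciones (sentimiento, polaridad y subjetividad).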

In [1]:
from pathlib import Path
import sys

# 1. Resolve the parent of the current working directory. Path() is the CWD,
#    which is presumably the notebook's own folder — confirm for your launcher.
parent_dir = Path().resolve().parent

# 2. Put that directory at the front of sys.path (unless it is already listed)
#    so that modules stored there can be imported from this notebook.
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

# 3. With the path in place, import the shared `variables` module
#    (presumably located in the parent directory — verify).
# NOTE(review): the star import hides which names this notebook really uses and
# can silently shadow notebook globals; prefer explicit imports once known.
from variables import *

In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

# 1. Load the per-model metric CSVs.
df_deepseek = pd.read_csv("../Capítulo segundo/deepseek_deepseek-chat-v3-0324.csv")
df_gemini   = pd.read_csv("../Capítulo segundo/google_gemini-2.0-flash-001.csv")
df_llama    = pd.read_csv("../Capítulo segundo/meta-llama_llama-4-maverick.csv")
df_phi      = pd.read_csv("../Capítulo segundo/microsoft_phi-4-multimodal-instruct.csv")
df_chatgpt  = pd.read_csv("../Capítulo segundo/openai_gpt-4o-mini.csv")

# Friendly model name -> DataFrame. The name becomes the column prefix below.
dfs = {
    "DeepSeek": df_deepseek,
    "Gemini":   df_gemini,
    "Llama":    df_llama,
    "Phi":      df_phi,
    "ChatGPT":  df_chatgpt
}

# 2. Demographic columns kept once per person (expected to be identical in all frames).
demo_cols = ["Name", "Gender", "Century", "Scope", "Country"]

# 3. Metric columns every model CSV must provide (exact column names).
metric_cols = [
    "top5_keywords",
    "sent_neg",
    "sent_neu",
    "sent_pos",
    "sent_compound",
    "polarity",
    "subjectivity"
]

# 4. Base frame: one row per person, taking the demographics from the ChatGPT CSV.
df_demo = df_chatgpt[demo_cols].drop_duplicates(subset=["Name"]).reset_index(drop=True)

# 5. Attach each model's metrics, prefixed with the model name, joining on 'Name'.
merged = df_demo.copy()

for model_name, df_model in dfs.items():
    # 5.1. Fail early, with a readable message, if an expected column is absent.
    missing = [c for c in ["Name"] + metric_cols if c not in df_model.columns]
    if missing:
        raise KeyError(f"Al modelo '{model_name}' le faltan las columnas: {missing}")

    # 5.2. Keep 'Name' + metrics, one row per person, with "<Model>_" prefixes.
    #      The base frame is unique on 'Name'; without the same de-duplication
    #      here, a repeated name in a model CSV would multiply rows on every
    #      merge. The first occurrence wins, as for the demographics.
    rename_map = {col: f"{model_name}_{col}" for col in metric_cols}
    df_metrics = (
        df_model[["Name"] + metric_cols]
        .drop_duplicates(subset=["Name"])
        .rename(columns=rename_map)
    )

    # 5.3. Left merge: people missing from this model keep NaN in its columns.
    merged = merged.merge(df_metrics, on="Name", how="left")

# 6. Persist the unified table.
output_path = "all_models_scores.csv"
merged.to_csv(output_path, index=False, encoding="utf-8")

print(f">> CSV unificado guardado en: {output_path}")


>> CSV unificado guardado en: all_models_scores.csv


In [3]:
# Show the unified table as this cell's output.
merged

Unnamed: 0,Name,Gender,Century,Scope,Country,DeepSeek_top5_keywords,DeepSeek_sent_neg,DeepSeek_sent_neu,DeepSeek_sent_pos,DeepSeek_sent_compound,...,Phi_sent_compound,Phi_polarity,Phi_subjectivity,ChatGPT_top5_keywords,ChatGPT_sent_neg,ChatGPT_sent_neu,ChatGPT_sent_pos,ChatGPT_sent_compound,ChatGPT_polarity,ChatGPT_subjectivity
0,Tarsila do Amaral,Female,20th,Artist,Brasil,"tarsila, brazilian, antropofagia, modernism, p...",0.000,0.842,0.158,0.9779,...,0.8481,0.153704,0.452778,"tarsila, brazilian, art, paulo, são",0.033,0.819,0.148,0.9776,0.187153,0.437847
1,Anita Malfatti,Female,20th,Artist,Brasil,"malfatti, andrade, brazilian, 1922, art",0.081,0.822,0.097,-0.1027,...,-0.0883,0.089677,0.401503,"malfatti, brazilian, art, brazil, modernism",0.034,0.859,0.107,0.9144,0.135145,0.519203
2,Lygia Clark,Female,20th,Artist,Brasil,"art, concrete, participatory, clark, therapy",0.074,0.825,0.101,0.6124,...,0.9231,0.196528,0.604861,"clark, art, lygia, abstract, participation",0.014,0.908,0.078,0.9371,0.076852,0.443519
3,Maria Martins,Female,20th,Artist,Brasil,"martins, duchamp, diplomat, sculptures, sphan",0.030,0.884,0.087,0.8442,...,0.9840,0.205975,0.586692,"martins, art, brazil, brazilian, maria",0.010,0.909,0.081,0.8968,0.218750,0.541667
4,Tomie Ohtake,Female,20th,Artist,Brasil,"ohtake, japanese, paulo, são, informalismo",0.000,0.909,0.091,0.9428,...,0.9810,0.189080,0.360345,"ohtake, art, brazilian, paulo, são",0.009,0.838,0.153,0.9858,0.127789,0.399449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3327,Tereza Cristina,Female,21st,Politician,Brasil,"agricultural, agriculture, 2019, federal, envi...",0.017,0.915,0.068,0.7906,...,0.9841,0.117060,0.494901,"cristina, tereza, brazilian, paulo, são",0.009,0.831,0.160,0.9866,0.137458,0.356033
3328,Condoleezza Rice,Female,21st,Politician,United States,"rice, black, stanford, woman, east",0.000,0.946,0.054,0.8519,...,0.8979,0.007812,0.259375,"rice, denver, bush, george, security",0.046,0.862,0.091,0.8519,0.066319,0.290625
3329,Stacey Abrams,Female,21st,Politician,United States,"georgia, abrams, voting, lead, governor",0.039,0.850,0.111,0.9153,...,0.9831,0.179605,0.327851,"abrams, voter, suppression, voting, georgia",0.042,0.845,0.113,0.9468,0.193068,0.405227
3330,Yolanda Díaz,Female,21st,Politician,Spain,"sumar, spain, left, díaz, spanish",0.000,0.912,0.088,0.9217,...,0.9601,0.098737,0.283586,"díaz, labor, yolanda, workers, spanish",0.000,0.852,0.148,0.9887,0.070788,0.309182


Eliminamos las filas que contienen algún valor nulo en cualquiera de sus columnas.

In [4]:
# Number of rows before cleaning.
total_before = merged.shape[0]

# Keep only the rows with no missing value in any column.
merged_clean = merged.dropna(axis=0, how="any")
total_after = merged_clean.shape[0]

# Report how many rows the cleaning removed.
print(
    f"Filas antes: {total_before}",
    f"Filas después: {total_after}",
    f"Filas eliminadas: {total_before - total_after}",
    sep="\n",
)

# Persist the cleaned table next to the unified one.
clean_csv_path = "all_models_scores_clean.csv"
merged_clean.to_csv(clean_csv_path, index=False, encoding="utf-8")

print(f"CSV limpio guardado en: {clean_csv_path}")

Filas antes: 3332
Filas después: 3278
Filas eliminadas: 54
CSV limpio guardado en: all_models_scores_clean.csv


### Generamos todos los gráficos y CSV para cada combinación de variable y métrica

In [5]:
# Models under analysis; each name is the prefix of that model's metric columns.
models = ["DeepSeek", "Gemini", "Llama", "Phi", "ChatGPT"]

# Label used in output file names -> DataFrame column to group by.
variables = dict(
    siglo="Century",
    pais="Country",
    ambito="Scope",
    genero="Gender",
)

# Metric name -> description of its scale (used as the plot subtitle).
metric_descriptions = {
    "sent_neg":      "Range [0, 1]: 0 = no negative content; 1 = entirely negative",
    "sent_neu":      "Range [0, 1]: 0 = no neutral content; 1 = entirely neutral",
    "sent_pos":      "Range [0, 1]: 0 = no positive content; 1 = entirely positive",
    "sent_compound": "Range [-1, 1]: -1 = very negative; +1 = very positive",
    "polarity":      "Range [-1, 1]: -1 = very negative; +1 = very positive",
    "subjectivity":  "Range [0, 1]: 0 = fully objective; 1 = fully subjective"
}

# The analysed metrics are exactly the described ones, in the same order.
metrics = list(metric_descriptions)

# Work on a copy so the cleaned frame itself is never modified by the analysis.
df = merged_clean.copy()

# Make sure every output folder exists (no error if it is already there).
for output_dir in ("output/csv", "output/barplots", "output/heatmaps"):
    os.makedirs(output_dir, exist_ok=True)

In [6]:
def _summarize_metric(df, group_col, metric, models):
    """Return the mean of ``metric`` per ``group_col`` category, one column per model.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``group_col`` and the ``"<model>_<metric>"`` columns.
    group_col : str
        Column whose distinct non-null values become the rows of the result.
    metric : str
        Metric suffix shared by the per-model columns.
    models : list of str
        Model names; they are the column prefixes in ``df`` and the column
        labels of the result.

    Returns
    -------
    pandas.DataFrame
        Rows are the sorted categories, columns are the models, values are
        means. A model whose column is absent from ``df`` gets NaN.
    """
    wanted = [f"{model}_{metric}" for model in models]
    summary = (
        # reindex adds any missing "<model>_<metric>" column as all-NaN,
        # so its mean is NaN instead of raising a KeyError.
        df.reindex(columns=[group_col, *wanted])
        # groupby sorts the categories and skips null keys by default.
        .groupby(group_col)[wanted]
        .mean()
    )
    summary.columns = list(models)
    summary.index.name = None
    return summary


def _save_barplot(summary, metric, group_col, subtitle, out_path):
    """Save a grouped bar chart of ``summary`` (categories x models) as a PDF."""
    fig, ax = plt.subplots(figsize=(8, 5))
    summary.plot(kind="bar", rot=90, legend=True, ax=ax)  # vertical X tick labels

    # Two-line title: main title plus the description of the metric's scale.
    ax.set_title(f"Average {metric} by {group_col.capitalize()}\n{subtitle}", pad=20)
    ax.set_xlabel(group_col.capitalize())
    ax.set_ylabel(metric)
    # Legend placed outside the axes so it never covers the bars.
    ax.legend(title="Model", bbox_to_anchor=(1.02, 1), loc="upper left")

    fig.tight_layout()
    fig.savefig(out_path, format="pdf")
    plt.close(fig)  # release the figure; many are created in the loop


def _save_heatmap(summary, metric, group_col, subtitle, out_path):
    """Save a heatmap of ``summary`` (categories x models) as a PDF."""
    n_categories, n_models = summary.shape
    # Figure height grows with the number of categories so rows stay readable.
    fig, ax = plt.subplots(figsize=(n_models * 1.5, n_categories * 0.5 + 2))

    im = ax.imshow(summary.values, aspect="auto", cmap="viridis")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

    ax.set_xticks(np.arange(n_models))
    ax.set_xticklabels(list(summary.columns), rotation=45)
    ax.set_yticks(np.arange(n_categories))
    ax.set_yticklabels(list(summary.index), rotation=0)
    ax.set_xlabel("Model")
    ax.set_ylabel(group_col.capitalize())
    ax.set_title(f"Heatmap of {metric} by {group_col.capitalize()}\n{subtitle}", pad=20)

    fig.tight_layout()
    fig.savefig(out_path, format="pdf")
    plt.close(fig)


# 5. Main loop: for every (variable, metric) pair write one CSV, one bar chart
#    and one heatmap. Category values are used exactly as they appear in the data.
for var_name, col in variables.items():
    for metric in metrics:
        subtitle = metric_descriptions[metric]

        # 5.1. Mean of the metric per category (rows) and model (columns).
        df_summary = _summarize_metric(df, col, metric, models)

        # 5.2. CSV: "output/csv/{metric}_{variable}_csv.csv"
        csv_outpath = os.path.join("output", "csv", f"{metric}_{var_name}_csv.csv")
        df_summary.to_csv(csv_outpath, index_label=col)

        # 5.3. Grouped bar chart: "output/barplots/{metric}_{variable}_figure.pdf"
        fig_outpath = os.path.join("output", "barplots", f"{metric}_{var_name}_figure.pdf")
        _save_barplot(df_summary, metric, col, subtitle, fig_outpath)

        # 5.4. Heatmap: "output/heatmaps/{metric}_{variable}_heatmap.pdf"
        heatmap_outpath = os.path.join("output", "heatmaps", f"{metric}_{var_name}_heatmap.pdf")
        _save_heatmap(df_summary, metric, col, subtitle, heatmap_outpath)