<a href="https://colab.research.google.com/github/6Santiago9/Entregas/blob/main/99%20-%20modelo%20soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importación de Librerías

In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Configuración Kaggle

In [None]:
# Kaggle credentials.
# Secrets must never be hardcoded in a notebook: everything committed here is
# public. Values already present in the environment are reused; otherwise the
# user is prompted (the API key is read without echoing it).
# NOTE(review): the API key that used to be written on this line has been
# published and should be revoked from the Kaggle account settings.
from getpass import getpass

if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ').strip()
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ').strip()

In [None]:
# Create kaggle.json from the KAGGLE_USERNAME / KAGGLE_KEY environment variables.
# The path is derived from the current user's home directory (the directory
# and the file used to disagree: `~` vs. a hardcoded `/root`), and the payload
# is serialised with `json` so special characters in the values are escaped.
import json

kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json, 'w') as f:
    json.dump(
        {"username": os.environ['KAGGLE_USERNAME'], "key": os.environ['KAGGLE_KEY']},
        f,
    )

# The file holds a secret: keep it readable/writable by the owner only.
os.chmod(kaggle_json, 0o600)

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600)
!chmod 600 /root/.kaggle/kaggle.json

# Descarga de Dataset

In [None]:
# Download the competition data archive with the Kaggle CLI
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 670MB/s]


In [None]:
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


# Cargar el train y test

In [None]:
# Load the training and test splits from the extracted CSV files
df, test = (pd.read_csv(nombre_csv) for nombre_csv in ("train.csv", "test.csv"))

In [None]:
# Dataset sizes as (rows, columns): train first, then test
print(*(tabla.shape for tabla in (df, test)))

(692500, 21) (296786, 20)


# Limpieza

In [None]:
# 1. Find columns with low variability
def columnas_baja_variabilidad(data, threshold=0.99):
    """Return the names of the columns dominated by a single value.

    A column is reported when the relative frequency of its most common
    value (missing values are counted as a value of their own) is greater
    than or equal to ``threshold``.

    Args:
        data: DataFrame whose columns are inspected.
        threshold: Minimum share, between 0 and 1, that the most frequent
            value must reach for the column to be reported.

    Returns:
        list: Names of the matching columns, in ``data.columns`` order.
    """
    return [
        nombre
        for nombre in data.columns
        if data[nombre].value_counts(normalize=True, dropna=False).max() >= threshold
    ]

# 2. Detect low-variability columns on the training set.
# The target and the row identifier are never candidates for removal: the
# target only exists in train, so flagging it would also break the drop on test.
cols_protegidas = {"RENDIMIENTO_GLOBAL", "ID"}
cols_baja_var = [
    c for c in columnas_baja_variabilidad(df, threshold=0.99)
    if c not in cols_protegidas
]
print("Columnas con poca variabilidad:", len(cols_baja_var))
print(cols_baja_var[:20])  # first 20

# 3. Remove them from train and test.
# Reassigning (instead of inplace=True) keeps the step explicit, and
# errors="ignore" tolerates a flagged column that is absent from test.
df = df.drop(columns=cols_baja_var)
test = test.drop(columns=cols_baja_var, errors="ignore")

print("Nuevo shape train:", df.shape)
print("Nuevo shape test:", test.shape)

Columnas con poca variabilidad: 1
['E_PRIVADO_LIBERTAD']
Nuevo shape train: (692500, 20)
Nuevo shape test: (296786, 19)


In [None]:
# 4. Drop columns with too many missing values

umbral_nan = 0.60  # maximum tolerated fraction of NaN per column

# The NaN ratio is measured on the training features only; the target is
# excluded so it can never be selected for removal.
proporcion_nan = df.drop(columns=["RENDIMIENTO_GLOBAL"], errors="ignore").isna().mean()
cols_muchos_nan = proporcion_nan[proporcion_nan > umbral_nan].index.tolist()

print("Columnas con demasiados NaN:", len(cols_muchos_nan))
print(cols_muchos_nan[:20])

# Reassign instead of mutating in place; errors="ignore" tolerates a flagged
# column that does not exist in test.
df = df.drop(columns=cols_muchos_nan)
test = test.drop(columns=cols_muchos_nan, errors="ignore")

print("Shape después de eliminar columnas con NaN:")
print("Train:", df.shape)
print("Test:", test.shape)


Columnas con demasiados NaN: 0
[]
Shape después de eliminar columnas con NaN:
Train: (692500, 20)
Test: (296786, 19)


In [None]:
# 5. Impute missing values

# Target column (never imputed)
target = "RENDIMIENTO_GLOBAL"

# Categorical (object) feature columns, excluding the label
cat_cols = [c for c in df.select_dtypes(include=['object']).columns if c != target]

# Numeric columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Numeric imputation: the median is computed on train and reused for test.
# The filled column is assigned back explicitly: the chained
# `df[col].fillna(..., inplace=True)` form operates on a temporary object, is
# deprecated, and stops modifying the frame in pandas 3.0.
for col in num_cols:
    mediana = df[col].median()
    df[col] = df[col].fillna(mediana)
    test[col] = test[col].fillna(mediana)

# Categorical imputation: missing values become their own explicit category
for col in cat_cols:
    df[col] = df[col].fillna("DESCONOCIDO")
    test[col] = test[col].fillna("DESCONOCIDO")

print("Imputación lista ✔")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mediana, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(mediana, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves 

Imputación lista ✔


In [None]:
# 6. Give train and test the same set of categories per column

# Object-typed feature columns (the label is excluded)
cat_cols = [
    c for c in df.select_dtypes(include=['object']).columns
    if c != 'RENDIMIENTO_GLOBAL'
]

for col in cat_cols:
    # Cast to str first so mixed-type values compare and sort consistently
    df[col] = df[col].astype(str)
    test[col] = test[col].astype(str)

    # Union of the values seen in either split. Sorting makes the category
    # order deterministic (iterating a set of strings can change order
    # between interpreter runs).
    categorias = sorted(set(df[col].unique()) | set(test[col].unique()))

    df[col] = pd.Categorical(df[col], categories=categorias)
    test[col] = pd.Categorical(test[col], categories=categorias)

print("✔ Categorías unificadas entre train y test")


✔ Categorías unificadas entre train y test


In [None]:
from sklearn.preprocessing import LabelEncoder

# 7. Label-encode every categorical feature column

label_encoders = {}  # column name -> fitted LabelEncoder
cat_cols = [
    c for c in df.select_dtypes(include=["object", "category"]).columns
    if c != "RENDIMIENTO_GLOBAL"  # never encode the target here
]

for col in cat_cols:
    # String view of each split, computed once and reused for fit and transform
    valores_train = df[col].astype(str)
    valores_test = test[col].astype(str)

    # Fit on train + test together so no category is unknown at transform time
    codificador = LabelEncoder()
    codificador.fit(np.concatenate([valores_train.values, valores_test.values]))

    df[col] = codificador.transform(valores_train)
    test[col] = codificador.transform(valores_test)

    label_encoders[col] = codificador

print("✔ Todas las variables categóricas fueron codificadas.")

✔ Todas las variables categóricas fueron codificadas.


In [None]:
# 8. Merge the uninformative parental-education answers into 'Ninguno'.
#
# At this point the categorical columns have already been label-encoded to
# integer codes, so comparing their values with the strings
# 'No sabe' / 'No Aplica' never matches and the grouping silently did nothing.
# When the column is numeric and its fitted encoder is available, the labels
# are translated to codes before grouping; otherwise the raw strings are used.
def _agrupar_sin_informacion(serie, encoder=None):
    """Return `serie` with 'No sabe' / 'No Aplica' merged into 'Ninguno'.

    Args:
        serie: Column holding either the raw string labels or the integer
            codes produced by `encoder`.
        encoder: Fitted LabelEncoder used to encode `serie`, or None when the
            column still contains strings.

    Returns:
        pandas.Series with the same index as `serie`.
    """
    etiquetas_sin_info = ['No sabe', 'No Aplica']

    if encoder is None or not pd.api.types.is_numeric_dtype(serie):
        # Raw labels: plain string replacement
        serie = serie.astype(object)
        return serie.where(~serie.isin(etiquetas_sin_info), 'Ninguno')

    # Encoded labels: a label's code is its position in encoder.classes_
    codigo_de = {str(etiqueta): codigo for codigo, etiqueta in enumerate(encoder.classes_)}
    codigos_sin_info = [codigo_de[e] for e in etiquetas_sin_info if e in codigo_de]
    if not codigos_sin_info:
        return serie
    # Fall back to the first uninformative code if 'Ninguno' was never observed
    codigo_destino = codigo_de.get('Ninguno', codigos_sin_info[0])
    return serie.where(~serie.isin(codigos_sin_info), codigo_destino)


for _col in ['F_EDUCACIONMADRE', 'F_EDUCACIONPADRE']:
    _encoder = label_encoders.get(_col)
    df[_col] = _agrupar_sin_informacion(df[_col], _encoder)
    test[_col] = _agrupar_sin_informacion(test[_col], _encoder)

In [None]:
# 9. Encode the household / payment indicator columns as integer codes
from sklearn.preprocessing import LabelEncoder

cols_indicadoras = [
    'F_TIENELAVADORA',
    'F_TIENEINTERNET',
    'F_TIENEAUTOMOVIL',
    'E_PAGOMATRICULAPROPIO',
    'F_TIENECOMPUTADOR',
]

# One encoder per column, fitted on train + test together and applied to both.
# Fitting separately on each split (as before) can assign different codes to
# the same value whenever one split is missing a category.
# NOTE(review): codes follow the sorted order of the observed values; because
# missing answers were imputed as their own category, do not assume that
# 1 means "yes" and 0 means "no" -- check `le.classes_` for the real mapping.
for col in cols_indicadoras:
    le = LabelEncoder()
    le.fit(pd.concat([df[col], test[col]], ignore_index=True))
    df[col] = le.transform(df[col])
    test[col] = le.transform(test[col])

In [None]:
# 10. One-hot encode the multi-valued columns.
# A single get_dummies call per frame expands the listed columns in order and
# appends the indicator columns after the untouched ones.
columnas_one_hot = [
    'E_PRGM_DEPARTAMENTO',
    'F_EDUCACIONMADRE',
    'F_EDUCACIONPADRE',
    'E_VALORMATRICULAUNIVERSIDAD',
    'E_HORASSEMANATRABAJA',
    'F_ESTRATOVIVIENDA',
]

df = pd.get_dummies(df, columns=columnas_one_hot, dtype=int)
test = pd.get_dummies(test, columns=columnas_one_hot, dtype=int)

In [None]:
import pandas as pd
import lightgbm as lgb
from lightgbm import log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# --- Split features and target ---------------------------------------------
# `ID` is only a row identifier. It used to stay in X while being removed from
# the test features, so the alignment step re-created it as a constant 0
# column: the model was trained on a feature that carried no information at
# prediction time. It is now excluded from the features on both sides.
y = df["RENDIMIENTO_GLOBAL"]
X = df.drop(columns=["RENDIMIENTO_GLOBAL", "ID"], errors="ignore")

test_IDs = test["ID"]
X_test = test.drop(columns=["ID"])

# --- Encode the target as integers -------------------------------------------
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Clases codificadas:", dict(zip(le.classes_, le.transform(le.classes_))))

# --- Train / validation split (80 / 20) --------------------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, shuffle=True
)

# --- Model configuration -----------------------------------------------------
model = lgb.LGBMClassifier(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=len(le.classes_),

    # Many trees with a small learning rate
    n_estimators=1500,
    learning_rate=0.02,
    num_leaves=64,
    max_depth=-1,               # no explicit depth limit

    # Column / row subsampling (original note: reduces overfitting)
    feature_fraction=0.80,
    bagging_fraction=0.75,
    bagging_freq=5,

    # Regularisation
    min_data_in_leaf=50,
    lambda_l1=1.0,
    lambda_l2=1.0,

    random_state=42,
    n_jobs=-1
)

# --- Training ----------------------------------------------------------------
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="multi_logloss",
    callbacks=[log_evaluation(200)]  # log the validation metric every 200 iterations
)

# --- Validation accuracy -----------------------------------------------------
y_pred = model.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)

print(f"\n ACCURACY OPTIMIZADO: {acc:.5f}")

# --- Align test columns with the training columns ----------------------------
# One-hot encoding was applied to each split separately, so the column sets
# may differ. reindex adds the indicator columns missing from test as 0,
# drops test-only columns and puts everything in the training order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# --- Final predictions and submission file -----------------------------------
test_pred = model.predict(X_test)
test_pred_labels = le.inverse_transform(test_pred)  # back to the original class labels

submission = pd.DataFrame({
    "ID": test_IDs,
    "RENDIMIENTO_GLOBAL": test_pred_labels
})

submission.to_csv("submission1.csv", index=False)
print("\n submission1.csv generado correctamente!")

Clases codificadas: {'alto': np.int64(0), 'bajo': np.int64(1), 'medio-alto': np.int64(2), 'medio-bajo': np.int64(3)}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1564
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 89
[LightGBM] [Info] Start training from score -1.372285
[LightGBM] [Info] Start training from score -1.386915
[LightGBM] [Info] Start training from score -1.394559
[LightGBM] [Info] Start training from score -1.391565
[200]	valid_0's multi_logloss: 1.21564
[400]	valid_0's multi_logloss: 1.20082
[600]	valid_0's multi_logloss: 1.19639
[800]	valid_0's multi_logloss: 1.19454
[1000]	valid_0's multi_logloss: 1.19364
[1200]	valid_0's multi_logloss: 1.19263
[1400]	valid_0's multi_logloss: 1.19231

 ACCURACY OPTIMIZADO: 0.43885

 subm