<a href="https://colab.research.google.com/github/Agudelo18/UDEA-ai4eng-20252---Pruebas-Saber-Pro-Colombia/blob/main/99%20-%20modelo%20soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [None]:
# Point the Kaggle CLI at the current working directory for its configuration
# (it looks for the kaggle.json credentials file in KAGGLE_CONFIG_DIR).
os.environ["KAGGLE_CONFIG_DIR"] = os.curdir

In [None]:
# Download the competition data archive (a .zip) with the Kaggle CLI.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.41GB/s]


In [None]:
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


In [None]:

# 3 - LOAD THE INITIAL DATA
# Read the train and test datasets extracted from the competition archive.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# 4 - APPLY MAPPINGS AND CONVERSIONS
# Convert ordinal / binary text categories into numbers. Series.map turns any
# value that is not a key of the mapping into NaN, so the keys below must match
# the raw labels exactly; NaNs produced here are filled later by the numeric
# median imputer.

# Ordinal tuition brackets in 'E_VALORMATRICULAUNIVERSIDAD', encoded as the
# midpoint of each bracket (in millions); the open-ended top bracket uses 7.5.
# NOTE: the two brackets between 4 and 7 million previously had French text in
# their keys and therefore never matched the Spanish labels.
tuition_average_mapping = {
    "No pagó matrícula": 0.0,
    "Menos de 500 mil": 0.25,
    "Entre 500 mil y menos de 1 millón": 0.75,
    "Entre 1 millón y menos de 2.5 millones": 1.75,
    "Entre 2.5 millones y menos de 4 millones": 3.25,
    "Entre 4 millones y menos de 5.5 millones": 4.75,
    "Entre 5.5 millones y menos de 7 millones": 6.25,
    "Más de 7 millones": 7.5,
}

# Ordinal housing stratum in 'F_ESTRATOVIVIENDA'; "Sin Estrato" is deliberately
# mapped to a missing value.
housing_strata_mapping = {"Estrato 1": 1, "Estrato 2": 2, "Estrato 3": 3, "Estrato 4": 4, "Estrato 5": 5, "Estrato 6": 6, "Sin Estrato": None}

# Yes/No columns 'F_TIENEINTERNET' and 'E_PAGOMATRICULAPROPIO' become 1/0.
binary_mappings = {"Si": 1, "No": 0}

# Apply every feature mapping identically to both datasets so train and test
# are guaranteed to share the same encoding.
column_mappings = {
    'E_VALORMATRICULAUNIVERSIDAD': tuition_average_mapping,
    'F_ESTRATOVIVIENDA': housing_strata_mapping,
    'F_TIENEINTERNET': binary_mappings,
    'E_PAGOMATRICULAPROPIO': binary_mappings,
}
for frame in (train_df, test_df):
    for column, mapping in column_mappings.items():
        frame[column] = frame[column].map(mapping)

# Encode the target 'RENDIMIENTO_GLOBAL' (present only in train) as integers
# ordered from lowest to highest performance.
target_mapping = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}
train_df['RENDIMIENTO_GLOBAL'] = train_df['RENDIMIENTO_GLOBAL'].map(target_mapping)

# 5 - SPLIT FEATURES AND TARGET
# y_train is the encoded target; X_train / X_test are every remaining column
# except the row identifier 'ID'.
target_column = 'RENDIMIENTO_GLOBAL'
y_train = train_df[target_column]
X_train = train_df.drop(columns=['ID', target_column])
X_test = test_df.drop(columns=['ID'])

# 6 - IDENTIFY THE COLUMNS FOR PREPROCESSING
# Numeric columns are those stored as float64/int64; categorical columns are
# those stored as object (strings).
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# 7 - PREPROCESSING PIPELINE
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_cols),
        ('cat', one_hot_encoder, categorical_cols),
    ]
)

# 8 - MODEL AND VALIDATION
# LightGBM classifier with a fixed seed for reproducibility; verbose=-1
# silences the per-fit [Info] log lines that otherwise flood the cell output.
model = lgb.LGBMClassifier(random_state=42, verbose=-1)

# Full pipeline: preprocessing followed by the classifier, so the preprocessing
# is re-fitted inside every cross-validation fold.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Stratified 5-fold cross-validation to estimate accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')

# Report the estimate: the scores were previously computed (five full fits)
# and then discarded without ever being shown.
print("Accuracy por fold:", [round(float(score), 4) for score in cv_scores])
print(f"Accuracy media en validación cruzada: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# 9 - TRAIN THE MODEL / 10 - PREDICT ON THE TEST SET
# Fit the pipeline on the complete training set (fit returns the fitted
# pipeline itself), then predict the encoded class for every test row.
predictions = pipeline.fit(X_train, y_train).predict(X_test)

# 11 - CONVERT THE PREDICTIONS BACK TO THE ORIGINAL LABELS
# Integer code -> label; the position in the list is the code used for training.
target_inverse_mapping = dict(enumerate(['bajo', 'medio-bajo', 'medio-alto', 'alto']))
predictions_labels = list(map(target_inverse_mapping.__getitem__, predictions))

# 12 - CREATE THE SUBMISSION FILE
# Two columns: the test row ID and its predicted label, written without the index.
submission_df = test_df[['ID']].assign(RENDIMIENTO_GLOBAL=predictions_labels)
submission_df.to_csv("submission.csv", index=False)

# Confirm that the submission file was written and show its shape.
print("Fichero submission.csv generado con éxito")
print("Dimensiones de la clasificación submission.csv (filas, columnas) ", submission_df.shape)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2504
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 853
[LightGBM] [Info] Start training from score -1.387096
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.371986


  X = _LGBMValidateData(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2509
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 855
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.371993


  X = _LGBMValidateData(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2510
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 856
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.371993


  X = _LGBMValidateData(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2498
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 850
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.371993


  X = _LGBMValidateData(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.170425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2509
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 856
[LightGBM] [Info] Start training from score -1.387096
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395026
[LightGBM] [Info] Start training from score -1.371993


  X = _LGBMValidateData(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.134676 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 692500, number of used features: 894
[LightGBM] [Info] Start training from score -1.387092
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395031
[LightGBM] [Info] Start training from score -1.371991


  X = _LGBMValidateData(


Fichero submission.csv generado con éxito
Dimensiones de la clasificación submission.csv (filas, columnas)  (296786, 2)
