## Instalación de dependencias

In [2]:
%pip install pandas numpy scikit-learn lightgbm matplotlib

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.4-cp313-cp313-win_amd64.whl.metadata (108 kB)
Collecting kiwisolver>=1.3


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Carga de librerías y archivos

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb

# Load the training data.
# The data folder can be overridden with the SABER_PRO_DATA_DIR environment variable so the
# notebook is not tied to a single machine; when the variable is unset or empty, the original
# local path is used, which keeps the previous behaviour.
DATA_DIR = os.environ.get("SABER_PRO_DATA_DIR") or (
    "C:\\Users\\Cristian David\\Desktop\\Todo\\U\\Modelos\\udea-ai-4-eng-20251-pruebas-saber-pro-colombia"
)
# The working directory is still changed on purpose: the Kaggle cells read "test.csv" and write
# the submission file using bare relative names.
os.chdir(DATA_DIR)
train = pd.read_csv("train.csv")

## Selección de variables y división de datos

In [3]:
# Columns treated as categorical predictors; this list is reused by the preprocessing and
# Kaggle cells.
categorical_features = [
    "ESTU_VALORMATRICULAUNIVERSIDAD",
    "ESTU_HORASSEMANATRABAJA",
    "FAMI_ESTRATOVIVIENDA",
    "FAMI_EDUCACIONPADRE",
    "FAMI_EDUCACIONMADRE",
    "ESTU_PRGM_DEPARTAMENTO",
    "FAMI_TIENEINTERNET",
    "ESTU_PAGOMATRICULAPROPIO",
    "ESTU_PRGM_ACADEMICO",
]

# Work on a copy so the raw `train` frame is left untouched.
temp = train.copy()

# Cast each categorical column that is present to string, then fill remaining nulls.
# NOTE(review): the cast runs before the fill, so on pandas versions where astype(str) turns
# NaN into the literal text 'nan' the fillna has nothing left to fill. The Kaggle cell uses the
# same order, so train and test stay consistent -- change both together or neither.
for col in categorical_features:
    if col not in temp.columns:
        continue
    as_text = temp[col].astype(str)
    temp[col] = as_text.fillna('missing')

# Features are everything except the target and the two identifier/period columns.
non_feature_columns = ["RENDIMIENTO_GLOBAL", "ID", "PERIODO"]
X = temp.drop(columns=non_feature_columns)
y = temp["RENDIMIENTO_GLOBAL"]

# First split: 80% of the rows for training, 20% held out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second split: 12.5% of the held-out rows become the local test set and the rest is used for
# validation (about 2.5% and 17.5% of all rows respectively).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    stratify=y_temp,
    random_state=42,
)

## Procesamiento para LightGBM

In [4]:
# Numeric predictors: every numeric-dtype column of X_train that is not listed as categorical.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
numeric_features = [name for name in numeric_columns if name not in categorical_features]

# Numeric columns are standardised; categorical columns are one-hot encoded into a dense
# matrix, and categories unseen during fit are ignored at transform time.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full pipeline: preprocessing followed by the LightGBM classifier.
classifier = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.1, random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)
pipeline.fit(X_train, y_train)

# Evaluate on the validation split.
y_val_pred = pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy validación: {val_accuracy:.4f}")
print(classification_report(y_val, y_val_pred))

# Evaluate on the local held-out test split.
y_test_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy test: {test_accuracy:.4f}")
print(classification_report(y_test, y_test_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023510 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2507
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 864
[LightGBM] [Info] Start training from score -1.371993
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.391216




Accuracy validación: 0.4327
              precision    recall  f1-score   support

        alto       0.56      0.63      0.59     30733
        bajo       0.46      0.55      0.50     30272
  medio-alto       0.32      0.27      0.29     30034
  medio-bajo       0.33      0.28      0.30     30148

    accuracy                           0.43    121187
   macro avg       0.42      0.43      0.42    121187
weighted avg       0.42      0.43      0.42    121187





Accuracy test: 0.4308
              precision    recall  f1-score   support

        alto       0.56      0.62      0.59      4391
        bajo       0.46      0.54      0.50      4325
  medio-alto       0.33      0.28      0.30      4290
  medio-bajo       0.32      0.28      0.30      4307

    accuracy                           0.43     17313
   macro avg       0.42      0.43      0.42     17313
weighted avg       0.42      0.43      0.42     17313



## Kaggle Submission

### Carga de datos

In [1]:
# Load the Kaggle test set. "test.csv" is a bare relative name, so it is resolved against the
# current working directory, and `pd` must already have been imported by an earlier cell; the
# NameError recorded for this cell indicates it was last executed before the imports cell.
test_data = pd.read_csv("test.csv")

NameError: name 'pd' is not defined

In [6]:
# Apply the same string cast / null fill to the categorical columns as was done for training,
# skipping any column that is absent from the test file.
for col in categorical_features:
    if col not in test_data.columns:
        continue
    as_text = test_data[col].astype(str)
    test_data[col] = as_text.fillna('missing')

# Drop the identifier/period columns if they are present and predict with the fitted pipeline.
X_test_kaggle = test_data.drop(columns=["ID", "PERIODO"], errors='ignore')
predictions = pipeline.predict(X_test_kaggle)

# Submission frame: the ID column plus the predicted label.
submission_df = test_data[["ID"]].assign(RENDIMIENTO_GLOBAL=predictions)
submission_df.to_csv("submission_lightgbm.csv", index=False)

print("Archivo de submission generado: submission_lightgbm.csv")
print(submission_df.head())



Archivo de submission generado: submission_lightgbm.csv
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185               bajo
