In [None]:
# 📦 Importar librerías necesarias
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [None]:
# Use the google.colab helper so files stored in Google Drive can be read.
from google.colab import drive
# Mount Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Location of the training CSV inside the mounted Drive; the file is read as ';'-separated.
path = "/content/drive/MyDrive/datasets/train.csv"
df = pd.read_csv(path, sep=";")

In [None]:
# 🔍 General information: print a concise summary of the DataFrame
# (index range, column names, dtypes and non-null counts).
df.info()

In [None]:
# 📊 Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments pandas summarizes only the numeric columns of a mixed frame.
df.describe()

In [None]:
# ❓ Missing values: number of NaN entries in each column
df.isna().sum()

In [None]:
# ⚠️ Boxplot of every numeric column, used to eyeball outliers
numeric_columns = df.select_dtypes(include=np.number)
ax = numeric_columns.boxplot(figsize=(15, 6))
plt.xticks(rotation=90)  # column names are long; rotate them so they do not overlap
ax.set_title('Boxplot de variables numéricas')
plt.show()

In [None]:
# 📈 Histograms: one 30-bin panel per column that pandas can plot
df.hist(figsize=(15, 10), bins=30)
plt.gcf().tight_layout()  # the grid created by df.hist is the current figure
plt.show()

In [None]:
# 🔥 Correlation heatmap over the numeric columns
correlation_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Matriz de Correlación')
plt.show()

In [None]:
# 🧽 Impute missing values: fill NaNs in each numeric column with that column's median.
# Non-numeric columns are left untouched because the medians only cover numeric columns.
# NOTE(review): the medians are computed over every row, before the train/test split
# further down, so held-out rows influence the fill values — confirm this is acceptable.
numeric_medians = df.median(numeric_only=True)
df.fillna(value=numeric_medians, inplace=True)

In [None]:
# 🚫 Remove outliers with a Z-score filter applied to the numeric FEATURES only.
#
# The label column is excluded from the filter on purpose: for a binary 0/1 label
# the rare class lies more than 3 standard deviations from the mean whenever its
# share is below 10%, so filtering on the label would delete that class entirely.
TARGET_COL = 'target'  # label column name used by the modelling cells below
Z_THRESHOLD = 3        # rows with any |z| at or above this value are discarded

numeric_features = (
    df.select_dtypes(include=np.number)
      .drop(columns=[TARGET_COL], errors='ignore')  # no-op if the label is non-numeric/absent
)

# nan_policy='omit' keeps a single missing value from turning a whole column into NaN.
z_scores = np.abs(
    stats.zscore(numeric_features.to_numpy(dtype=float), axis=0, nan_policy='omit')
)

# A z-score is NaN for missing cells and for constant columns (zero variance).
# `NaN >= threshold` is False, so such cells are never flagged; a plain
# `z < threshold` test would instead reject every row in those cases.
is_outlier = z_scores >= Z_THRESHOLD
df = df[~is_outlier.any(axis=1)]
df.shape

In [None]:
# 🔢 One-Hot Encoding a variables categóricas
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.head()

In [None]:
# ✂️⚖️ Train/test split followed by feature scaling.
#
# The split happens BEFORE the scaler is fitted: fitting StandardScaler on the
# full dataset would let the mean/std of the held-out rows leak into training
# and make the test metrics optimistic.
X = df_encoded.drop('target', axis=1)  # make sure 'target' is your target variable
y = df_encoded['target']

# stratify=y keeps the class proportions equal in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then reuse them.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix scaled with the train-only statistics; kept so that any
# later cell that still refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

In [None]:
# 🔁 Train and evaluate the candidate models with 5-fold cross-validation (accuracy).
RANDOM_STATE = 42  # same seed as the train/test split, so reruns give the same scores

models = {
    # max_iter is raised from the default of 100: warnings are silenced in this
    # notebook, so a non-converged fit would otherwise go unnoticed.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    # The tree and boosting models involve randomness; seed them for reproducibility.
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
    'LightGBM': LGBMClassifier(random_state=RANDOM_STATE)
}
for name, model in models.items():
    # cross_val_score clones the estimator for every fold, so `model` itself stays unfitted.
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f'{name}: Accuracy promedio = {scores.mean():.4f}')

In [None]:
# 🔍 Exhaustive hyper-parameter search for LightGBM (2 x 2 x 3 = 12 combinations, 3-fold CV)
params = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    num_leaves=[15, 31, 63],
)
grid_lgb = GridSearchCV(
    estimator=LGBMClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=3,
)
grid_lgb.fit(X_train, y_train)
print('Mejores parámetros LightGBM:', grid_lgb.best_params_)

In [None]:
# 📈 Evaluate the best estimator found by the grid search on the held-out test set
best_model = grid_lgb.best_estimator_
y_pred = best_model.predict(X_test)

# Metrics computed from the hard class predictions.
for label, metric in (('Accuracy:', accuracy_score), ('F1 Score:', f1_score)):
    print(label, metric(y_test, y_pred))

# ROC AUC needs scores rather than labels: take the probability in column 1.
positive_class_proba = best_model.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, positive_class_proba))

print(classification_report(y_test, y_pred))