In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



In [None]:
import kagglehub

# Download the latest version of the Pima Indians Diabetes dataset.
# The returned value is the local directory that holds the dataset files;
# later cells reach the data through `path`.
path = kagglehub.dataset_download(
    "uciml/pima-indians-diabetes-database",
)

print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the 'pima-indians-diabetes-database' dataset.
Path to dataset files: /kaggle/input/pima-indians-diabetes-database


In [None]:
import os

# Dataset location: reuse `path` as returned by the kagglehub download cell
# instead of overwriting it with a hard-coded Kaggle/Colab mount point, so the
# notebook keeps working wherever the dataset happens to be cached.

# List the files available in the dataset directory.
print("Archivos encontrados:")
print(os.listdir(path))


Archivos encontrados:
['diabetes.csv']


In [None]:

# Build the full path to the CSV inside the dataset directory, then load it.
csv_path = os.path.join(path, "diabetes.csv")
df = pd.read_csv(csv_path)


In [None]:
# Drop every row that contains at least one NaN.
# NOTE(review): confirm that missing measurements in this CSV are really
# encoded as NaN; if they are stored as 0 instead, this call removes nothing.
df = df.dropna(axis=0, how="any")

In [None]:
# Variable separation: "Glucose" is the regression target (y); every other
# column of the frame is used as a predictor (x).
y = df["Glucose"]
x = df.drop(columns="Glucose")

In [None]:
# Standardization: learn per-column scaling statistics and apply them.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed later, so held-out rows contribute to the statistics.
# Consider fitting on the training rows only (e.g. inside a Pipeline).
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)


In [None]:
# Feature selection: univariate F-test scoring of each predictor against y.
# With k="all" no column is discarded; k can be lowered to keep only the
# top-scoring predictors.
selector = SelectKBest(f_regression, k="all")
x_selected = selector.fit(x_scaled, y).transform(x_scaled)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_selected,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 7. Regularized linear models: Lasso with alpha=0.1 and Ridge with alpha=1.0.
# Both are created unfitted here; later cells cross-validate and train them.
lasso = Lasso(alpha=0.1)
ridge = Ridge(alpha=1.0)


In [None]:
# 8. Cross-validation: 5 shuffled folds with a fixed seed, scored with R².
# NOTE(review): x_selected was standardized on the whole dataset before being
# split into folds, so each validation fold influenced the scaling it is
# evaluated with; wrapping scaler + model in a Pipeline would avoid that.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_options = dict(cv=kf, scoring="r2")
ridge_scores = cross_val_score(ridge, x_selected, y, **cv_options)
lasso_scores = cross_val_score(lasso, x_selected, y, **cv_options)

# Report the mean R² across the folds for each model.
print("Ridge CV R²:", np.mean(ridge_scores))
print("Lasso CV R²:", np.mean(lasso_scores))


Ridge CV R²: 0.32529478190613237
Lasso CV R²: 0.32627461531704965


In [None]:
# 9. Final training and evaluation: fit Ridge on the training split and
# predict the held-out test split (`fit` returns the estimator itself).
y_pred = ridge.fit(x_train, y_train).predict(x_test)

# Test-set metrics: R², RMSE (square root of the MSE) and MAE.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("\n🔍 Métricas de Evaluación (Ridge):")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)



🔍 Métricas de Evaluación (Ridge):
MAE: 20.606304531218107
RMSE: 26.541392891255125
R²: 0.29997093953233434


In [None]:
# Hyper-parameter search for Ridge's regularization strength.
# Requires `Pipeline` (sklearn.pipeline) and `GridSearchCV`
# (sklearn.model_selection) from the notebook's import cell.
#
# The pipeline standardizes inside each CV training fold before fitting Ridge.
# Note that x_train was already standardized upstream, so this scaler
# re-standardizes data that is already scaled.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Candidate alphas, addressed as <step name>__<parameter>.
params = {
    'ridge__alpha': [0.01, 0.1, 1, 10, 100]
}

# 5-fold grid search on the training split only, selecting by R².
grid = GridSearchCV(pipe, params, cv=5, scoring='r2')
grid.fit(x_train, y_train)

print("Mejor alpha:", grid.best_params_)
print("Mejor R² (CV):", grid.best_score_)

# Test-set evaluation. Predictions go to their own name so the `y_pred`
# produced by the plain Ridge model in the previous cell is not overwritten.
y_pred_grid = grid.predict(x_test)
print("R² test:", r2_score(y_test, y_pred_grid))

Mejor alpha: {'ridge__alpha': 10}
Mejor R² (CV): 0.3290078608282155
R² test: 0.30181931175226284
