# Explore here

In [50]:
# Your code here
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# modelling (requires scikit-learn)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

# hyperparameter search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# model persistence
from joblib import dump

In [51]:
# Remote CSV holding the demographic / health data used in this notebook.
archivo = "https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv"

# Download the file once. `df_original` stays untouched as a reference, while
# `df` is an independent working copy (previously the URL was fetched twice).
df_original = pd.read_csv(archivo, sep=",")
df = df_original.copy()

# Report the number of rows and columns.
datos, atributos = df.shape
print(f"Contamos con {datos} cantidad de datos y {atributos} atributos.")
print()

# Duplicated rows: boolean mask, then its count.
duplicados = df.duplicated()
num_duplicados = duplicados.sum()

print(f"Contamos con {num_duplicados} datos duplicados.")
print()


# Dtypes and non-null counts. The option raises the column limit so that
# df.info() lists every column instead of a summary.
pd.set_option('display.max_info_columns', 500)
df.info()


# Missing values per column, largest first (displayed as the cell output).
df.isnull().sum().sort_values(ascending=False)

Contamos con 3140 cantidad de datos y 108 atributos.

Contamos con 0 datos duplicados.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 108 columns):
 #    Column                                                                         Non-Null Count  Dtype  
---   ------                                                                         --------------  -----  
 0    fips                                                                           3140 non-null   int64  
 1    TOT_POP                                                                        3140 non-null   int64  
 2    0-9                                                                            3140 non-null   int64  
 3    0-9 y/o % of total pop                                                         3140 non-null   float64
 4    19-Oct                                                                         3140 non-null   int64  
 5    10-19 y/o % of total pop           

fips                      0
TOT_POP                   0
0-9                       0
0-9 y/o % of total pop    0
19-Oct                    0
                         ..
CKD_prevalence            0
CKD_Lower 95% CI          0
CKD_Upper 95% CI          0
CKD_number                0
Urban_rural_code          0
Length: 108, dtype: int64

In [52]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


Vamos a analizar la población con obesidad. Nuestra variable objetivo será la número 90, "Obesity_number".

In [53]:
# Text (object-dtype) columns only, kept for inspection.
object_columns = df.select_dtypes(include=['object'])

# Number of rows per state.
state_counts = df["STATE_NAME"].value_counts()

# Drop the rows of states that appear too rarely: a state is kept only when it
# occurs STRICTLY MORE than MIN_ROWS_PER_STATE times (a state with exactly
# that many rows is removed).
MIN_ROWS_PER_STATE = 5
estados_frecuentes = state_counts[state_counts > MIN_ROWS_PER_STATE].index
df_filtered = df[df["STATE_NAME"].isin(estados_frecuentes)]

# Display the filtered frame as the cell output.
df_filtered

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.134520,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3135,56037,43051,6104,14.178532,6326,14.694200,5359,12.448027,6577,15.277229,...,2098,8.9,8.3,9.6,2834,2.6,2.4,2.8,821,5
3136,56039,23081,2384,10.328842,2185,9.466661,2967,12.854729,4093,17.733200,...,928,7.2,6.5,8.0,1360,2.4,2.2,2.6,447,5
3137,56041,20299,3121,15.375142,3205,15.788955,2153,10.606434,2702,13.311001,...,1163,10.4,9.5,11.2,1500,3.0,2.8,3.2,430,5
3138,56043,7885,858,10.881420,1113,14.115409,715,9.067850,903,11.452124,...,506,11.3,10.3,12.1,686,3.4,3.2,3.7,207,6


In [54]:
#state_counts[state_counts > 5] filtra las categorías con más de 5 ocurrencias.
#.index extrae los nombres de estas categorías.
#df["STATE_NAME"].isin(...) crea un filtro booleano para conservar solo las filas donde STATE_NAME está en las categorías seleccionadas.
#df[df["STATE_NAME"].isin(...)] aplica este filtro al DataFrame original, generando uno filtrado.

In [55]:
# Rows per state that remain in the filtered frame.
df_filtered["STATE_NAME"].value_counts()

STATE_NAME
Texas             254
Georgia           159
Virginia          133
Kentucky          120
Missouri          115
Kansas            105
Illinois          102
North Carolina    100
Iowa               99
Tennessee          95
Nebraska           93
Indiana            92
Ohio               88
Minnesota          87
Michigan           83
Mississippi        82
Oklahoma           77
Arkansas           75
Wisconsin          72
Pennsylvania       67
Florida            67
Alabama            67
South Dakota       66
Louisiana          64
Colorado           64
New York           62
California         58
Montana            56
West Virginia      55
North Dakota       53
South Carolina     46
Idaho              44
Washington         39
Oregon             36
New Mexico         33
Utah               29
Alaska             27
Maryland           24
Wyoming            23
New Jersey         21
Nevada             17
Maine              16
Arizona            15
Vermont            14
Massachusetts      14

In [56]:
# Remove the "COUNTY_NAME" column from the filtered frame.
# From here on `df` refers to this filtered, county-less frame.
df = df_filtered.drop("COUNTY_NAME", axis="columns")


In [57]:
# Numeric columns only; show the first rows as the cell output.
numerica_columna = df.select_dtypes(include="number")
numerica_columna.head()


Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [58]:
# Split the data set into train and test samples.
# Target: "Obesity_number". Every remaining column of `df` is used as a feature.
# NOTE(review): if `df` holds other obesity-derived columns they stay in X -
# confirm that is intended.
y = df["Obesity_number"]
X = df.drop(columns=["Obesity_number"])

# 20% of the rows go to the test sample; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=35,
)

print(X_train.shape, X_test.shape)

(2500, 106) (626, 106)


In [59]:
# Standardise the numeric features.
num_variables = X_train.select_dtypes(include="number").columns

# The scaler learns its statistics from the training rows only, and the same
# fitted scaler is then applied to the test rows.
scaler = StandardScaler()

# Wrap the scaled arrays back into frames so the row index and column names
# of the original splits are preserved.
X_train_num_scal = pd.DataFrame(
    scaler.fit_transform(X_train[num_variables]),
    index=X_train.index,
    columns=num_variables,
)
X_test_num_scal = pd.DataFrame(
    scaler.transform(X_test[num_variables]),
    index=X_test.index,
    columns=num_variables,
)

X_train_num_scal.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
1314,-0.214815,-0.201426,-0.192796,0.503681,-0.200593,0.448347,-0.212185,-0.633212,-0.197932,-0.356831,...,-0.251835,-0.941786,-0.945892,-0.936223,-0.214651,-0.259431,-0.207095,-0.182082,-0.19919,0.898353
1546,-0.073574,-0.228568,-0.224247,0.291851,-0.232958,0.17376,-0.226739,-0.367752,-0.21442,0.11861,...,-0.228578,-0.140353,-0.189626,-0.12785,-0.223758,-0.084848,-0.019168,-0.02003,-0.223663,0.898353
930,-0.674279,-0.296146,-0.294352,-0.564016,-0.30538,-1.436997,-0.278597,-1.070893,-0.271421,-1.839054,...,-0.34505,1.061797,1.020399,1.172577,-0.294722,1.486405,1.48424,1.438438,-0.296311,0.898353
1814,0.318924,-0.187502,-0.179611,0.429574,-0.19047,0.15444,-0.193473,-0.265528,-0.185377,-0.210702,...,-0.224241,1.498943,1.511972,1.488897,-0.133394,1.660988,1.672167,1.60049,-0.141917,0.236437
2688,1.202244,-0.280212,-0.277552,0.127174,-0.289054,-0.469183,-0.254576,1.735674,-0.252496,1.549457,...,-0.331473,-0.140353,-0.038373,-0.233289,-0.279724,-0.608598,-0.582947,-0.668237,-0.283626,0.898353


In [60]:
# Categorical feature(s) to encode as integer codes.
cat_variables = ["STATE_NAME"]

# Work on copies so the raw splits are left untouched.
X_train_cat_le = X_train.copy()
X_test_cat_le = X_test.copy()

# OrdinalEncoder is the feature-side encoder (scikit-learn documents
# LabelEncoder for target values). Like LabelEncoder it assigns codes following
# the sorted category order, so the codes for known states are expected to be
# unchanged. A state that was not seen during fit is mapped to -1 instead of
# raising an error at transform time.
label_encoder_sn = OrdinalEncoder(
    dtype=np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Learn the categories from the training rows only.
label_encoder_sn.fit(X_train[cat_variables])

# Apply the fitted encoder to both splits. transform returns an (n_rows, 1)
# array, flattened here into a single column.
X_train_cat_le['STATE_NAME_le'] = label_encoder_sn.transform(X_train[cat_variables]).ravel()
X_test_cat_le['STATE_NAME_le'] = label_encoder_sn.transform(X_test[cat_variables]).ravel()

X_train_cat_le.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code,STATE_NAME_le
1314,27005,34371,4463,12.984784,4646,13.517209,3544,10.311018,3834,11.154753,...,10.5,9.6,11.4,2734,3.3,3.1,3.6,857,6,20
1546,29131,25336,3175,12.531576,3300,13.024945,2823,11.142248,3029,11.95532,...,12.7,11.6,13.7,2465,3.4,3.2,3.7,666,6,22
930,20089,2841,304,10.700458,288,10.137276,254,8.940514,246,8.658923,...,16.0,14.8,17.4,369,4.3,4.0,4.6,99,6,13
1814,35039,39006,5003,12.826232,5067,12.990309,4471,11.462339,4447,11.40081,...,17.2,16.1,18.3,5134,4.4,4.1,4.7,1304,5,28
2688,48335,8145,992,12.179251,967,11.872314,1444,17.728668,1170,14.364641,...,12.7,12.0,13.4,812,3.1,2.9,3.3,198,6,39


In [61]:
# Final design matrices: the scaled numeric features plus the encoded state
# column, aligned on the shared row index.
X_train_final = X_train_num_scal.join(X_train_cat_le['STATE_NAME_le'])
X_test_final = X_test_num_scal.join(X_test_cat_le['STATE_NAME_le'])

In [62]:
# Save the processed splits as CSV files (row index not written).
processed_dir = Path("../data/processed")

# Create the output folder first: to_csv fails when the target directory does
# not exist. An already existing folder is accepted as is.
processed_dir.mkdir(parents=True, exist_ok=True)

X_train_final.to_csv(processed_dir / "X_train.csv", index=False)
X_test_final.to_csv(processed_dir / "X_test.csv", index=False)
y_train.to_csv(processed_dir / "y_train.csv", index=False)
y_test.to_csv(processed_dir / "y_test.csv", index=False)

In [63]:
# Baseline model: ordinary linear regression fitted on the training split.
reg_lin = LinearRegression().fit(X_train_final, y_train)

# Predictions for the test split.
y_pred = reg_lin.predict(X_test_final)

# Test-set metrics: MSE, its square root (RMSE) and R^2.
mse_rl = mean_squared_error(y_test, y_pred)
rmse_rl = np.sqrt(mse_rl)
r2_rl = r2_score(y_test, y_pred)

for etiqueta, valor in (
    ("MSE: ", mse_rl),
    ("RMSE: ", rmse_rl),
    ("Coeficiente de determinación: ", r2_rl),
):
    print(etiqueta, valor)

MSE:  9284917.372798674
RMSE:  3047.116238806566
Coeficiente de determinación:  0.9984972012355045


In [64]:
# L1-regularised (Lasso) model with a very small penalty.
# NOTE(review): the recorded output of this cell contains a warning line coming
# from the coordinate-descent solver - presumably a ConvergenceWarning; confirm.
reg_lasso = Lasso(alpha=0.001, max_iter=8000)
reg_lasso.fit(X_train_final, y_train)

# Predictions for the test split.
y_pred = reg_lasso.predict(X_test_final)

# Test-set metrics: MSE, its square root (RMSE) and R^2.
mse_l1 = mean_squared_error(y_test, y_pred)
rmse_l1 = np.sqrt(mse_l1)
r2_l1 = r2_score(y_test, y_pred)

for etiqueta, valor in (
    ("MSE: ", mse_l1),
    ("RMSE: ", rmse_l1),
    ("Coeficiente de determinación: ", r2_l1),
):
    print(etiqueta, valor)

MSE:  7934344.769970222
RMSE:  2816.79689895637
Coeficiente de determinación:  0.9987157964859951


  model = cd_fast.enet_coordinate_descent(


El modelo Lasso es un poco mejor, ya que su error (RMSE) es de 2816, mientras que el de la regresión lineal es de 3047.

In [65]:
# L2-regularised (Ridge) model. With alpha = 1e-10 the penalty is tiny.
# NOTE(review): the recorded output of this cell contains a warning line from
# `linalg.solve` - presumably an ill-conditioned-matrix warning; confirm.
reg_ridge = Ridge(alpha=0.0000000001, max_iter=400)
reg_ridge.fit(X_train_final, y_train)

# Predictions for the test split.
y_pred = reg_ridge.predict(X_test_final)

# Test-set metrics: MSE, its square root (RMSE) and R^2.
mse_l2 = mean_squared_error(y_test, y_pred)
rmse_l2 = np.sqrt(mse_l2)
r2_l2 = r2_score(y_test, y_pred)

for etiqueta, valor in (
    ("MSE: ", mse_l2),
    ("RMSE: ", rmse_l2),
    ("Coeficiente de determinación: ", r2_l2),
):
    print(etiqueta, valor)

MSE:  9292429.909192516
RMSE:  3048.348718436346
Coeficiente de determinación:  0.998495985303261


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [66]:
# Side-by-side comparison of the three models (informational only).
# One tuple per model: (name, MSE, RMSE, R^2).
filas = [
    ("Regresión lineal", mse_rl, rmse_rl, r2_rl),
    ("Regularización Lasso", mse_l1, rmse_l1, r2_l1),
    ("Regularización Ridge", mse_l2, rmse_l2, r2_l2),
]
columnas = ('Modelo', 'MSE', 'RMSE', 'R²')

# Transpose the rows into one list per column, keyed by column name.
data = {columna: list(valores) for columna, valores in zip(columnas, zip(*filas))}

# Build the comparison table.
resultados = pd.DataFrame(data)

# Print it as plain text.
print(resultados)

                 Modelo           MSE         RMSE        R²
0      Regresión lineal  9.284917e+06  3047.116239  0.998497
1  Regularización Lasso  7.934345e+06  2816.796899  0.998716
2  Regularización Ridge  9.292430e+06  3048.348718  0.998496


In [67]:
# Lasso model with a stronger penalty (alpha = 10.0).
# NOTE(review): the notebook text says these values came from several manual
# trials; the metrics below are computed on the test split, so if the trials
# used the same split the reported error may be optimistic - confirm.
reg_lasso_1 = Lasso(alpha=10.0, max_iter=2083)
reg_lasso_1.fit(X_train_final, y_train)

# Predictions for the test split.
y_pred = reg_lasso_1.predict(X_test_final)

# Test-set metrics: MSE, its square root (RMSE) and R^2.
mse_l4 = mean_squared_error(y_test, y_pred)
rmse_l4 = np.sqrt(mse_l4)
r2_l4 = r2_score(y_test, y_pred)

for etiqueta, valor in (
    ("MSE: ", mse_l4),
    ("RMSE: ", rmse_l4),
    ("Coeficiente de determinación: ", r2_l4),
):
    print(etiqueta, valor)

MSE:  6043484.23681644
RMSE:  2458.3499012175707
Coeficiente de determinación:  0.9990218393681194


  model = cd_fast.enet_coordinate_descent(


Después de varias pruebas, lo que optimiza el modelo son los hiperparámetros alpha=10.0 y max_iter = 2083



Optimizar alpha: Usa validación cruzada para encontrar el mejor valor de alpha

In [68]:
# Cross-validated search for the Lasso penalty strength.
# GridSearchCV is already imported in the notebook's import cell, so it is not
# imported again here.

# Candidate alpha values.
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

# 5-fold cross-validation on the training split only; candidates are ranked by
# negative mean squared error (higher is better).
grid = GridSearchCV(Lasso(max_iter=2000), param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train_final, y_train)

# NOTE(review): the recorded run selects 10.0, the largest value in the grid;
# a wider grid may be worth trying.
print("Mejor alpha:", grid.best_params_['alpha'])


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Mejor alpha: 10.0


  model = cd_fast.enet_coordinate_descent(


In [69]:
# Save the model.
# `dump` is already imported from joblib in the notebook's import cell, so it
# is not imported again here.

# Serialise the fitted Lasso model (alpha = 10.0) to a file in the current
# working directory.
dump(reg_lasso_1, "Regresion_regularizada_Lasso.sav")
print("Modelo guardado correctamente.")


Modelo guardado correctamente.
