# Regresión Lineal Regularizada (Regularized Linear Regression)

Librería

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

Scaler

In [2]:
def scaler(X_train_, X_test_, X):
  X_train = X_train_.copy()
  X_test = X_test_.copy()
  scaler = StandardScaler()
  # Train
  X_train_scaled = pd.DataFrame(
             scaler.fit_transform(X_train[X]),
             columns=scaler.get_feature_names_out(),
             index = X_train.index)
  X_train_scaled = X_train_scaled.join(X_train[list(set(X_train.columns)  - set(X))])
  # Test
  X_test_scaled = pd.DataFrame(
      scaler.transform(X_test[X]),
      columns = scaler.get_feature_names_out(),
      index = X_test.index)
  X_test_scaled = X_test_scaled.join(X_test[list(set(X_test.columns)  - set(X))])
  X_test_scaled = X_test_scaled[X_train_scaled.columns]
  return X_train_scaled, X_test_scaled

ElasticNet

In [3]:
def Elastic_gridcv(X_train, y_train):
    model =  ElasticNet(random_state=42)
    hyperparams = {"alpha" :  [0.0001, 0.01, 1, 10],
                   "l1_ratio" :  np.linspace(0,1,35),
                   "max_iter": [5, 10, 50, 100, 150],
                   "selection": ['cyclic', 'random'],
                   "tol": [1e-3, 1e-5, 1e-7, 1e-10],}
    cv = KFold(n_splits=5, shuffle=True, random_state=42) # replicables...
    grid_search = GridSearchCV(estimator=model,
                               param_grid=hyperparams,
                               cv=cv,
                               scoring= 'neg_mean_absolute_error',)
    grid_result = grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

Data

In [4]:
url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
df = pd.read_csv(url)
df.head(5)

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


Eliminar valores nulos

In [5]:
df.isnull().value_counts()

fips   TOT_POP  0-9    0-9 y/o % of total pop  19-Oct  10-19 y/o % of total pop  20-29  20-29 y/o % of total pop  30-39  30-39 y/o % of total pop  40-49  40-49 y/o % of total pop  50-59  50-59 y/o % of total pop  60-69  60-69 y/o % of total pop  70-79  70-79 y/o % of total pop  80+    80+ y/o % of total pop  White-alone pop  % White-alone  Black-alone pop  % Black-alone  Native American/American Indian-alone pop  % NA/AI-alone  Asian-alone pop  % Asian-alone  Hawaiian/Pacific Islander-alone pop  % Hawaiian/PI-alone  Two or more races pop  % Two or more races  POP_ESTIMATE_2018  N_POP_CHG_2018  GQ_ESTIMATES_2018  R_birth_2018  R_death_2018  R_NATURAL_INC_2018  R_INTERNATIONAL_MIG_2018  R_DOMESTIC_MIG_2018  R_NET_MIG_2018  Less than a high school diploma 2014-18  High school diploma only 2014-18  Some college or associate's degree 2014-18  Bachelor's degree or higher 2014-18  Percent of adults with less than a high school diploma 2014-18  Percent of adults with a high school diploma only

Eliminar variables no necesarias

In [6]:
df.drop(columns = ['COUNTY_NAME', 'STATE_NAME'], inplace=True)

Verificar data

In [7]:
df.head(5)

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


imprime var

In [9]:
for var in df.columns:
  print(var)

fips
TOT_POP
0-9
0-9 y/o % of total pop
19-Oct
10-19 y/o % of total pop
20-29
20-29 y/o % of total pop
30-39
30-39 y/o % of total pop
40-49
40-49 y/o % of total pop
50-59
50-59 y/o % of total pop
60-69
60-69 y/o % of total pop
70-79
70-79 y/o % of total pop
80+
80+ y/o % of total pop
White-alone pop
% White-alone
Black-alone pop
% Black-alone
Native American/American Indian-alone pop
% NA/AI-alone
Asian-alone pop
% Asian-alone
Hawaiian/Pacific Islander-alone pop
% Hawaiian/PI-alone
Two or more races pop
% Two or more races
POP_ESTIMATE_2018
N_POP_CHG_2018
GQ_ESTIMATES_2018
R_birth_2018
R_death_2018
R_NATURAL_INC_2018
R_INTERNATIONAL_MIG_2018
R_DOMESTIC_MIG_2018
R_NET_MIG_2018
Less than a high school diploma 2014-18
High school diploma only 2014-18
Some college or associate's degree 2014-18
Bachelor's degree or higher 2014-18
Percent of adults with less than a high school diploma 2014-18
Percent of adults with a high school diploma only 2014-18
Percent of adults completing some college 

data

In [10]:
df[['anycondition_prevalence','COPD_prevalence','% Two or more races','% Hawaiian/PI-alone','% Asian-alone','% NA/AI-alone','% Black-alone','% White-alone','80+ y/o % of total pop','diabetes_prevalence', 'Heart disease_prevalence']].corr()

Unnamed: 0,anycondition_prevalence,COPD_prevalence,% Two or more races,% Hawaiian/PI-alone,% Asian-alone,% NA/AI-alone,% Black-alone,% White-alone,80+ y/o % of total pop,diabetes_prevalence,Heart disease_prevalence
anycondition_prevalence,1.0,0.8107,-0.194781,-0.06139,-0.446036,0.073028,0.313323,-0.211981,0.138,0.850219,0.784088
COPD_prevalence,0.8107,1.0,-0.166834,-0.06442,-0.409859,0.052143,0.139383,-0.056862,0.163229,0.789044,0.886726
% Two or more races,-0.194781,-0.166834,1.0,0.412133,0.426943,0.343234,-0.0899,-0.275984,-0.138806,-0.13242,-0.14751
% Hawaiian/PI-alone,-0.06139,-0.06442,0.412133,1.0,0.236267,0.003219,-0.023472,-0.120163,0.147859,-0.022165,-0.029712
% Asian-alone,-0.446036,-0.409859,0.426943,0.236267,1.0,-0.022794,0.032083,-0.246722,-0.23179,-0.310418,-0.450503
% NA/AI-alone,0.073028,0.052143,0.343234,0.003219,-0.022794,1.0,-0.111618,-0.402107,-0.145635,0.122774,0.102282
% Black-alone,0.313323,0.139383,-0.0899,-0.023472,0.032083,-0.111618,1.0,-0.828582,-0.211917,0.4748,0.0647
% White-alone,-0.211981,-0.056862,-0.275984,-0.120163,-0.246722,-0.402107,-0.828582,1.0,0.301321,-0.410392,-0.011185
80+ y/o % of total pop,0.138,0.163229,-0.138806,0.147859,-0.23179,-0.145635,-0.211917,0.301321,1.0,0.160768,0.472387
diabetes_prevalence,0.850219,0.789044,-0.13242,-0.022165,-0.310418,0.122774,0.4748,-0.410392,0.160768,1.0,0.831021


Definir "X" (train) y "y" (test)

In [11]:
X = df[['anycondition_prevalence','COPD_prevalence','% Asian-alone','% Black-alone','% White-alone','Heart disease_prevalence']]
y = df['diabetes_prevalence']

Variables numéricas

In [12]:
nums = ['anycondition_prevalence','COPD_prevalence','% Asian-alone','% Black-alone','% White-alone','Heart disease_prevalence']

establecer los grupos de Train y test

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=123)
X_train_ml, X_test_ml = scaler(X_train, X_test, nums)

Entrenar el modelo (ElasticNet)

In [14]:
mde=Elastic_gridcv(X_train_ml, y_train)
preds = mde.predict(X_test_ml)
mean_squared_error(y_test, preds)

Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sk

0.6663813156422224

In [15]:
mde

0,1,2
,alpha,0.0001
,l1_ratio,np.float64(0.0)
,fit_intercept,True
,precompute,False
,max_iter,10
,copy_X,True
,tol,0.001
,warm_start,False
,positive,False
,random_state,42


In [16]:
print(mean_absolute_error(y_test, preds))

0.5917328256490048
