# Explore here

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import ast

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv",sep=",")
df.head(5)


Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [3]:
df.dtypes

fips                        int64
TOT_POP                     int64
0-9                         int64
0-9 y/o % of total pop    float64
19-Oct                      int64
                           ...   
CKD_prevalence            float64
CKD_Lower 95% CI          float64
CKD_Upper 95% CI          float64
CKD_number                  int64
Urban_rural_code            int64
Length: 108, dtype: object

In [4]:
# Eliminar las columnas 'diabetes_Lower 95% CI' y 'diabetes_Upper 95% CI'
df = df.drop(columns=['diabetes_Lower 95% CI', 'diabetes_Upper 95% CI'])

In [6]:
columnas_num = df.select_dtypes(include=[np.number]).columns.tolist()
columnas_num = [col for col in columnas_num if col != 'diabetes_number']

scaler = StandardScaler()
nor_feat = scaler.fit_transform(df[columnas_num])

# Create a new DataFrame with the scaled numerical variables
df_scal = pd.DataFrame(nor_feat, index = df.index, columns = columnas_num)
df_scal["diabetes_number"] = df["diabetes_number"]
df_scal.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_Lower 95% CI,COPD_Upper 95% CI,COPD_number,diabetes_prevalence,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code,diabetes_number
0,-1.940874,-0.145679,-0.142421,0.158006,-0.135556,0.573496,-0.153144,0.02761,-0.139384,0.588469,...,-0.256304,-0.206071,-0.1389,-0.063696,-0.609615,-0.582796,-0.669652,-0.147523,-1.082865,5462
1,-1.940742,0.341296,0.287476,-0.242861,0.320383,-0.193107,0.183774,-0.469965,0.23062,-0.1103,...,-0.304203,-0.129545,0.563986,-0.394103,-0.433549,-0.393279,-0.343373,0.389791,-0.420704,20520
2,-1.94061,-0.237785,-0.239429,-0.419441,-0.246181,-0.439718,-0.225971,0.272104,-0.218759,0.656538,...,1.372252,1.094865,-0.219763,2.432709,1.855312,1.880929,1.777443,-0.204321,0.903618,3870
3,-1.940478,-0.245223,-0.246032,-0.426966,-0.254791,-0.609076,-0.230792,0.396168,-0.220555,1.264959,...,0.462177,0.329609,-0.256918,0.376846,-0.257483,-0.203761,-0.180233,-0.2421,-1.745026,2511
4,-1.940346,-0.138966,-0.135053,0.186249,-0.13714,0.216679,-0.155888,-0.200808,-0.14357,0.088582,...,0.605873,0.597448,-0.074198,0.156575,-0.081417,-0.014244,-0.017093,-0.124105,-1.745026,6017


In [7]:
# Dividir los datos en entrenamiento y prueba
X = df_scal.drop(columns=["diabetes_number"])
y = df_scal["diabetes_number"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [10]:
yhat_train = linear_model.predict(X_train)
yhat_test = linear_model.predict(X_test)

In [11]:
def get_metrics(yhat, y_test, yhat_train, y_train):
  metrics_train = r2_score(y_train, yhat_train), median_absolute_error(y_train, yhat_train), mean_absolute_percentage_error(y_train, yhat_train)*100
  metrics_test = r2_score(y_test, yhat), median_absolute_error(y_test, yhat), mean_absolute_percentage_error(y_test, yhat)*100
  metrics_diff = list(map(lambda x: x[1]-x[0], zip(metrics_train, metrics_test)))
  return pd.DataFrame(data=[metrics_train, metrics_test, metrics_diff], columns=['R2', 'Median AE', 'MAPE'], index=['Train set', 'Test set', 'Diferencia'])

In [12]:
get_metrics(yhat_test, y_test, yhat_train, y_train)

Unnamed: 0,R2,Median AE,MAPE
Train set,0.999036,277.714293,38.091603
Test set,0.995692,290.429987,35.318467
Diferencia,-0.003344,12.715694,-2.773136


In [20]:
lasso_model = Lasso(alpha = 1.0)

# Training the model
lasso_model.fit(X_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [23]:
yhat_train_lasso = lasso_model.predict(X_train)
yhat_lasso = lasso_model.predict(X_test)

In [22]:
print(f'El porcentaje de variables canceladas es: {round(len(lasso_model.coef_[np.abs(lasso_model.coef_)==0])/len(lasso_model.coef_)*100,2)}%')
print(f'El modelo utiliza {len(lasso_model.coef_[np.abs(lasso_model.coef_)>0])} variables.')

El porcentaje de variables canceladas es: 11.65%
El modelo utiliza 91 variables.


In [24]:
get_metrics(yhat_lasso, y_test, yhat_train_lasso, y_train)

Unnamed: 0,R2,Median AE,MAPE
Train set,0.998746,292.662687,40.698263
Test set,0.995443,312.607681,41.267118
Diferencia,-0.003303,19.944994,0.568854
