# Explore here

In [11]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# modelado (install scikit-learn)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


# metricas
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

# optimizar
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [12]:
archivo="https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv"

df_original = pd.read_csv(archivo, sep=",")
df = pd.read_csv(archivo, sep=",")

datos,atributos=df.shape
print(f"Contamos con {datos} cantidad de datos y {atributos} atributos.")
print()

# Valores duplicados

duplicados = df.duplicated()
num_duplicados = duplicados.sum()

print(f"Contamos con {num_duplicados} dato duplicado.")
print()

# Obtener información sobre tipos de datos y valores no nulos
df.info()

df.isnull().sum().sort_values(ascending=False)

Contamos con 3140 cantidad de datos y 108 atributos.

Contamos con 0 dato duplicado.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 108 entries, fips to Urban_rural_code
dtypes: float64(61), int64(45), object(2)
memory usage: 2.6+ MB


fips                      0
TOT_POP                   0
0-9                       0
0-9 y/o % of total pop    0
19-Oct                    0
                         ..
CKD_prevalence            0
CKD_Lower 95% CI          0
CKD_Upper 95% CI          0
CKD_number                0
Urban_rural_code          0
Length: 108, dtype: int64

- Contamos con 3.140 datos y 108 atributos.
- No tenemos datos duplicados
- Ninguno de los atributos posee valores nulos.
- 106 características son numéricas y 2 características son categóricas. 

### Objetivo: 

Vamos a analizar la población con obesidad. Nuestra variable objetivo será "Obesity_number"

In [13]:
# Seleccionar sólo columnas categóricas
object_columns = df.select_dtypes(include=['object'])

object_columns.head()


Unnamed: 0,COUNTY_NAME,STATE_NAME
0,Autauga,Alabama
1,Baldwin,Alabama
2,Barbour,Alabama
3,Bibb,Alabama
4,Blount,Alabama


In [14]:
# Seleccionar sólo columnas numéricas
numerical_columns = df.select_dtypes(include=['number'])

numerical_columns.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [15]:
# Dividimos el conjunto de datos en muestras de train y test

X = df.drop("Obesity_number", axis = 1)
y = df["Obesity_number"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 35)

print(X_train.shape, X_test.shape)

(2512, 107) (628, 107)


Escalado de variables numéricas

In [16]:
num_variables=X_train.select_dtypes(include=['number']).columns

# instancio el escalador
scaler = StandardScaler()

# entreno el escalador con los datos de entrenamiento
scaler.fit(X_train[num_variables])

# aplico el escalador en ambos
X_train_num_scal = scaler.transform(X_train[num_variables])
X_train_num_scal = pd.DataFrame(X_train_num_scal, index = X_train.index, columns = num_variables)

X_test_num_scal = scaler.transform(X_test[num_variables])
X_test_num_scal = pd.DataFrame(X_test_num_scal, index = X_test.index, columns = num_variables)

X_train_num_scal.head()


Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
3131,1.706936,-0.22228,-0.223113,-0.311093,-0.231635,-0.46211,-0.226141,-0.568788,-0.210583,-0.00694,...,-0.262562,-1.276346,-1.293295,-1.25877,-0.23523,-0.786316,-0.774211,-0.67017,-0.225324,0.91331
2321,0.976745,0.330532,0.369504,0.596449,0.360215,0.353559,0.323789,0.607968,0.337879,1.449402,...,0.357709,-0.250427,-0.190255,-0.339935,0.362788,-0.961986,-0.963286,-1.159266,0.283617,-1.069035
945,-0.673776,-0.294861,-0.28541,1.006543,-0.29701,0.994603,-0.276112,-0.064144,-0.270178,-1.161556,...,-0.348837,0.006053,-7.6e-05,0.013463,-0.293867,-0.083639,-0.206988,-0.018043,-0.299584,0.91331
1275,-0.27772,0.244433,0.166354,-0.614021,0.255356,0.174289,0.130666,-0.336571,0.140531,-0.352757,...,0.319601,-1.532826,-1.52151,-1.506148,0.124614,-1.313325,-1.341435,-1.322298,0.191689,-1.729817
828,-0.74246,-0.258187,-0.25878,-0.740248,-0.262436,-0.064785,-0.250352,-0.45805,-0.240841,-0.286238,...,-0.304998,-0.726746,-0.760793,-0.657993,-0.258627,-0.434978,-0.396063,-0.344106,-0.259303,0.91331


In [17]:
cat_variables = ["COUNTY_NAME",	"STATE_NAME"]

X_train_cat_le = X_train.copy()
X_test_cat_le = X_test.copy()

# instancio el encoder
label_encoder_cn = LabelEncoder()
label_encoder_sn = LabelEncoder()

# entreno el encoder con los datos de entrenamiento
label_encoder_cn.fit(X_train['COUNTY_NAME'])
label_encoder_sn.fit(X_train['STATE_NAME'])

# aplico el encoder en ambos
X_train_cat_le['COUNTY_NAME_le'] = label_encoder_cn.transform(X_train['COUNTY_NAME'])
X_train_cat_le['STATE_NAME_le'] = label_encoder_sn.transform(X_train['STATE_NAME'])

X_test_cat_le['COUNTY_NAME_le'] = label_encoder_cn.transform(X_test['COUNTY_NAME'])
X_test_cat_le['STATE_NAME_le'] = label_encoder_sn.transform(X_test['STATE_NAME'])

X_train_cat_le.head()

ValueError: y contains previously unseen labels: 'Real'

In [18]:
# Crear copias de los datos originales
X_train_cat_le = X_train.copy()
X_test_cat_le = X_test.copy()

# Crear un diccionario para almacenar un LabelEncoder por columna
encoders = {}

# Lista de columnas categóricas
categorical_columns = ["COUNTY_NAME","STATE_NAME"]

# Aplicar el LabelEncoder a cada columna categórica de X_train
for col in categorical_columns:
    encoder = LabelEncoder()  # Crear un nuevo LabelEncoder para cada columna
    X_train_cat_le[f'{col}_le'] = encoder.fit_transform(X_train[col])  # Codificar y agregar como nueva columna
    X_test_cat_le[f'{col}_le'] = encoder.transform(X_test[col])  # Aplicar la misma transformación a X_test
    encoders[col] = encoder  # Guardar el encoder por si lo necesitas más tarde

# Mostrar las primeras filas del DataFrame de entrenamiento transformado
X_train_cat_le.head()

ValueError: y contains previously unseen labels: 'Real'