# MODEL SELECTION

In [2]:
import pandas as pd
import statsmodels.api as sm

In [3]:
data = pd.read_csv("insurance.csv")
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Función GET DUMMIES: cambia los datos que tienen forma de carácter a 0 y 1 cuando solo son dos tipos de datos.

In [4]:
cleaned_data = pd.get_dummies(data, drop_first = True)
cleaned_data

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


## Ahora que los datos están limpios renombraremos columnas para guardar en "cleaned_data".

In [5]:
cleaned_data = cleaned_data.rename(columns = {'sex_male':'sex','smoker_yes':'smoker','region_northwest':'N.W.','region_southeast':'S.E.','region_southwest':'S.W.'})

## Exportar los datos limpios

In [6]:
cleaned_data.to_csv('insurance_cleaned.csv', index = False)

In [7]:
data_2 = pd.read_csv("insurance_cleaned.csv")
data_2

Unnamed: 0,age,bmi,children,charges,sex,smoker,N.W.,S.E.,S.W.
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


## Ahora usaremos los métodos de statsmodels para ver como se comportan las demás variables del modelo (importar librería).

In [8]:
predictors = ['age','bmi','children','sex','smoker','N.W.','S.E.','S.W.']
outcome = 'charges'

# RECORDAR QUE EN STATSMODELS VA PRIMERO EL OUTCOME!!!
insurance_full_lm = sm.OLS(cleaned_data[outcome],cleaned_data[predictors])
results = insurance_full_lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,charges,R-squared (uncentered):,0.874
Model:,OLS,Adj. R-squared (uncentered):,0.874
Method:,Least Squares,F-statistic:,1158.0
Date:,"Thu, 17 Nov 2022",Prob (F-statistic):,0.0
Time:,02:18:12,Log-Likelihood:,-13618.0
No. Observations:,1338,AIC:,27250.0
Df Residuals:,1330,BIC:,27290.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,203.0019,11.619,17.471,0.000,180.208,225.796
bmi,69.2858,18.815,3.683,0.000,32.376,106.196
children,289.0531,144.213,2.004,0.045,6.144,571.962
sex,-665.6124,347.528,-1.915,0.056,-1347.376,16.151
smoker,2.337e+04,433.137,53.966,0.000,2.25e+04,2.42e+04
N.W.,-1736.0707,486.883,-3.566,0.000,-2691.213,-780.928
S.E.,-1281.8646,503.664,-2.545,0.011,-2269.926,-293.803
S.W.,-1950.0825,495.875,-3.933,0.000,-2922.864,-977.301

0,1,2,3
Omnibus:,275.143,Durbin-Watson:,2.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,642.991
Skew:,1.123,Prob(JB):,2.38e-140
Kurtosis:,5.548,Cond. No.,206.0


## Por último, eliminaremos las variables que no aportan al modelo en base a "t" y "p".

In [9]:
predictors = ['age','bmi','children','smoker']
outcome = 'charges'

insurance_full_lm = sm.OLS(cleaned_data[outcome],cleaned_data[predictors])
results = insurance_full_lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,charges,R-squared (uncentered):,0.872
Model:,OLS,Adj. R-squared (uncentered):,0.872
Method:,Least Squares,F-statistic:,2277.0
Date:,"Thu, 17 Nov 2022",Prob (F-statistic):,0.0
Time:,02:18:16,Log-Likelihood:,-13629.0
No. Observations:,1338,AIC:,27270.0
Df Residuals:,1334,BIC:,27290.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,197.6732,11.589,17.058,0.000,174.939,220.407
bmi,28.0102,15.948,1.756,0.079,-3.276,59.296
children,240.5490,144.750,1.662,0.097,-43.414,524.512
smoker,2.331e+04,433.801,53.732,0.000,2.25e+04,2.42e+04

0,1,2,3
Omnibus:,277.987,Durbin-Watson:,2.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,640.617
Skew:,1.14,Prob(JB):,7.7899999999999995e-140
Kurtosis:,5.509,Cond. No.,126.0


In [10]:
predictors = ['age','smoker']
outcome = 'charges'

insurance_full_lm = sm.OLS(cleaned_data[outcome],cleaned_data[predictors])
results = insurance_full_lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,charges,R-squared (uncentered):,0.872
Model:,OLS,Adj. R-squared (uncentered):,0.871
Method:,Least Squares,F-statistic:,4531.0
Date:,"Thu, 17 Nov 2022",Prob (F-statistic):,0.0
Time:,02:19:40,Log-Likelihood:,-13633.0
No. Observations:,1338,AIC:,27270.0
Df Residuals:,1336,BIC:,27280.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,222.5144,4.657,47.781,0.000,213.379,231.650
smoker,2.348e+04,428.588,54.785,0.000,2.26e+04,2.43e+04

0,1,2,3
Omnibus:,271.407,Durbin-Watson:,2.073
Prob(Omnibus):,0.0,Jarque-Bera (JB):,629.573
Skew:,1.111,Prob(JB):,1.95e-137
Kurtosis:,5.521,Cond. No.,101.0
