# Explore here

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

df = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv')

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
#Vamos a ver si hay valores nulos

df.isnull().sum()



age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# eliminamos duplicados. Parece que había un duplicado. 
df = df.drop_duplicates().reset_index (drop = True)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1332,50,male,30.970,3,no,northwest,10600.54830
1333,18,female,31.920,0,no,northeast,2205.98080
1334,18,female,36.850,0,no,southeast,1629.83350
1335,21,female,25.800,0,no,southwest,2007.94500


In [5]:
# Miramos los valores únicos. 
df.nunique()

#viendo los resultados solamente vamos a tener que trabajar un poco más la columna region porque los otros valores no numéricos tienes 2 posibilidades

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [6]:
#No hacer caso a esto
'''
print(df[df['region'] == 'northwest']['charges'].mean())
print(df[df['region'] == 'southwest']['charges'].mean())
print(df[df['region'] == 'northeast']['charges'].mean())
print(df[df['region'] == 'southeast']['charges'].mean())
'''

"\nprint(df[df['region'] == 'northwest']['charges'].mean())\nprint(df[df['region'] == 'southwest']['charges'].mean())\nprint(df[df['region'] == 'northeast']['charges'].mean())\nprint(df[df['region'] == 'southeast']['charges'].mean())\n"

In [7]:
# coinvertimos las variables en numéricas. Vamos a usar, esta ve la función de pandas factorize() en lugar de labelencoder

df['sex_n'] = pd.factorize(df['sex'])[0]
df['smoker_n'] = pd.factorize(df['smoker'])[0]
df['region_n'] = pd.factorize(df['region'])[0]

#generamos una lista con los títulos de las variables porque luego nos vana a vernir bien

variables_n= ["age", "bmi", "children", "sex_n", "smoker_n", "region_n", "charges"]

In [8]:
# aplicamos ahora minmax scaler

escalador = MinMaxScaler()
variables_escaladas = escalador.fit_transform(df[variables_n])
df_escalado = pd.DataFrame(variables_escaladas, index = df.index, columns = variables_n)

df_escalado


Unnamed: 0,age,bmi,children,sex_n,smoker_n,region_n,charges
0,0.021739,0.321227,0.0,0.0,0.0,0.000000,0.251611
1,0.000000,0.479150,0.2,1.0,1.0,0.333333,0.009636
2,0.217391,0.458434,0.6,1.0,1.0,0.333333,0.053115
3,0.326087,0.181464,0.0,1.0,1.0,0.666667,0.333010
4,0.304348,0.347592,0.0,1.0,1.0,0.666667,0.043816
...,...,...,...,...,...,...,...
1332,0.695652,0.403820,0.6,1.0,1.0,0.666667,0.151299
1333,0.000000,0.429379,0.0,0.0,1.0,1.000000,0.017305
1334,0.000000,0.562012,0.0,0.0,1.0,0.333333,0.008108
1335,0.065217,0.264730,0.0,0.0,1.0,0.000000,0.014144


In [9]:
X = df_escalado.drop('charges', axis=1)
y = df_escalado['charges']

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# una vez tenemos separadas las variables y los distintos conjuntos de datos, vamos a elegir las variables que son más represntativas. 

seleccion_variables = SelectKBest(f_regression, k = 4)

seleccion_variables.fit(X_train, y_train)

variables_elegidas = X_train.columns[seleccion_variables.get_support()]

X_train_sel = pd.DataFrame(seleccion_variables.transform(X_train), columns = variables_elegidas)
X_test_sel = pd.DataFrame(seleccion_variables.transform(X_test), columns = variables_elegidas)

X_train_sel['charges'] = y_train.values
X_test_sel["charges"] = y_test.values


In [19]:
variables_elegidas

Index(['age', 'bmi', 'children', 'smoker_n'], dtype='object')

In [10]:
train_data = X_train_sel.copy()
test_data = X_test_sel.copy()



In [11]:
X_train = train_data.drop(["charges"], axis = 1)
y_train = train_data["charges"]
X_test = test_data.drop(["charges"], axis = 1)
y_test = test_data["charges"]

In [12]:
# voy a hacer pimero el modelo de Ordinary Least Squares para ver una foto

X = sm.add_constant(X_train)
modelo2 = sm.OLS(y_train, X_train)

result = modelo2.fit()
print(result.summary())

# vemos en este caso que el modelo no nos devuelve el intercepto. 

                                 OLS Regression Results                                
Dep. Variable:                charges   R-squared (uncentered):                   0.754
Model:                            OLS   Adj. R-squared (uncentered):              0.753
Method:                 Least Squares   F-statistic:                              815.5
Date:                Mon, 20 Jan 2025   Prob (F-statistic):                   2.52e-322
Time:                        19:19:13   Log-Likelihood:                          646.15
No. Observations:                1069   AIC:                                     -1284.
Df Residuals:                    1065   BIC:                                     -1264.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
# Con los datos seleccionados ye escalados, pasamos a aplicar el modelo de regresión lineal

modelo = LinearRegression()
modelo.fit(X_train, y_train)

In [18]:
print (modelo.intercept_)
print(modelo.coef_)

# podemos ver el intercepto del modelo y los coeficientes que acompañarían a cada una de las 4 variables independientes que hemos considerado. 


0.3195827183308946
[ 0.1829699   0.18111814  0.04293572 -0.36780589]


In [16]:
y_pred = modelo.predict(X_test)
y_pred

array([ 0.10702315,  0.07685187,  0.2083241 ,  0.48685684,  0.12995459,
        0.19726353,  0.45755859,  0.00832554,  0.1556988 ,  0.16287349,
        0.14781084,  0.51037008,  0.4699569 ,  0.25944432,  0.14442757,
        0.13842465,  0.05134994,  0.49351746,  0.03848664,  0.0705325 ,
        0.04276279,  0.45855586,  0.22245537,  0.47182868,  0.47460549,
        0.06698449,  0.55116209,  0.5686984 ,  0.15588314,  0.20127518,
        0.0770878 ,  0.19051001, -0.00377298,  0.17967676,  0.61602286,
        0.18050297,  0.06028104,  0.0447116 ,  0.48287007,  0.13064786,
        0.08387671,  0.45610213,  0.54228697,  0.17124485,  0.09976616,
        0.03950023,  0.07028081,  0.12772953,  0.04874519,  0.13229562,
        0.09175793,  0.1653832 ,  0.47297483,  0.04437078,  0.16108797,
        0.14414371,  0.14902535,  0.02014774,  0.48426463,  0.13172362,
        0.2337337 ,  0.11691   ,  0.18103216,  0.00130779,  0.25281818,
        0.14612565,  0.14268127,  0.47671561,  0.38076578,  0.23

In [17]:
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2 Score: {r2_score(y_test, y_pred)}")

# vemos que el modelo tiene un error cuadrático medio bajo y que explicaría el 80,4% de la variabilidad de los datos. No es perfecto pero está bastante bien. 



MSE: 0.009150574110332004
R2 Score: 0.8045531086669286
