In [1]:
import re
import pandas as pd
import numpy as np
from tableone import TableOne
from scipy import stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Location of the raw medical-insurance-cost CSV (4GeeksAcademy tutorial repository).
url = (
    'https://raw.githubusercontent.com/4GeeksAcademy/'
    'linear-regression-project-tutorial/main/'
    'medical_insurance_cost.csv'
)

In [3]:
# Download the raw dataset straight from the remote CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)
# Bare last expression: the notebook renders the frame as a rich table.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# Rename 'sex' to 'female' (in place) so the column name states which level
# the 0/1 indicator built in the next cell stands for.
df.rename(mapper={'sex': 'female'}, axis='columns', inplace=True)

In [5]:
# Encode the categorical predictors as 0/1 indicator columns.
#
# The whole step is guarded on the presence of the raw 'region' column so that
# re-running this cell is a no-op. Without the guard, a second run would compare
# the already-encoded 0/1 'female'/'smoker' columns against the strings
# 'female'/'yes', silently overwrite both with zeros, and only then fail with a
# KeyError on the missing 'region' column.
if 'region' in df.columns:
    # drop_first=True leaves out the first region level ('northeast' for this
    # dataset), which becomes the reference category and avoids perfect
    # collinearity with the intercept.
    region_dummies = pd.get_dummies(df['region'], drop_first=True, dtype=int)

    df = (
        df.assign(
            female=np.where(df['female'] == 'female', 1, 0),
            smoker=np.where(df['smoker'] == 'yes', 1, 0),
        )
        .join(region_dummies)
        # Explicit column order with the target last; this selection also
        # discards the original text 'region' column.
        .loc[:, ['age', 'female', 'bmi', 'children', 'smoker',
                 'northwest', 'southeast', 'southwest', 'charges']]
    )

In [6]:
# Pairwise Pearson correlations between every (now fully numeric) column,
# including the target 'charges'.
df.corr(method='pearson')

Unnamed: 0,age,female,bmi,children,smoker,northwest,southeast,southwest,charges
age,1.0,0.020856,0.109272,0.042469,-0.025019,-0.000407,-0.011642,0.010016,0.299008
female,0.020856,1.0,-0.046371,-0.017163,-0.076185,0.011156,-0.017117,0.004184,-0.057292
bmi,0.109272,-0.046371,1.0,0.012759,0.00375,-0.135996,0.270025,-0.006205,0.198341
children,0.042469,-0.017163,0.012759,1.0,0.007673,0.024806,-0.023066,0.021914,0.067998
smoker,-0.025019,-0.076185,0.00375,0.007673,1.0,-0.036945,0.068498,-0.036945,0.787251
northwest,-0.000407,0.011156,-0.135996,0.024806,-0.036945,1.0,-0.346265,-0.320829,-0.039905
southeast,-0.011642,-0.017117,0.270025,-0.023066,0.068498,-0.346265,1.0,-0.346265,0.073982
southwest,0.010016,0.004184,-0.006205,0.021914,-0.036945,-0.320829,-0.346265,1.0,-0.04321
charges,0.299008,-0.057292,0.198341,0.067998,0.787251,-0.039905,0.073982,-0.04321,1.0


In [7]:
def Xy(df, target):
    """
    Split a DataFrame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the predictors and the target column.
    target : str
        Label of the column to predict.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df`` except ``target``.
    y : pandas.Series
        A copy of the ``target`` column with its original dtype.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Keep the target's own dtype: casting to int would truncate a continuous
    # target (e.g. 16884.924 -> 16884) and raise on missing values.
    y = df[target].copy()
    X = df.drop(columns=[target])
    return X, y
# Separate the predictors from the 'charges' target.
X, y = Xy(df=df, target='charges')

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffled
# partition reproducible across runs.
split_options = dict(test_size=0.2, shuffle=True, random_state=1245)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Persist the four partitions as CSV files (index included) for reuse.
# NOTE(review): the target directory is an absolute path tied to the original
# workspace and must already exist; to_csv does not create it.
data_dir = '/workspaces/Linear-Regression-Project/data'
splits = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for split_name, split in splits:
    split.to_csv(f'{data_dir}/{split_name}.csv')

In [10]:
# Ordinary least squares on the raw scale: charges regressed on all predictors.
# NOTE(review): this is fitted on the full frame `df`, not on the training split.
linear_terms = ['age', 'female', 'bmi', 'children', 'smoker',
                'northwest', 'southeast', 'southwest']
linear_formula = 'charges ~ ' + ' + '.join(linear_terms)
results = smf.ols(formula=linear_formula, data=df).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.8
Date:                Mon, 23 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:28:45   Log-Likelihood:                -13548.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1329   BIC:                         2.716e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1.207e+04    999.649    -12.074      0.0

In [11]:
# Same specification with a log-transformed response and log-transformed bmi;
# the np.log calls are evaluated inside the formula.
# NOTE(review): this is fitted on the full frame `df`, not on the training split.
log_terms = ['age', 'female', 'np.log(bmi)', 'children', 'smoker',
             'northwest', 'southeast', 'southwest']
log_formula = 'np.log(charges) ~ ' + ' + '.join(log_terms)
results = smf.ols(formula=log_formula, data=df).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:        np.log(charges)   R-squared:                       0.769
Model:                            OLS   Adj. R-squared:                  0.767
Method:                 Least Squares   F-statistic:                     551.8
Date:                Mon, 23 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:28:45   Log-Likelihood:                -806.62
No. Observations:                1338   AIC:                             1631.
Df Residuals:                    1329   BIC:                             1678.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       5.9403      0.212     28.029      

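Otra forma: ajustar la regresión lineal con scikit-learn (`LinearRegression`) sobre la partición de entrenamiento y evaluarla en la de prueba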

In [12]:
# Fit a scikit-learn linear regression on the training partition only.
model = LinearRegression()
# fit() returns the estimator itself, so the notebook displays it as the cell output.
model.fit(X=X_train, y=y_train)

In [13]:
# Predict on the held-out rows.
y_pred = model.predict(X=X_test)
y_pred

# Score the predictions against the true held-out targets.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print(f"Mean squared error: {test_mse}")
print(f"Coefficient of determination: {test_r2}")

Mean squared error: 34668536.230701566
Coefficient of determination: 0.7510130326381084
