# Model Analysis

# Step 1. Get the data

In [19]:
import pandas as pd

In [20]:
from src.constants import X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH

# load the previously processed train/test splits from their corresponding CSV files
# (read order: X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH)
)

In [21]:
# sanity check: preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,0.472227,0.0,-1.756525,0.734336,1.0,2.0
1,0.543313,0.0,-1.033082,-0.911192,1.0,3.0
2,0.898745,0.0,-0.943687,-0.911192,1.0,1.0
3,-0.025379,0.0,0.622393,3.202629,1.0,1.0
4,1.040918,0.0,-1.504893,1.5571,1.0,2.0


In [22]:
# sanity check: preview the first five rows of the test features
X_test.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,0.458596,0.0,-0.937152,0.822509,1.0,3.0
1,-0.187133,0.0,-0.167527,-0.89962,1.0,2.0
2,1.821801,0.0,-0.665519,-0.89962,0.0,2.0
3,0.530343,1.0,-0.846608,1.683574,1.0,2.0
4,-1.406842,1.0,0.134287,-0.89962,0.0,2.0


In [23]:
# sanity check: preview the first five rows of the training target
y_train.head(n=5)

Unnamed: 0,charges
0,9193.8385
1,8534.6718
2,27117.99378
3,8596.8278
4,12475.3513


In [24]:
# sanity check: preview the first five rows of the test target
y_test.head(n=5)

Unnamed: 0,charges
0,9095.06825
1,5272.1758
2,29330.98315
3,9301.89355
4,33750.2918


# Option 1. Model with all the features

## Step 1: Initialization and training of the model

In [25]:
from sklearn.linear_model import LinearRegression

# linear regression fitted on the current training features
model = LinearRegression()
# the target frame's values are flattened to 1-D before fitting
model.fit(X=X_train, y=y_train.to_numpy().ravel())

## Step 2: Model prediction

In [26]:
# predict the target for the test features, then display the predictions
y_pred = model.predict(X=X_test)
y_pred

array([ 9.08521280e+03,  7.15018118e+03,  3.70524385e+04,  9.68311549e+03,
        2.69807765e+04,  1.06646845e+04,  3.21501279e+02,  1.69364233e+04,
        1.24433906e+03,  1.14103229e+04,  2.82038566e+04,  9.50203680e+03,
        5.36910209e+03,  3.84097055e+04,  4.02465877e+04,  3.71255863e+04,
        1.52204653e+04,  3.59552264e+04,  9.28352376e+03,  3.16040469e+04,
        3.98746422e+03,  1.01259756e+04,  2.30065276e+03,  6.94188729e+03,
        1.15049101e+04,  1.29485554e+04,  1.44710990e+04,  6.27222981e+03,
        1.00632358e+04,  2.22871898e+03,  9.19293985e+03,  1.31801357e+04,
        4.54983678e+03,  3.68362856e+03,  4.43278487e+03,  1.28903819e+04,
        1.99443645e+03,  8.82995950e+03,  3.34963031e+04,  3.26217760e+04,
        3.96389118e+03,  4.37252844e+03,  1.41037268e+04,  1.16294643e+04,
        8.81770352e+03,  1.21890544e+04,  5.31135817e+03,  3.24537433e+03,
        3.55757566e+04,  9.14940519e+03,  1.59073595e+04,  2.42550822e+03,
        1.23058785e+04,  

## Step 3: Metrics

Let's print the full report of the model

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

# compare the predictions against the true test targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# one metric per line; RMSE is the square root of the MSE
print(
    f"MSE: {mse}",
    f"RMSE: {mse**0.5}",
    f"Coefficient of determination: {r2}",
    sep="\n",
)

MSE: 33624835.693370104
RMSE: 5798.692584830662
Coefficient of determination: 0.7834131372909037


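We can see that the error is large: the MSE is about 33.6 million (in squared units of the target), which corresponds to an RMSE of about 5,799, and the coefficient of determination shows that the model explains about 78% of the variance of the test target. Plain linear regression has no hyperparameters to tune, so this model cannot be optimized further. Let's try with a different set of features: the ones that had the best correlation with the target.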

# Option 2. Model with the selected features

## Step 1: Select the features

In [28]:
# feature columns kept for the reduced model
selected_features = [
    "age",
    "bmi",
    "smoker",
]

In [29]:
# keep only the selected feature columns in both splits
# (this rebinds X_train / X_test, so the full-feature frames are replaced)
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]

In [30]:
# confirm that only the selected columns remain in the training features
X_train.head(n=5)

Unnamed: 0,age,bmi,smoker
0,0.472227,-1.756525,1.0
1,0.543313,-1.033082,1.0
2,0.898745,-0.943687,1.0
3,-0.025379,0.622393,1.0
4,1.040918,-1.504893,1.0


In [31]:
# confirm that only the selected columns remain in the test features
X_test.head(n=5)

Unnamed: 0,age,bmi,smoker
0,0.458596,-0.937152,1.0
1,-0.187133,-0.167527,1.0
2,1.821801,-0.665519,0.0
3,0.530343,-0.846608,1.0
4,-1.406842,0.134287,0.0


## Step 2: Initialization and training of the model

In [32]:
from sklearn.linear_model import LinearRegression

# linear regression fitted on the current (reduced) training features
model = LinearRegression()
# the target frame's values are flattened to 1-D before fitting
model.fit(X=X_train, y=y_train.to_numpy().ravel())

## Step 3: Model prediction

In [33]:
# predict the target for the test features, then display the predictions
y_pred = model.predict(X=X_test)
y_pred

array([ 8303.76487684,  7465.01383403, 37489.41821759,  8744.13574931,
       27284.55314151, 10975.92192054,   663.14230306, 17265.97084616,
        1382.86146742, 10452.10197172, 27809.02851793,  8952.67692567,
        5497.55848276, 37620.88744272, 40838.47445666, 36814.55600926,
       14834.35454962, 36090.96748496,  9466.15806469, 31991.13406696,
        2667.92050461, 10105.93902393,  3133.05378103,  6864.56193719,
       10535.37720883, 13011.05196947, 15340.03541891,  5227.63821936,
        9273.78053063,  2789.78786203,  8286.7719677 , 13528.69940732,
        5056.8906714 ,  2739.73072619,  4840.51845041, 12886.4921974 ,
        2403.73274897,  9260.18620332, 33484.56044128, 32762.97812911,
        2569.21856729,  4216.10264154, 14512.01244954, 12058.56155276,
        8474.42010006, 12697.574795  ,  4668.47085749,  2793.21721877,
       34836.93568742,  8777.73230396, 16501.39472578,  2548.59168726,
       12083.26240688,  1532.12209361, 14082.87120352, 11785.21193262,
      

## Step 4: Metrics

Let's print the full report of the model

In [34]:
from sklearn.metrics import mean_squared_error, r2_score

# compare the predictions against the true test targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# one metric per line; RMSE is the square root of the MSE
print(
    f"MSE: {mse}",
    f"RMSE: {mse**0.5}",
    f"Coefficient of determination: {r2}",
    sep="\n",
)

MSE: 34520308.99137751
RMSE: 5875.398624040543
Coefficient of determination: 0.7776451462135991


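We can see that the error is still large: the MSE is about 34.5 million (in squared units of the target), which corresponds to an RMSE of about 5,875, and the coefficient of determination shows that the model explains about 77.8% of the variance of the test target. Plain linear regression has no hyperparameters to tune, so this model cannot be optimized further. We got almost the same result with just 3 features.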

# Conclusion

With the data processing that was applied, both models reached similar coefficients of determination. This model could be used, but the consequences of using it should be studied first. A different approach to data analysis and processing could lead to better results. Another type of model, for example Regularized Linear Regression, could also give us better results.