In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV , LogisticRegression
from sklearn.pipeline import Pipeline


In [3]:
df=pd.read_csv("Data.csv")

##### Preparing the dataframe

In [4]:
#temp_copy
ddf=df.copy()
ddf["region"]=df["region"].map({
    'southwest': 0,
    'southeast': 2,
    'northwest': 3,
    'northeast': 4})
mapped_values_smoking = ddf["smoker"].map({"yes": 1, "no": 0})
mapped_values_sex=ddf["sex"].map({"male": 1, "female": 0})

In [5]:
# when map fucntion wroks with only two valus 
# it needs to be separted otherwise it will assian NaN values 
ddf.smoker=mapped_values_smoking
ddf.sex=mapped_values_sex
ddf.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462


#### Linear regression models (a beginner 😂 machine learning algorithm)

### Implementing a standard scaler to standardize the data

In [19]:
# importing from dataframe
data_x=ddf[["age","bmi","smoker","children"]]
data_y=ddf["charges"]
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.25,random_state=42 )

# scaling the data 
scale=StandardScaler()
X_train=scale.fit_transform(X_train)
X_test=scale.transform(X_test)


##### Linear regression


In [7]:
# building the model
lr=LinearRegression()
# training the model
lr.fit(X_train,y_train)
# examin the trained model 
print(f" the coeffciencts {lr.coef_}")
print(f" the intercept {lr.intercept_} ")
y_pred=lr.predict(X_test)
# evaluting the model
mse=mean_squared_error(y_true=y_test,y_pred=y_pred)
r2=r2_score(y_true=y_test,y_pred=y_pred)
print(f" MSE =  {mse}")
print(f" R-squred = {r2}")

 the coeffciencts [3648.466473   1968.78192674 9547.15727997  524.0369871 ]
 the intercept 13267.935817337988 
 MSE =  35403760.365285985
 R-squred = 0.7653688584061044


##### Implementing Ridge regression with cross-validation

In [20]:
# setting alpha values
alpha_values = [0.01, 0.1, 1, 10, 100] 
#Building the model 
ridge_cv = RidgeCV(alphas=alpha_values,store_cv_values=True,alpha_per_target=False) #it will autmaitcally use Leave-One-Out Cross-Validation.
#train the model 
ridge_cv.fit(X_train,y_train)
# examin the trained model
print(f" the best alpha = {ridge_cv.alpha_}")
print(f" the best score = {ridge_cv.best_score_}")
print(f" the coeffcients = {ridge_cv.coef_}")
# predicting 
y_pred=ridge_cv.predict(X_test)
#evaluting the model
mse=mean_squared_error(y_true=y_test,y_pred=y_pred)
r2=r2_score(y_true=y_test,y_pred=y_pred)
print(f" MSE =  {mse}")
print(f" R-squred = {r2}")


 the best alpha = 1.0
 the best score = -37565982.46502742
 the coeffcients = [3644.64319649 1967.23148377 9537.50283005  523.96585031]
 MSE =  35408021.75909756
 R-squred = 0.7653406168949068




### Linear Regression pipeline

In [9]:
data_x=ddf[["age","bmi","smoker","children"]]
data_y=ddf["charges"]
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.25,random_state=42 )


In [10]:
# building the pipline
lr_pipe=Pipeline([
    ("scaler",StandardScaler()),
    ("model",LinearRegression()),
])
#training the piplein
lr_pipe.fit(X_train,y_train)
y_pred = lr_pipe.predict(X_test)

#evaluting the model
mse=mean_squared_error(y_true=y_test,y_pred=y_pred)
r2=r2_score(y_true=y_test,y_pred=y_pred)
print(f" MSE =  {mse}")
print(f" R-squred = {r2}")


 MSE =  35403760.365285985
 R-squred = 0.7653688584061044


In [11]:
alpha_values = [0.01, 0.1, 1, 10, 100]

#building the pipleine
Ridge_pipe=Pipeline([
    ("scale",StandardScaler()),
    ("model",RidgeCV(alphas=alpha_values,cv=50))
])
# train the pipeline
Ridge_pipe.fit(X_train,y_train)
Ridge_pipe.predict(X_test)

#evaluting the pipline
mse=mean_squared_error(y_true=y_test,y_pred=y_pred)
r2=r2_score(y_true=y_test,y_pred=y_pred)
print(f" MSE =  {mse}")
print(f" R-squred = {r2}")


 MSE =  35403760.365285985
 R-squred = 0.7653688584061044


### Combining the GridSearch and Piplines

In [12]:
data_x=ddf[["age","bmi","smoker","children"]]
data_y=ddf["charges"]
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.25,random_state=42 )

In [13]:
# Building the placeholder pipelines that will be used in grid_search
place_holder = Pipeline([
    ("scaler", StandardScaler()),  # Step for scaling
    ("model", LogisticRegression())  # Placeholder for model, will be replaced in param_grid
])

In [14]:
alpha_values = [0.01, 0.1, 1, 10, 100]

# Building the actual pipelines that will be used in the GridSearch instnace "grid_search" .
param_grid=[
    {          
         "scaler" :  [StandardScaler()],
         "model"  :  [LinearRegression()]
    },
    {
        "scaler" :  [StandardScaler()],
        "model"  :  [RidgeCV()],
        "model__alphas" : [alpha_values],
    },
        ]

In [15]:
# building the grid_search
grid_search=GridSearchCV(estimator=place_holder,
                         param_grid=param_grid,
                         cv=5,
                         scoring= make_scorer(mean_squared_error, greater_is_better=False) )
# training the grid_search

grid_search.fit(X_train,y_train)

In [24]:
# examine the grid_search 
print("best estimator = ",grid_search.best_estimator_)
print("#"*90)
print("best params =",grid_search.best_params_)
print("#"*90)

print("best Alpha value used by the best estimator = ",grid_search.best_estimator_.named_steps["model"].alpha_)
print("#"*90)

print("best score ",grid_search.best_score_) # The negative score is due to how grid search operates. Remember, -1000 is smaller than -1.

print("#"*90)

print("                 the Cross validation results")

pd.DataFrame(grid_search.cv_results_)



best estimator =  Pipeline(steps=[('scaler', StandardScaler()),
                ('model', RidgeCV(alphas=[0.01, 0.1, 1, 10, 100]))])
##########################################################################################
best params = {'model': RidgeCV(), 'model__alphas': [0.01, 0.1, 1, 10, 100], 'scaler': StandardScaler()}
##########################################################################################
best Alpha value used by the best estimator =  1.0
##########################################################################################
best score  -37458698.78853901
##########################################################################################
                 the Cross validation results


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_scaler,param_model__alphas,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005602,0.001359,0.002204,0.000398,LinearRegression(),StandardScaler(),,"{'model': LinearRegression(), 'scaler': Standa...",-35757320.0,-31361830.0,-39259470.0,-45253670.0,-35663090.0,-37459080.0,4632387.0,2
1,0.006596,0.00224,0.001199,0.0004,RidgeCV(),StandardScaler(),"[0.01, 0.1, 1, 10, 100]","{'model': RidgeCV(), 'model__alphas': [0.01, 0...",-35765010.0,-31370700.0,-39248880.0,-45225730.0,-35683170.0,-37458700.0,4617711.0,1


In [25]:
# Based on our analysis, 
# Ridge Regression with cross-validation excled as the most effective model.

In [17]:
# predict using the grid_search
y_pred=grid_search.predict(X_test)

# evaluting using MSE and R^2 metrics
mse=mean_squared_error(y_true=y_test,y_pred=y_pred)
r2=r2_score(y_true=y_test,y_pred=y_pred)


print(f"MSE =  {mse}")
print(f"R-squred = {r2}")

# Getting the different weights for each feature
Ws_var=pd.DataFrame(grid_search.best_estimator_["model"].coef_, columns=['Weights'],index=["age","bmi","smoker","children"])
Ws_var

MSE =  35408021.75909756
R-squred = 0.7653406168949068


Unnamed: 0,Weights
age,3644.643196
bmi,1967.231484
smoker,9537.50283
children,523.96585


## The feature weights align with our expectations based on the exploratory data analysis (EDA)