# Dataset Preparation


### Setup

In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
seed = 1855

## Data Collection


In [93]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
dataset = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)

dataset['target'] = diabetes.target

dataset.head()


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


## Handling missing values


In [94]:
# First let's check if there are any missing values in the dataset
dataset.isnull().sum()

# no missing values

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

## Encoding categorical features with One-Hot Encoding


In [95]:
# First let's check if there are any categorical features in the dataset
categorical_columns = dataset.select_dtypes(include=['object']).columns.tolist()


categorical_columns

# no categorical columns

[]

## Encoding binary class label


In [96]:
# this is a regression problem, so we will check the correlation of the features with the target variable, no encoding is needed

# Model Building


In [97]:
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.neighbors import *

## Splitting the dataset into training and testing sets


In [98]:
x = dataset.drop('target', axis=1)
y = dataset['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [99]:
def evaluate(expected, predicted):
    mae = mean_absolute_error(expected, predicted)
    mse = mean_squared_error(expected, predicted)
    r2 = r2_score(expected, predicted)
    print(f'MAE: {mae:.2f}')
    print(f'MSE: {mse:.2f}')
    print(f'R2: {r2:.2f}')

## Model Building


In [100]:
model = LinearRegression()

## Model Evaluation


### Cross Validation

In [101]:
scoring = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error']
cross_validation_results = cross_validate(model, x_train, y_train, cv=10, scoring=scoring)

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).capitalize()
    print(f'{name}: {np.mean(scores):.2f} += {np.std(scores):.2f}')
  


R2: 0.44 += 0.11
Neg_mean_absolute_error: -44.32 += 4.57
Neg_mean_squared_error: -3069.22 += 719.28
Neg_root_mean_squared_error: -55.04 += 6.32


### Cross Validation with KFold

In [102]:

k_fold = KFold(n_splits=10, random_state=seed, shuffle=True)
scoring = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error']
cross_validation_results = cross_validate(model, x_train, y_train, cv=k_fold, scoring=scoring)

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).capitalize()
    print(f'{name}: {np.mean(scores):.2f} += {np.std(scores):.2f}')
  


R2: 0.45 += 0.15
Neg_mean_absolute_error: -44.70 += 6.75
Neg_mean_squared_error: -3084.47 += 862.01
Neg_root_mean_squared_error: -55.06 += 7.30


### Cross Validation with StratifiedKFold

In [103]:

k_fold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
scoring = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error']
cross_validation_results = cross_validate(model, x_train, y_train, cv=k_fold, scoring=scoring)

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).capitalize()
    print(f'{name}: {np.mean(scores):.2f} += {np.std(scores):.2f}')
  


R2: 0.47 += 0.08
Neg_mean_absolute_error: -43.95 += 4.31
Neg_mean_squared_error: -2999.56 += 539.95
Neg_root_mean_squared_error: -54.53 += 5.09


## Model Building with Hyperparameter Tuning


In [104]:
model = LinearRegression()
k_fold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
params = {
    'fit_intercept': [True, False],
}

grid_search = GridSearchCV(model, param_grid=params, cv=k_fold, scoring='r2')


grid_search.fit(x_train, y_train)

print(grid_search.best_params_)


{'fit_intercept': True}



## Model Evaluation


In [105]:
best_model = grid_search.best_estimator_

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

evaluate(y_test, y_pred)

pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).head(10)

MAE: 44.33
MSE: 2907.13
R2: 0.56


Unnamed: 0,Actual,Predicted
393,69.0,100.030358
363,58.0,164.061422
325,192.0,210.171042
168,268.0,211.817259
297,31.0,99.678574
262,308.0,264.550208
359,311.0,169.453903
320,122.0,188.93891
397,198.0,181.169235
401,93.0,81.026392


# Comparing Different Models Performance

In [106]:
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

results = {}
for model_name, model in models.items():
  results[model_name] = -cross_val_score(model, x_train, y_train, cv=10, scoring='neg_mean_squared_error')
  
  
results_df = pd.DataFrame(results).transpose()

results_df['mean'] = results_df.mean(axis=1)
results_df['std'] = results_df.std(axis=1)
results_df.sort_values(by=['mean', 'std'], ascending=False)

results_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
LinearRegression,3826.50518,2743.185179,4546.647674,2579.328425,2715.026646,2554.12489,2115.20753,3347.969213,2547.061455,3717.099938,3069.215613,719.276068
DecisionTreeRegressor,4767.388889,6730.444444,9054.944444,4351.4,7093.771429,3995.457143,6473.685714,7323.257143,6145.057143,10879.342857,6681.474921,2011.922215
RandomForestRegressor,3385.379675,2919.463097,5339.387564,2511.720331,3210.289694,3148.538654,3052.985911,3648.819897,3413.690531,4728.338974,3535.861433,815.048211
KNeighborsRegressor,3640.347778,2218.703333,6358.202222,3450.472,2717.721143,4393.798857,4147.739429,4566.28,3195.701714,4927.067429,3961.60339,1131.321776


## Model Building

## Model Comparison