In [412]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import TheilSenRegressor

### Getting the data from the transformed data set

In [413]:
# Load the transformed dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/transformed_data.csv')

In [414]:
# Preview the loaded frame; the rendered output shows only the first and last rows.
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,Residence_type_Rural,Residence_type_Urban
0,0,67.0,0,1,1,228.69,36.600000,1,0,0,1,0,0,1,0,0,0,1
1,1,61.0,0,0,1,202.21,28.893237,1,0,0,0,1,0,0,1,0,1,0
2,0,80.0,0,1,1,105.92,32.500000,1,0,0,1,0,0,0,1,0,1,0
3,1,49.0,0,0,1,171.23,34.400000,1,0,0,1,0,0,0,0,1,0,1
4,1,79.0,1,0,1,174.12,24.000000,1,0,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1,80.0,1,0,1,83.75,28.893237,0,0,0,1,0,0,0,1,0,0,1
5106,1,81.0,0,0,1,125.20,40.000000,0,0,0,0,1,0,0,1,0,0,1
5107,1,35.0,0,0,1,82.99,30.600000,0,0,0,0,1,0,0,1,0,1,0
5108,0,51.0,0,0,1,166.29,25.600000,0,0,0,1,0,0,1,0,0,1,0


In [415]:
# Split the target from the features without mutating `df`, so re-running this
# cell (or any later cell that still needs the `stroke` column) keeps working.
y = df['stroke'].to_numpy()
x = df.drop(columns=['stroke']).to_numpy()

### Splitting the dataset with 0.7 train size

In [416]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.3,
  random_state=42,
)

### Training the model

In [417]:
# Fit every candidate regressor on the training split.
# Estimators with internal randomness get a fixed seed so that a fresh
# Restart & Run All reproduces the same scores.
# NOTE(review): `stroke` appears to be a 0/1 label in the preview above, so
# classifiers may be a better fit than regressors -- confirm the intent.
SEED = 42

lir = LinearRegression().fit(X_train, y_train)
rfr = RandomForestRegressor(random_state=SEED).fit(X_train, y_train)
gbr = GradientBoostingRegressor(random_state=SEED).fit(X_train, y_train)
dtr = DecisionTreeRegressor(random_state=SEED).fit(X_train, y_train)
knr = KNeighborsRegressor().fit(X_train, y_train)
ridge = Ridge().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)
en = ElasticNet().fit(X_train, y_train)
tsr = TheilSenRegressor(random_state=SEED).fit(X_train, y_train)

In [418]:
# Pair each display label with its fitted estimator. Every entry is a
# two-item list: entry[0] is the label, entry[1] is the model.
models = [
  [label, estimator]
  for label, estimator in (
    ('Linear Regression', lir),
    ('Random Forest', rfr),
    ('Gradient Boosting', gbr),
    ('Decision Tree', dtr),
    ('K Neighbors', knr),
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('Elastic Net', en),
    ('TheilSen', tsr),
  )
]

The function below calculates the mean squared error (MSE) and mean absolute error (MAE) of a model's predictions.

In [419]:
def evaluate(model, X, y):
  """Score a fitted model on the given data.

  Args:
    model: object exposing `predict(X)`.
    X: features passed to `model.predict`.
    y: true target values compared against the predictions.

  Returns:
    Tuple `(mse, mae)`: mean squared error and mean absolute error.
  """
  predictions = model.predict(X)
  return (
    mean_squared_error(y, predictions),
    mean_absolute_error(y, predictions),
  )

The function below uses `evaluate` to score every model and returns the name of the best model with its MSE and MAE, alongside a table of the MSE and MAE of all the models.

In [420]:
def compare_models(models, X, y):
  """Score every model on (X, y) and report the best one.

  The best model is the one with the lowest MSE; ties on MSE are broken by
  the lowest MAE, and remaining ties by position in `models`. Ranking on a
  single ordered key keeps the winner independent of the order in which the
  models are listed.

  Args:
    models: sequence of `[name, fitted_model]` entries.
    X: features passed to each model's `predict`.
    y: true target values.

  Returns:
    dict with keys:
      'best_model': name of the best model (None if `models` is empty),
      'best_mse':   its MSE (0 if `models` is empty),
      'best_mae':   its MAE (0 if `models` is empty),
      'results':    DataFrame indexed by model name with 'mse' and 'mae' columns.
  """
  # name -> (mse, mae); a repeated name overwrites the earlier score.
  scores = {}
  for entry in models:
    scores[entry[0]] = evaluate(entry[1], X, y)

  # Build the table in one go instead of growing it row by row.
  results = pd.DataFrame(
    list(scores.values()),
    index=list(scores),
    columns=['mse', 'mae'],
  )

  if scores:
    # (mse, mae) tuples compare lexicographically; min() keeps the first
    # entry on a full tie.
    best_model = min(scores, key=scores.get)
    best_mse, best_mae = scores[best_model]
  else:
    best_model, best_mse, best_mae = None, 0, 0

  return {
    'best_model': best_model,
    'best_mse': best_mse,
    'best_mae': best_mae,
    'results': results
  }

The function below returns the result of `compare_models` and also displays it.

In [421]:
def view_results(title, models, X, y):
  """Compare the models, print a colored summary and return the comparison.

  Args:
    title: label for the dataset being scored; used in the printed summary.
    models: sequence of `[name, fitted_model]` entries.
    X: features passed through to `compare_models`.
    y: true target values passed through to `compare_models`.

  Returns:
    The dict returned by `compare_models`.
  """
  # ANSI escape codes used to color the printed summary.
  blue, green, reset = '\033[94m', '\033[92m', '\033[0m'

  data = compare_models(models, X, y)

  print(blue + title + reset)
  print(green + data['best_model'] + reset + f" is the best model for the {title}.")
  for label, key in (("MSE: ", 'best_mse'), ("MAE: ", 'best_mae')):
    print(label, green + str(data[key]) + reset)
  display(data['results'])
  return data

Getting the errors and displaying them for the test data set

In [422]:
# Score every model on the held-out test split.
test_data_result = view_results(
  title='Testing Data',
  models=models,
  X=X_test,
  y=y_test,
)

[94mTesting Data[0m
[92mLinear Regression[0m is the best model for the Testing Data.
MSE:  [92m0.049965083884116385[0m
MAE:  [92m0.0981108639850805[0m


Unnamed: 0,mse,mae
Linear Regression,0.049965,0.098111
Random Forest,0.051406,0.092531
Gradient Boosting,0.049973,0.088238
Decision Tree,0.097195,0.097195
K Neighbors,0.056751,0.089106
Ridge,0.049965,0.098099
Lasso,0.054241,0.096983
Elastic Net,0.051967,0.094233
TheilSen,0.05804,0.058103


As above, but for the training data set

In [423]:
# Score every model on the data it was trained on.
train_data_result = view_results(
  title='Training Data',
  models=models,
  X=X_train,
  y=y_train,
)

[94mTraining Data[0m
[92mDecision Tree[0m is the best model for the Training Data.
MSE:  [92m0.0[0m
MAE:  [92m0.0[0m


Unnamed: 0,mse,mae
Linear Regression,0.03931,0.087239
Random Forest,0.006197,0.031736
Gradient Boosting,0.030024,0.067792
Decision Tree,0.0,0.0
K Neighbors,0.03177,0.063573
Ridge,0.03931,0.08723
Lasso,0.042259,0.084962
Elastic Net,0.040673,0.083005
TheilSen,0.044718,0.044781


Calculating the differences to check for underfitting and overfitting

In [424]:
# Train-minus-test error per model, aligned on the model-name index of the
# two result tables.
diff_df = pd.DataFrame({
  'mse difference': train_data_result['results']['mse'].sub(test_data_result['results']['mse']),
  'mae difference': train_data_result['results']['mae'].sub(test_data_result['results']['mae']),
})
display(diff_df)

Unnamed: 0,mse difference,mae difference
Linear Regression,-0.010655,-0.010872
Random Forest,-0.045209,-0.060795
Gradient Boosting,-0.019949,-0.020446
Decision Tree,-0.097195,-0.097195
K Neighbors,-0.024982,-0.025534
Ridge,-0.010655,-0.01087
Lasso,-0.011982,-0.012021
Elastic Net,-0.011294,-0.011228
TheilSen,-0.013323,-0.013323


The Gradient Boosting Regressor shows only a small gap between its training and testing errors (about 0.02 for both MSE and MAE), and its test MSE is essentially tied for the lowest. I will therefore choose this model, search for the best hyperparameters to improve it, and then save it for use in the Flask app.

In [425]:
# Do grid search for best parameters on best model so far
# Gradient Boosting

In [426]:
# Do feature selection see if we can improve the model

In [427]:
# Save to pkl file