In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read data
path = 'data/medcenter.csv'
medcenter_df_small = pd.read_csv(path)

In [7]:
# remove when physical health and mental health are equal to 0
medcenter_df_small = medcenter_df_small[medcenter_df_small['Physical Health'] != 0]
medcenter_df_small = medcenter_df_small[medcenter_df_small['Mental Health'] != 0]

In [8]:
sleep_hrs_outliers = medcenter_df_small.loc[(medcenter_df_small['Hours of sleep'] > 14) | (medcenter_df_small['Hours of sleep'] < 2)].shape[0]
bmi_outliers = medcenter_df_small.loc[(medcenter_df_small['Body Mass Index'] > 50) | (medcenter_df_small['Body Mass Index'] < 12)].shape[0]


potential_outliers_to_remove = sleep_hrs_outliers + bmi_outliers

print(f"Potential outliers to remove are {potential_outliers_to_remove} on {medcenter_df_small.shape[0]} total data")

# remove outliers
medcenter_df_small = medcenter_df_small[(medcenter_df_small['Hours of sleep'] < 14) & (medcenter_df_small['Hours of sleep'] > 2)]

Potential outliers to remove are 792 on 41393 total data


In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Columns to be one-hot encoded
onehot_encoded_columns_small = ['Gender', 'Ethnicity', 'Walking Difficulty', 'Skin Cancer', 'Torsades de Pointes', 'Asthma Status', 'Do you Exercise', 'Kidney Disease','Is Smoking','History of Stroke', 'How many Drinks per Week']
# Columns to be label encoded
label_encoded_columns_small = ["How do you Feel", "Age Group", "Diabetes"]

# Applying OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)  # 'drop' parameter is used to avoid multicollinearity
for column in onehot_encoded_columns_small:
    # The encoder returns a 2D array, which we need to convert to a DataFrame
    onehot_df = pd.DataFrame(encoder.fit_transform(medcenter_df_small[[column]]))
    
    # The new columns will be named after the classes detected by the encoder
    onehot_df.columns = encoder.get_feature_names_out([column])
    
    # Drop the original column from the main DataFrame
    medcenter_df_small.drop([column], axis=1, inplace=True)
    
    # Concatenate the one-hot encoded DataFrame to the main one
    medcenter_df_small = pd.concat([medcenter_df_small, onehot_df], axis=1)

# Applying LabelEncoder
le = LabelEncoder()
for column in label_encoded_columns_small:
    medcenter_df_small[column] = le.fit_transform(medcenter_df_small[column])

medcenter_df_small.head()


Unnamed: 0,Hours of sleep,How do you Feel,Diabetes,Age Group,Mental Health,Body Mass Index,Physical Health,Patient ID,How many Drinks per Week_Y,Gender_M,...,Ethnicity_White,Walking Difficulty_Y,Skin Cancer_Y,Torsades de Pointes_Y,Asthma Status_Y,Do you Exercise_Y,Kidney Disease_Y,Is Smoking_Y,History of Stroke_Y,How many Drinks per Week_Y.1
0,10.0,2,0,12,0.0,15.55,7.0,100074,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,7.0,1,2,9,0.0,38.62,2.0,100086,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,7.0,2,0,8,0.0,21.62,3.0,100094,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.0,2,0,9,0.0,22.14,0.0,100154,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,8.0,1,2,10,0.0,43.05,0.0,100158,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [11]:
column_to_drop = ['Ethnicity_Black','Ethnicity_Asian','Gender_M','Ethnicity_Hispanic',  'Ethnicity_Other', 'Ethnicity_White', 'Skin Cancer_Y','How many Drinks per Week_Y']
medcenter_df_small.drop(column_to_drop, axis = 1, inplace=True)  

In [12]:
from sklearn.preprocessing import StandardScaler
cols_to_scale = ['Hours of sleep', 'Mental Health', 'Body Mass Index']

scaler = StandardScaler()
medcenter_df_small[cols_to_scale] = scaler.fit_transform(medcenter_df_small[cols_to_scale])
medcenter_df_small.head()

Unnamed: 0,Hours of sleep,How do you Feel,Diabetes,Age Group,Mental Health,Body Mass Index,Physical Health,Patient ID,Walking Difficulty_Y,Torsades de Pointes_Y,Asthma Status_Y,Do you Exercise_Y,Kidney Disease_Y,Is Smoking_Y,History of Stroke_Y
0,2.004904,2,0,12,-0.490285,-2.012859,7.0,100074,1.0,1.0,0.0,1.0,0.0,1.0,1.0
1,-0.06871,1,2,9,-0.490285,1.61338,2.0,100086,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,-0.06871,2,0,8,-0.490285,-1.058751,3.0,100094,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.622495,2,0,9,-0.490285,-0.977015,0.0,100154,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.622495,1,2,10,-0.490285,2.309706,0.0,100158,1.0,1.0,0.0,1.0,1.0,0.0,0.0


In [13]:
from sklearn.model_selection import train_test_split


X = medcenter_df_small.drop('Physical Health', axis=1)
y = medcenter_df_small['Physical Health']
# Split the dataset into train and test sets
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Function to evaluate or regression models

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def print_score(reg, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = reg.predict(X_train)
        print("Train Result:\n================================================")
        print(f"Mean Squared Error: {mean_squared_error(y_train, pred):.3f}")
        print(f"Mean Absolute Error: {mean_absolute_error(y_train, pred):.3f}")
        print(f"R-squared: {r2_score(y_train, pred):.3f}")
        
    elif not train:
        pred = reg.predict(X_test)
        print("Test Result:\n================================================")        
        print(f"Mean Squared Error: {mean_squared_error(y_test, pred):.3f}")
        print(f"Mean Absolute Error: {mean_absolute_error(y_test, pred):.3f}")
        print(f"R-squared: {r2_score(y_test, pred):.3f}")



In [16]:
from sklearn.tree import DecisionTreeRegressor
tree_reg_small = DecisionTreeRegressor(random_state=42)
tree_reg_small.fit(X_train_small, y_train_small)

print_score(tree_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=True)
print_score(tree_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=False)

decision_tree_mae_small = mean_absolute_error(y_test_small, tree_reg_small.predict(X_test_small))
decision_tree_r2_small = r2_score(y_test_small, tree_reg_small.predict(X_test_small))

Train Result:
Mean Squared Error: 0.000
Mean Absolute Error: 0.000
R-squared: 1.000
Test Result:
Mean Squared Error: 78.957
Mean Absolute Error: 4.147
R-squared: -0.208


In [17]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(
    estimator=tree_reg_small, 
    param_grid=param_grid, 
    cv=5, 
    n_jobs=-1, 
    verbose=2)

# Fit the model with GridSearchCV
grid_search.fit(X_train_small, y_train_small)

# Print the best parameters and the corresponding score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

decision_tree_tuned_mae_small = mean_absolute_error(y_test_small, grid_search.predict(X_test_small))
decision_tree_tuned_r2_small = r2_score(y_test_small, grid_search.predict(X_test_small))


# use the best parameter to train the model and print the performance
best_tree_reg_small = grid_search.best_estimator_
print_score(best_tree_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=True)
print_score(best_tree_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=False)


Fitting 5 folds for each of 144 candidates, totalling 720 fits


180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
71 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\L

Best parameters: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best score: 0.3974525126252667
Train Result:
Mean Squared Error: 36.362
Mean Absolute Error: 3.314
R-squared: 0.443
Test Result:
Mean Squared Error: 39.573
Mean Absolute Error: 3.438
R-squared: 0.394


In [19]:
from sklearn.ensemble import RandomForestRegressor

rf_reg_small = RandomForestRegressor(random_state=42)
rf_reg_small.fit(X_train_small, y_train_small)

print_score(rf_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=True)
print_score(rf_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=False)

random_forest_mae_small = mean_absolute_error(y_test_small, tree_reg_small.predict(X_test_small))
random_forest_r2_small = r2_score(y_test_small, tree_reg_small.predict(X_test_small))

Train Result:
Mean Squared Error: 5.676
Mean Absolute Error: 1.329
R-squared: 0.913
Test Result:
Mean Squared Error: 41.299
Mean Absolute Error: 3.624
R-squared: 0.368


In [20]:
# gradient boosting
from sklearn.ensemble import GradientBoostingRegressor

gb_reg_small = GradientBoostingRegressor(n_estimators=100)
gb_reg_small.fit(X_train_small, y_train_small)

print_score(gb_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=True)
print_score(gb_reg_small, X_train_small, y_train_small, X_test_small, y_test_small, train=False)

gradient_boosting_mae_small = mean_absolute_error(y_test_small, gb_reg_small.predict(X_test_small))
gradient_boosting_r2_small = r2_score(y_test_small, gb_reg_small.predict(X_test_small))


Train Result:
Mean Squared Error: 37.725
Mean Absolute Error: 3.428
R-squared: 0.422
Test Result:
Mean Squared Error: 38.543
Mean Absolute Error: 3.461
R-squared: 0.410


In [21]:
from sklearn.model_selection import RandomizedSearchCV

# Reduced parameter grid
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

# Using RandomizedSearchCV for faster execution
random_search_small = RandomizedSearchCV(
    estimator=gb_reg_small, 
    param_distributions=param_grid, 
    n_iter=500,  # number of parameter settings that are sampled
    cv=4,  # reduced number of CV splits
    n_jobs=-1, 
    verbose=2,
    random_state=42
)

# Fit the model with RandomizedSearchCV
random_search_small.fit(X_train_small, y_train_small)

# Print the best parameters and the corresponding score
print(f"Best parameters: {random_search_small.best_params_}")
print(f"Best score: {random_search_small.best_score_}")

print_score(random_search_small, X_train_small, y_train_small, X_test_small, y_test_small, train=True)
print_score(random_search_small, X_train_small, y_train_small, X_test_small, y_test_small, train=False)

gradient_boosting_tuned_mae_small = mean_absolute_error(y_test_small, random_search_small.predict(X_test_small))
gradient_boosting_tuned_r2_small = r2_score(y_test_small, random_search_small.predict(X_test_small))




Fitting 4 folds for each of 72 candidates, totalling 288 fits


144 fits failed out of a total of 288.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
94 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\alexa\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\L

Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 5}
Best score: 0.4205134779113838
Train Result:
Mean Squared Error: 36.729
Mean Absolute Error: 3.360
R-squared: 0.437
Test Result:
Mean Squared Error: 38.478
Mean Absolute Error: 3.432
R-squared: 0.411
