In [5]:
import pandas as pd


# Load the dataset
data = pd.read_csv('pima_indians_diabetes.csv')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
columns_with_zeros = ['plasma_concentration', 'diastolic_blood_pressure', 'triceps_skinfold_thickness', 
                      'serum_insulin', 'bmi']
data[columns_with_zeros] = data[columns_with_zeros].replace(0, np.nan)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
data[columns_with_zeros] = imputer.fit_transform(data[columns_with_zeros])

# Standardize the features
scaler = StandardScaler()
features = data.drop(columns=['class'])
scaled_features = scaler.fit_transform(features)

# Combine scaled features with the target variable
df_scaled = pd.DataFrame(scaled_features, columns=features.columns)
df_scaled['class'] = data['class']
X_scaled= df_scaled.drop(columns=['class'])
y = df_scaled['class']

In [8]:
df_scaled.head()

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,0.639947,0.865108,-0.033518,0.6655021,-3.345079e-16,0.166292,0.468492,1.425995,1
1,-0.844885,-1.206162,-0.529859,-0.01746338,-3.345079e-16,-0.852531,-0.365061,-0.190672,0
2,1.23388,2.015813,-0.695306,8.087936e-16,-3.345079e-16,-1.332833,0.604397,-0.105584,1
3,-0.844885,-1.074652,-0.529859,-0.7004289,-0.7243887,-0.634212,-0.920763,-1.041549,0
4,-1.141852,0.503458,-2.680669,0.6655021,0.1465506,1.54898,5.484909,-0.020496,1


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score, classification_report  

# Define the parameter grid
param_grid = {
    'n_estimators': [10, 25, 40],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 25, 40]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_

# Report the accuracy on train and test datasets
train_accuracy = accuracy_score(y_train, best_rf.predict(X_train))
test_accuracy = accuracy_score(y_test, best_rf.predict(X_test))
test_precision = precision_score(y_test, best_rf.predict(X_test))
test_recall = recall_score(y_test, best_rf.predict(X_test))
test_f1 = f1_score(y_test, best_rf.predict(X_test))

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1 Score: {test_f1}')


Fitting 10 folds for each of 27 candidates, totalling 270 fits
[CV] END ...max_depth=10, max_features=auto, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=25; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=10; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=25; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=25; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=25; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n_estimators=25; total time=   0.0s
[CV] END ...max_depth=10, max_features=auto, n

90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
53 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ajf/Share/PhD_Mechatronics/semester_2/ML_Applied/HomeWork/Applied_ML_Course/ML-HW02/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ajf/Share/PhD_Mechatronics/semester_2/ML_Applied/HomeWork/Applied_ML_Course/ML-HW02/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/home/ajf/Share/PhD_Mechatronics/semester_2/ML_Applied/HomeWork/Applied_ML_Course/ML-HW02/.venv/lib/python3.10/site-pac

In [22]:
# Get the results from GridSearchCV
results = pd.DataFrame(grid_search.cv_results_)

# Display the results
results[['param_n_estimators', 'param_max_features', 'param_max_depth', 'mean_test_score',]]


Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,mean_test_score
0,10,auto,10,
1,25,auto,10,
2,40,auto,10,
3,10,sqrt,10,0.739203
4,25,sqrt,10,0.772642
5,40,sqrt,10,0.770755
6,10,log2,10,0.729769
7,25,log2,10,0.737212
8,40,log2,10,0.742802
9,10,auto,25,


In [23]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
gb_train_accuracy = accuracy_score(y_train, gb.predict(X_train))
gb_test_accuracy = accuracy_score(y_test, gb.predict(X_test))
gb_test_precision = precision_score(y_test, gb.predict(X_test))
gb_test_recall = recall_score(y_test, gb.predict(X_test))
gb_test_f1 = f1_score(y_test, gb.predict(X_test))

print(f'Train Accuracy: {gb_train_accuracy}')
print(f'Test Accuracy: {gb_test_accuracy}')
print(f'Test Precision: {gb_test_precision}')
print(f'Test Recall: {gb_test_recall}')
print(f'Test F1 Score: {gb_test_f1}')

Train Accuracy: 0.9422718808193669
Test Accuracy: 0.7402597402597403
Test Precision: 0.6111111111111112
Test Recall: 0.6875
Test F1 Score: 0.6470588235294118


In [24]:
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier(random_state=42)
ab.fit(X_train, y_train)
ab_train_accuracy = accuracy_score(y_train, ab.predict(X_train))
ab_test_accuracy = accuracy_score(y_test, ab.predict(X_test))
ab_test_precision = precision_score(y_test, ab.predict(X_test))
ab_test_recall = recall_score(y_test, ab.predict(X_test))
ab_test_f1 = f1_score(y_test, ab.predict(X_test))



print(f'Train Accuracy: {ab_train_accuracy}')
print(f'Test Accuracy: {ab_test_accuracy}')
print(f'Test Precision: {ab_test_precision}')
print(f'Test Recall: {ab_test_recall}')
print(f'Test F1 Score: {ab_test_f1}')





Train Accuracy: 0.8603351955307262
Test Accuracy: 0.7532467532467533
Test Precision: 0.6352941176470588
Test Recall: 0.675
Test F1 Score: 0.6545454545454545


In [25]:
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
xgb_train_accuracy = accuracy_score(y_train, xgb.predict(X_train))
xgb_test_accuracy = accuracy_score(y_test, xgb.predict(X_test))
xgb_test_precision = precision_score(y_test, xgb.predict(X_test))
xgb_test_recall = recall_score(y_test, xgb.predict(X_test))
xgb_test_f1 = f1_score(y_test, xgb.predict(X_test))

print(f'Train Accuracy: {xgb_train_accuracy}')
print(f'Test Accuracy: {xgb_test_accuracy}')
print(f'Test Precision: {xgb_test_precision}')
print(f'Test Recall: {xgb_test_recall}')
print(f'Test F1 Score: {xgb_test_f1}')


Train Accuracy: 1.0
Test Accuracy: 0.7445887445887446
Test Precision: 0.6153846153846154
Test Recall: 0.7
Test F1 Score: 0.6549707602339181
