Imports + read data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the trimmed diabetes dataset. The CSV is looked up relative to the
# working directory, so adjust the path if your copy lives elsewhere.
trimmed_dataset = pd.read_csv("diabetes_dataset_trimmed.csv")

# Every column except the last is used as a predictor; the last column is
# used as the target.
features = trimmed_dataset.iloc[:, :-1]
labels = trimmed_dataset.iloc[:, -1]

# Hold out 20% of the rows for testing. The fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


Logistic Regression

In [3]:
# --- Scale ---------------------------------------------------------------
# Learn the standardization statistics from the training split only, then
# apply that same transformation to the held-out split.
scaler = StandardScaler()
normalized_features_train_log = scaler.fit_transform(X_train)
normalized_features_test_log = scaler.transform(X_test)

# --- Fit and predict -----------------------------------------------------
logreg = LogisticRegression(random_state=42).fit(
    normalized_features_train_log, y_train
)
test_predictions_log = logreg.predict(normalized_features_test_log)

# --- Score ---------------------------------------------------------------
# zero_division=1 is the value substituted when the F1 score is undefined
# (a division by zero) instead of emitting a warning.
accuracy_log_reg = accuracy_score(y_test, test_predictions_log)
f1_log_reg = f1_score(y_test, test_predictions_log, zero_division=1)

# Per-model summary; each entry is a list of pre-formatted metric strings.
accuracy_f1_dict = {
    'Logistic Regression': [
        f'F1 Score: {f1_log_reg:.4f}',
        f'Accuracy: {accuracy_log_reg:.4f}',
    ],
}

print(accuracy_f1_dict)

{'Logistic Regression': ['F1 Score: 0.2390', 'Accuracy: 0.8650']}


In [4]:
# Gradient Boosting: exhaustive hyper-parameter search with 5-fold
# cross-validation on the (unscaled) training split, followed by an
# evaluation of the best estimator on the held-out split.
# NOTE: the grid below has 3 * 3 * 3 * 2 = 54 candidates, i.e. 270 fits with
# cv=5, so expect this cell to take a long time.

# Step 1: Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

# Step 2: Initialize the GBC model
# Seeded with the same random_state as the other models in this notebook.
# Without a seed, the randomized parts of boosting (e.g. row subsampling when
# subsample < 1.0) differ between runs, so the selected parameters and the
# reported scores would not be reproducible.
gbc = GradientBoostingClassifier(random_state=42)

# Step 3: Perform Grid Search
# Candidates are ranked by mean cross-validated accuracy. n_jobs=10 runs up
# to 10 fits in parallel; tune it to the number of cores on your machine.
grid_search = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=10,
)
grid_search.fit(X_train, y_train)

# Step 4: Get the best model
# best_estimator_ is the winning configuration refit on the whole training
# split (GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Step 5: Evaluate the model
y_pred = best_model.predict(X_test)
print("Accuracy on test data:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 54 candidates, totalling 270 fits


KeyboardInterrupt: 

In [None]:
# Random Forest
# --- Scale ---------------------------------------------------------------
# Re-fit the shared scaler on the training split, then standardize both
# splits with it.
normalized_features_train_rf = scaler.fit_transform(X_train)
normalized_features_test_rf = scaler.transform(X_test)

# --- Fit and predict -----------------------------------------------------
rf = RandomForestClassifier(random_state=42).fit(
    normalized_features_train_rf, y_train
)
test_predictions_rf = rf.predict(normalized_features_test_rf)

# --- Score ---------------------------------------------------------------
# zero_division=1 is the value substituted when the F1 score is undefined
# (a division by zero) instead of emitting a warning.
accuracy_rf = accuracy_score(y_test, test_predictions_rf)
f1_rf = f1_score(y_test, test_predictions_rf, zero_division=1)

# Record the formatted metrics next to the other models' results.
accuracy_f1_dict['Random Forest'] = [
    f'F1 Score: {f1_rf:.4f}',
    f'Accuracy: {accuracy_rf:.4f}',
]

# Echo the two strings just stored, separated by a space.
print(*accuracy_f1_dict['Random Forest'])