Imports + read data

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline


In [5]:
# Load the trimmed diabetes dataset. The path is relative, so adjust it if the
# CSV does not sit next to this notebook.
trimmed_dataset = pd.read_csv("diabetes_dataset_trimmed.csv")

# Positional column split: the last column is used as the target and every
# column before it as a feature.
features, labels = trimmed_dataset.iloc[:, :-1], trimmed_dataset.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.2,
)


# Logistic Regression

In [6]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()
normalized_features_train_log = scaler.fit_transform(X_train)
normalized_features_test_log = scaler.transform(X_test)

# Fit on the scaled training data (fit() returns the estimator itself), then
# predict the held-out rows.
logreg = LogisticRegression(random_state=42).fit(normalized_features_train_log, y_train)
test_predictions_log = logreg.predict(normalized_features_test_log)

# Score the predictions against the true test labels.
accuracy_log_reg = accuracy_score(y_test, test_predictions_log)
f1_log_reg = f1_score(y_test, test_predictions_log, zero_division=1)

# Start the results dict, keyed by model name, with pre-formatted score strings.
accuracy_f1_dict = dict()
accuracy_f1_dict['Logistic Regression'] = [
    f'F1 Score: {f1_log_reg:.4f}',
    f'Accuracy: {accuracy_log_reg:.4f}',
]

print(accuracy_f1_dict)

{'Logistic Regression': ['F1 Score: 0.2390', 'Accuracy: 0.8650']}


# Decision Tree

In [None]:
# Scale the features with a scaler owned by this cell: fitted on the training
# split, then applied unchanged to the test split.
scaler = StandardScaler()
normalized_features_train_dt = scaler.fit_transform(X_train)
normalized_features_test_dt = scaler.transform(X_test)

# Train the decision tree on the scaled training data.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(normalized_features_train_dt, y_train)

test_predictions_dt = DT.predict(normalized_features_test_dt)

f1_dt = f1_score(y_test, test_predictions_dt, zero_division=1)
accuracy_dt = accuracy_score(y_test, test_predictions_dt)

# Add this model's scores to the results dict created in the Logistic Regression
# cell. Rebinding `accuracy_f1_dict` here would throw away the scores already
# collected, which the Random Forest cell later expects to extend.
accuracy_f1_dict['Decision Tree'] = [f'F1 Score: {f1_dt:.4f}', f'Accuracy: {accuracy_dt:.4f}']

# Print only this model's entry so the cell output stays focused on the decision tree.
print({'Decision Tree': accuracy_f1_dict['Decision Tree']})

{'Decision Tree': ['F1 Score: 0.2842', 'Accuracy: 0.8323']}


# Random Forest

In [8]:
# Use a scaler created in this cell instead of relying on a `scaler` variable
# left behind by whichever cell happened to run before this one.
scaler = StandardScaler()
normalized_features_train_rf = scaler.fit_transform(X_train)
normalized_features_test_rf = scaler.transform(X_test)

# Train the random forest on the scaled training data.
rf = RandomForestClassifier(random_state=42)
rf.fit(normalized_features_train_rf, y_train)

test_predictions_rf = rf.predict(normalized_features_test_rf)

f1_rf = f1_score(y_test, test_predictions_rf, zero_division=1)
accuracy_rf = accuracy_score(y_test, test_predictions_rf)

# Record the scores alongside those of the models evaluated in earlier cells.
accuracy_f1_dict['Random Forest'] = [f'F1 Score: {f1_rf:.4f}', f'Accuracy: {accuracy_rf:.4f}']

print(f'F1 Score: {f1_rf:.4f}', f'Accuracy: {accuracy_rf:.4f}')

F1 Score: 0.2939 Accuracy: 0.8446


# Gradient Boosting (scikit-learn's GradientBoostingClassifier; reported under the label "XGBoost")

In [None]:
# Scale the features with a scaler owned by this cell: fitted on the training
# split, then applied unchanged to the test split.
scaler = StandardScaler()
normalized_features_train_xgb = scaler.fit_transform(X_train)
normalized_features_test_xgb = scaler.transform(X_test)

# NOTE: despite the "XGBoost" naming, this is scikit-learn's
# GradientBoostingClassifier, not the separate xgboost library.
XGB = GradientBoostingClassifier(random_state=42)
XGB.fit(normalized_features_train_xgb, y_train)

test_predictions_XGB = XGB.predict(normalized_features_test_xgb)

f1_XGB = f1_score(y_test, test_predictions_XGB, zero_division=1)
accuracy_XGB = accuracy_score(y_test, test_predictions_XGB)

# Add to the shared results dict instead of rebinding it, so the scores of the
# models evaluated in earlier cells are kept.
accuracy_f1_dict['XGBoost'] = [f'F1 Score: {f1_XGB:.4f}', f'Accuracy: {accuracy_XGB:.4f}']

# Print only this model's entry so the cell output stays focused on this model.
print({'XGBoost': accuracy_f1_dict['XGBoost']})

{'XGBoost': ['F1 Score: 0.2570', 'Accuracy: 0.8673']}


# Create an sklearn pipeline for these models.

In [None]:
# Reload and re-split the data so this cell does not rely on variables created
# by the cells above.
trimmed_dataset = pd.read_csv("diabetes_dataset_trimmed.csv")

# Positional column split: last column is the target, the rest are features.
labels = trimmed_dataset.iloc[:, -1]
features = trimmed_dataset.iloc[:, :-1]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Models to evaluate, keyed by the name used in the results dict.
# "XGBoost" here is scikit-learn's GradientBoostingClassifier.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": GradientBoostingClassifier(random_state=42),
}

accuracy_f1_dict = {}

# Train and evaluate each model inside a Pipeline, so standardisation is fitted
# on the training split and re-applied to the test split as part of the model
# rather than through separately managed scaled arrays.
for model_name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)

    test_predictions = pipeline.predict(X_test)
    f1 = f1_score(y_test, test_predictions, zero_division=1)
    accuracy = accuracy_score(y_test, test_predictions)
    accuracy_f1_dict[model_name] = [f'F1 Score: {f1:.4f}', f'Accuracy: {accuracy:.4f}']

# Print the results
print(accuracy_f1_dict)

{'Logistic Regression': ['F1 Score: 0.2390', 'Accuracy: 0.8650'], 'Decision Tree': ['F1 Score: 0.2842', 'Accuracy: 0.8323'], 'Random Forest': ['F1 Score: 0.2939', 'Accuracy: 0.8446'], 'XGBoost': ['F1 Score: 0.2570', 'Accuracy: 0.8673']}


# All Datasets All Models

In [19]:
# CSV files to benchmark, one per dataset variant (relative paths).
dataset_files = [
    "diabetes_balanced_filtered.csv",
    "diabetes_balanced_full.csv",
    "diabetes_unbalanced_full.csv",
    "diabetes_unbalanced_filtered.csv",
]

# Classifiers under comparison, keyed by the name shown in the report.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": GradientBoostingClassifier(random_state=42),
}

# Maps dataset name -> {model name -> [formatted F1, formatted accuracy]}.
all_results = {}

for file in dataset_files:
    df = pd.read_csv(file)

    # Key the results by the last '/'-separated path component (the file name).
    dataset_name = file.split('/')[-1]

    # "Diabetes_binary" is the target column; every other column is a feature.
    y = df["Diabetes_binary"]
    X = df.drop(columns=["Diabetes_binary"])

    # 80/20 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        random_state=42,
        test_size=0.2,
    )

    dataset_results = {}

    for model_name, model in models.items():
        # Bundle scaling with the classifier so the scaler is fitted on the
        # training split only and reused as-is when predicting.
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', model),
        ])

        # fit() returns the pipeline itself, so prediction can be chained.
        test_predictions = pipeline.fit(X_train, y_train).predict(X_test)

        accuracy = accuracy_score(y_test, test_predictions)
        f1 = f1_score(y_test, test_predictions, zero_division=1)
        dataset_results[model_name] = [
            f'F1 Score: {f1:.4f}',
            f'Accuracy: {accuracy:.4f}',
        ]

    all_results[dataset_name] = dataset_results

# Report: one block per dataset, one line per model, blank line between datasets.
for dataset_name, results in all_results.items():
    print(f"Results for {dataset_name}:")
    for model_name, metrics in results.items():
        print(f"  {model_name}: {metrics}")
    print()

Results for diabetes_balanced_filtered.csv:
  Logistic Regression: ['F1 Score: 0.7480', 'Accuracy: 0.7437']
  Decision Tree: ['F1 Score: 0.6544', 'Accuracy: 0.6681']
  Random Forest: ['F1 Score: 0.7126', 'Accuracy: 0.7062']
  XGBoost: ['F1 Score: 0.7608', 'Accuracy: 0.7505']

Results for diabetes_balanced_full.csv:
  Logistic Regression: ['F1 Score: 0.7531', 'Accuracy: 0.7484']
  Decision Tree: ['F1 Score: 0.6487', 'Accuracy: 0.6493']
  Random Forest: ['F1 Score: 0.7474', 'Accuracy: 0.7375']
  XGBoost: ['F1 Score: 0.7628', 'Accuracy: 0.7536']

Results for diabetes_unbalanced_full.csv:
  Logistic Regression: ['F1 Score: 0.2538', 'Accuracy: 0.8659']
  Decision Tree: ['F1 Score: 0.3150', 'Accuracy: 0.7984']
  Random Forest: ['F1 Score: 0.2561', 'Accuracy: 0.8607']
  XGBoost: ['F1 Score: 0.2662', 'Accuracy: 0.8675']

Results for diabetes_unbalanced_filtered.csv:
  Logistic Regression: ['F1 Score: 0.2390', 'Accuracy: 0.8650']
  Decision Tree: ['F1 Score: 0.2842', 'Accuracy: 0.8323']
  Rando