In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [20]:
data= pd.read_csv("../Assignment 2_diabetes_classification/diabetes_binary_classification_data.csv")

In [21]:
##Checks whats in the dataset
data.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [24]:
#checks duplicated in dataset
# There is 24206 duplicates
data.duplicated().sum()

24206

In [26]:
#Remove the duplicates
#data = data.drop_duplicates()

In [32]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define features (X) and target (y)
X = data.drop('Diabetes_binary', axis=1)
y = data['Diabetes_binary']

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)  # 70% train
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% validation, 15% test

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

# Create preprocessing pipelines for numeric and categorical features
numeric_features = ['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Income']  # Non-binary variables
categorical_features = ['GenHlth', 'Education', 'HighBP', 'HighChol', 
                        'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 
                        'PhysActivity', 'Fruits', 'Veggies', 
                        'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 
                        'DiffWalk', 'Sex']  # Binary variables

# Create preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess training data (numerical and categorical features)
X_train_processed = preprocessor.fit_transform(X_train)

# Preprocess validation and test data
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

# Define models
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(max_depth=10)
random_forest = RandomForestClassifier(n_estimators=50, max_depth=10)

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('decision_tree', decision_tree),
    ('random_forest', random_forest)],
    voting='hard')  # 'hard' for majority voting

# Fit models on the processed training data
log_reg.fit(X_train_processed, y_train)
decision_tree.fit(X_train_processed, y_train)
random_forest.fit(X_train_processed, y_train)
voting_clf.fit(X_train_processed, y_train)

# Make predictions on the validation set
log_reg_val_pred = log_reg.predict(X_val_processed)
decision_tree_val_pred = decision_tree.predict(X_val_processed)
random_forest_val_pred = random_forest.predict(X_val_processed)
voting_val_pred = voting_clf.predict(X_val_processed)

# Evaluate models on validation data and print results
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_val_pred, decision_tree_val_pred, random_forest_val_pred, voting_val_pred]
):
    accuracy = accuracy_score(y_val, predictions)
    print(f"{model_name} Validation Accuracy: {accuracy:.4f}")
    print(classification_report(y_val, predictions))

# Make predictions on the test set
log_reg_test_pred = log_reg.predict(X_test_processed)
decision_tree_test_pred = decision_tree.predict(X_test_processed)
random_forest_test_pred = random_forest.predict(X_test_processed)
voting_test_pred = voting_clf.predict(X_test_processed)

# Evaluate models on test data and print results
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_test_pred, decision_tree_test_pred, random_forest_test_pred, voting_test_pred]
):
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Test Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, predictions))


Training set size: 177576
Validation set size: 38052
Test set size: 38052
Logistic Regression Validation Accuracy: 0.8637
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     32747
         1.0       0.54      0.15      0.24      5305

    accuracy                           0.86     38052
   macro avg       0.71      0.57      0.58     38052
weighted avg       0.83      0.86      0.83     38052

Decision Tree Validation Accuracy: 0.8628
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     32747
         1.0       0.53      0.15      0.24      5305

    accuracy                           0.86     38052
   macro avg       0.70      0.57      0.58     38052
weighted avg       0.83      0.86      0.83     38052

Random Forest Validation Accuracy: 0.8645
              precision    recall  f1-score   support

         0.0       0.87      0.99      0.93     32747
         1.0       0.60      0.

In [165]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Example dataset loading (replace with actual data file)
# data = pd.read_csv('../Assignment2_diabetes_classification/diabetes_binary_classification_data.csv')

# Define features (X) and target (y)
X = data.drop('Diabetes_binary', axis=1)
y = data['Diabetes_binary']

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)  # 70% train
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% validation, 15% test

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

# Create preprocessing pipelines for numeric and categorical features
numeric_features = ['BMI', 'MentHlth', 'PhysHlth','Age','Income']  # Non-binary variables
categorical_features = ['GenHlth', 'Education', 'HighBP', 'HighChol', 'CholCheck', 
                        'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 
                        'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 
                        'NoDocbcCost', 'DiffWalk', 'Sex']  # Binary variables

# Create preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Preprocess training data (numerical and categorical features)
X_train_processed = preprocessor.fit_transform(X_train)

# Retrieve the column names from preprocessor (numeric + one-hot encoded features)
processed_columns = numeric_features + list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features))

# Convert the processed NumPy array back into a DataFrame with column names
X_train_processed_df = pd.DataFrame(X_train_processed, columns=processed_columns)

# Add the target variable to the DataFrame
X_train_processed_df['Diabetes_binary'] = y_train.reset_index(drop=True)

# Perform correlation analysis
correlation_matrix = X_train_processed_df.corr()
sorted_correlations = correlation_matrix['Diabetes_binary'].sort_values(ascending=False)

print("Top correlations with Diabetes_binary:")
print(sorted_correlations)

# Define models
log_reg = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=1000))])

decision_tree = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', DecisionTreeClassifier(max_depth=10))])

random_forest = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(n_estimators=50, max_depth=10))])

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('decision_tree', decision_tree),
    ('random_forest', random_forest)],
    voting='hard')  # 'hard' for majority voting

# Fit models on the training data
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
voting_clf.fit(X_train, y_train)

# Make predictions on the validation set
log_reg_val_pred = log_reg.predict(X_val)
decision_tree_val_pred = decision_tree.predict(X_val)
random_forest_val_pred = random_forest.predict(X_val)
voting_val_pred = voting_clf.predict(X_val)

# Evaluate models on validation data and print results
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_val_pred, decision_tree_val_pred, random_forest_val_pred, voting_val_pred]
):
    accuracy = accuracy_score(y_val, predictions)
    print(f"{model_name} Validation Accuracy: {accuracy:.4f}")
    print(classification_report(y_val, predictions))

# Make predictions on the test set
log_reg_test_pred = log_reg.predict(X_test)
decision_tree_test_pred = decision_tree.predict(X_test)
random_forest_test_pred = random_forest.predict(X_test)
voting_test_pred = voting_clf.predict(X_test)

# Evaluate models on test data and print results
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_test_pred, decision_tree_test_pred, random_forest_test_pred, voting_test_pred]
):
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Test Accuracy: {accuracy:.4f}")


Training set size: 177576
Validation set size: 38052
Test set size: 38052
Top correlations with Diabetes_binary:
Diabetes_binary             1.000000
HighBP_1.0                  0.263531
DiffWalk_1.0                0.221507
BMI                         0.218647
HighChol_1.0                0.197866
GenHlth_4.0                 0.186045
HeartDiseaseorAttack_1.0    0.179718
Age                         0.176577
PhysHlth                    0.171694
GenHlth_5.0                 0.156761
PhysActivity_0.0            0.118985
Stroke_1.0                  0.107874
GenHlth_3.0                 0.071468
MentHlth                    0.070958
CholCheck_1.0               0.065421
Education_4.0               0.062446
Smoker_1.0                  0.062066
Education_3.0               0.058617
Education_2.0               0.058124
Veggies_0.0                 0.057552
HvyAlcoholConsump_0.0       0.057065
Fruits_0.0                  0.039896
NoDocbcCost_1.0             0.033110
Sex_1.0                     0.032150