In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Modelling

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


## Import Dataframe 

In [4]:
df = pd.read_csv('data/diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Seperate dependent and Independent features


In [32]:
x= df.drop(['Outcome'], axis=1)
y= df['Outcome']
print(type(x))

<class 'pandas.core.frame.DataFrame'>


## Transform the data
## Standardize the data

In [None]:
# We have to create pipelines for different types of features

num_features = x.select_dtypes(exclude='object').columns
#cat_features = x.select_dtypes(include='object').columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [28]:
numeric_transformer = StandardScaler()
# oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [('Scaler',numeric_transformer,num_features)]
)

In [33]:
x = preprocessor.fit_transform(x)
x


array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]], shape=(768, 8))

In [36]:
# We will do train test split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42)

## Create an evalutaion function to give all metrics after model training


In [38]:
def evaluate_model (true , predicted):
    accuracy = accuracy_score(true,predicted)
    return accuracy

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "LogisticRegression": LogisticRegression(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier()
}

# Initialize lists to store results
model_list = []
accuracy_list = []

# Evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate accuracy
    accuracy = evaluate_model(y_test, y_test_pred)
    print(f"{model_name}: {accuracy:.4f}")

    # Store results
    model_list.append(model_name)
    accuracy_list.append(accuracy)

# Optionally, create a DataFrame for better visualization
import pandas as pd
results_df = pd.DataFrame({
    'Model': model_list,
    'Accuracy': accuracy_list
})

print(results_df)


LogisticRegression: 0.7402
AdaBoostClassifier: 0.7598
KNeighborsClassifier: 0.6969
DecisionTreeClassifier: 0.6929
RandomForestClassifier: 0.7559
                    Model  Accuracy
0      LogisticRegression  0.740157
1      AdaBoostClassifier  0.759843
2    KNeighborsClassifier  0.696850
3  DecisionTreeClassifier  0.692913
4  RandomForestClassifier  0.755906


In [41]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Example for one model
y_pred = model.predict(X_test)

print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# For ROC AUC, if it's a binary classification problem
try:
    print("ROC AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
except AttributeError:
    print("ROC AUC: Model does not support predict_proba.")


Precision: 0.6363636363636364
Recall: 0.6511627906976745
F1 Score: 0.6436781609195402
Confusion Matrix:
 [[136  32]
 [ 30  56]]
ROC AUC: 0.8082779623477299
