In [164]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

In [58]:
# Load data
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [59]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [60]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [61]:
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in feature_columns:
    print(f"{column} --> Missing value: {len(df.loc[df[column]==0])}")

Pregnancies --> Missing value: 111
Glucose --> Missing value: 5
BloodPressure --> Missing value: 35
SkinThickness --> Missing value: 227
Insulin --> Missing value: 374
BMI --> Missing value: 11
DiabetesPedigreeFunction --> Missing value: 0
Age --> Missing value: 0


In [62]:
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values = 0, strategy = 'mean', copy = False)
df[feature_columns] = fill_values.fit_transform(df[feature_columns])

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [63]:
X = df[feature_columns]
y = df.Outcome

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [98]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [102]:
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],  # regularization strength
    'penalty': ['l1', 'l2', 'elasticnet'],  # regularization types
    'solver': ['saga'],  # solver for handling L1 regularization
    'l1_ratio': [0.1, 0.5, 0.9]  # ratio for elasticnet
}
lr = LogisticRegression(max_iter=1000)
grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5)
grid_search_lr.fit(X_train_scaled,y_train)

print("Best parameters for Logistic Regression: ", grid_search_lr.best_params_)
print("Best accuracy for Logistic Regression: ", grid_search_lr.best_score_)



Best parameters for Logistic Regression:  {'C': 0.1, 'l1_ratio': 0.9, 'penalty': 'elasticnet', 'solver': 'saga'}
Best accuracy for Logistic Regression:  0.7719978675196588


In [148]:
model_lr = LogisticRegression(max_iter=1000,C= 0.1, l1_ratio= 0.9, penalty= 'elasticnet', solver= 'saga')
model_lr.fit(X_train_scaled, y_train)

y_pred_lr = model_lr.predict(X_test_scaled)
acc_lr = accuracy_score(y_test, y_pred_lr)

print("Test set accuracy: {:.2f}".format(acc_lr))
print(f"Test set accuracy: {acc_lr}")

Test set accuracy: 0.78
Test set accuracy: 0.7792207792207793


In [150]:
param_grid_svc = {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4],  # degree of polynomial kernel
    'kernel': ['poly'],  # use polynomial kernel
    'coef0': [0.0, 0.5, 1.0],  # independent term in poly kernel
}
svm = SVC()
grid_search_svm = GridSearchCV(svm, param_grid_svc,cv=5)
grid_search_svm.fit(X_train_scaled,y_train)

print("Best parameters for SVM Polynomial: ", grid_search_svm.best_params_)
print("Best accuracy for SVM Polynomial: ", grid_search_svm.best_score_)

Best parameters for SVM Polynomial:  {'C': 1, 'coef0': 0.5, 'degree': 3, 'kernel': 'poly'}
Best accuracy for SVM Polynomial:  0.7687458349993336


In [151]:
model_svm = SVC(C= 1, coef0= 0.5, degree= 3, kernel= 'poly')
model_svm.fit(X_train_scaled, y_train)

y_pred_svm = model_svm.predict(X_test_scaled)
acc_svm = accuracy_score(y_test, y_pred_svm)

print("Test set accuracy: {:.2f}".format(acc_svm))
print(f"Test set accuracy: {acc_svm}")

Test set accuracy: 0.75
Test set accuracy: 0.7467532467532467


In [156]:
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']  # impurity criterion
}
dt = DecisionTreeClassifier()
grid_search_dt = GridSearchCV(dt,param_grid_dt,cv=5)
grid_search_dt.fit(X_train_scaled, y_train)

print("Best parameters for Decision Tree: ", grid_search_dt.best_params_)
print("Best score for Decision Tree: ", grid_search_dt.best_score_)

Best parameters for Decision Tree:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best score for Decision Tree:  0.7198320671731308


In [162]:
model_dt = DecisionTreeClassifier(criterion= 'gini', max_depth= 10, min_samples_leaf= 4, min_samples_split= 10)
model_dt.fit(X_train_scaled,y_train)

y_pred_dt = model_dt.predict(X_test_scaled)
acc_dt = accuracy_score(y_test, y_pred_dt)

print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403


In [186]:
clf1 = LogisticRegression(max_iter=1000,C= 0.1, l1_ratio= 0.9, penalty= 'elasticnet', solver= 'saga')
clf2 = SVC(C= 1, coef0= 0.5, degree= 3, kernel= 'poly')
clf3 = DecisionTreeClassifier(criterion= 'gini', max_depth= 10, min_samples_leaf= 4, min_samples_split= 10)

voting = VotingClassifier(estimators=[('Logistic regression', clf1),('SVM Polynomial',clf2),('Decision Tree',clf3)],voting ='hard')
voting.fit(X_train_scaled, y_train)

y_pred_vt = voting.predict(X_test_scaled)
acc_vt = accuracy_score(y_test, y_pred_vt)

print('Hard Voting')
print('Test set accuracy: {:.2f}'.format(acc_vt))
print(f'Test set accuracy: {acc_vt}')

Hard Voting
Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403
