In [80]:
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold,GridSearchCV,StratifiedKFold
    
# Load the dataset
df = pd.read_csv('diabetes.csv')
df


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [56]:
# Check the dimensions of the dataset
print("Shape of the dataset:", df.shape)


Shape of the dataset: (768, 9)


In [57]:

# Check for missing values
print("Missing values:\n", df.isnull().sum())


Missing values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [58]:
# Statistical summary of numerical features
print("Statistical summary:\n", df.describe())


Statistical summary:
        Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.00

In [59]:
# Replace missing values with mean
df.fillna(df.mean(), inplace=True)


In [60]:
# Split the dataset into features and target variable
X = df.drop('Outcome', axis=1)
y = df['Outcome']


In [81]:
from sklearn.feature_selection import mutual_info_classif
# Calculate mutual information gain for each feature
info_gain = mutual_info_classif(X, y)
info_gain

array([0.        , 0.11977819, 0.        , 0.01684259, 0.02940391,
       0.08062594, 0.00952434, 0.04282841])

In [82]:
# Drop features with low information gain
X = X.drop(['Pregnancies','BloodPressure'],axis=1)

In [84]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets (e.g., 70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [62]:
from sklearn.preprocessing import StandardScaler
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training set
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing set
X_test_scaled = scaler.transform(X_test)

In [63]:
# Import sklearn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [64]:
# Initialize models
logistic_regression = LogisticRegression()
random_forest = RandomForestClassifier()
svm = SVC()
ridge = RidgeClassifier()

In [85]:
# Train the models
logistic_regression.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
svm.fit(X_train, y_train)

In [88]:

# Predictions
logistic_regression_preds = logistic_regression.predict(X_test)
random_forest_preds = random_forest.predict(X_test)
svm_preds = svm.predict(X_test)

In [89]:
from sklearn.metrics import classification_report

# Evaluate performance
print("Logistic Regression:")
print(classification_report(y_test, logistic_regression_preds))

print("Random Forest:")
print(classification_report(y_test, random_forest_preds))

print("SVM:")
print(classification_report(y_test, svm_preds))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       151
           1       0.64      0.59      0.61        80

    accuracy                           0.74       231
   macro avg       0.71      0.70      0.71       231
weighted avg       0.74      0.74      0.74       231

Random Forest:
              precision    recall  f1-score   support

           0       0.80      0.78      0.79       151
           1       0.61      0.64      0.62        80

    accuracy                           0.73       231
   macro avg       0.70      0.71      0.71       231
weighted avg       0.73      0.73      0.73       231

SVM:
              precision    recall  f1-score   support

           0       0.76      0.88      0.82       151
           1       0.68      0.47      0.56        80

    accuracy                           0.74       231
   macro avg       0.72      0.68      0.69       231
weighted avg       0.73      0.74

In [68]:
# Define a function for tuning a model
def search(model,grid,X,y,cv=None):
    
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    #cv = StratifiedKFold(n_splits = 15, random_state = 10, shuffle = True)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
    grid_result = grid_search.fit(X, y)
    return grid_result

In [91]:
# Define a tuner function for Logistic Regression
def tune_logr(X,y):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    penalty = ['l1','l2','elasticnet']
    c_values = [200,100, 10, 1.0, 0.1, 0.01]
    # define grid search
    grid = dict(solver=solvers,penalty=penalty,C=c_values)
    cv = StratifiedKFold(n_splits = 100, random_state = 10, shuffle = True)
    return search(model, grid,X,y,cv=cv)

In [70]:
#Define a tuning function for Ridge Classifier
def tune_ridge(X,y):
    from sklearn.linear_model import RidgeClassifier
    model = RidgeClassifier()
    alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    # define grid search
    grid = dict(alpha=alpha)
    return search(model, grid,X,y)

In [71]:
# Tuning function for Knn Classifier
def tune_knn(X,y):
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    n_neighbors = range(1, 21, 2)
    weights = ['uniform', 'distance']
    metric = ['euclidean', 'manhattan', 'minkowski']
    # define grid search
    grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
    return search(model, grid,X,y)

In [72]:
# Tuner function for SVC
def tune_svc(X,y):
    from sklearn.svm import SVC
    model = SVC()
    kernel = ['linear','poly', 'rbf', 'sigmoid']
    C = [100, 10, 1.0, 0.1, 0.01]
    gamma = [0.1, 0.01, 0.001, 0.0001]
    # define grid search
    grid = dict(kernel=kernel,C=C,gamma=gamma)
    return search(model, grid,X,y)

In [73]:
# Tuner function for Random Forest Classifier
def tune_randomf(X,y):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()
    n_estimators = [10, 100,500,1000]
    max_features = ['sqrt', 'log2']
    # define grid search
    grid = dict(n_estimators=n_estimators,max_features=max_features)
    return search(model, grid,X,y)

In [74]:
# Function to evaluate model
def evaluate_hyperparams(model,xtrain,ytrain,xtest,ytest):
    from sklearn.metrics import classification_report
    model.fit(xtrain, ytrain)
    best_preds = model.predict(xtest) 
    return classification_report(ytest, best_preds)


In [75]:
# Tune svc
svm_grid_search = tune_svc(X_train_scaled, y_train) 
# Get best hyperparameters
best_params = svm_grid_search.best_params_
print("Best Hyperparameters:", best_params)
# Re-evaluate SVM with best hyperparameters
svm_best = svm_grid_search.best_estimator_
print('SVC classification report:')
print(evaluate_hyperparams(svm_best,X_train, y_train, X_test, y_test))


Best Hyperparameters: {'C': 100, 'gamma': 0.1, 'kernel': 'linear'}
SVC classification report:
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       151
           1       0.63      0.62      0.63        80

    accuracy                           0.74       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.74      0.74      0.74       231



In [76]:
# Tune random_forest
randomf_grid_search = tune_randomf(X_train_scaled, y_train) 
# Get best hyperparameters
best_params = randomf_grid_search.best_params_
print("Best Hyperparameters:", best_params)#Best Hyperparameters: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}

# Re-evaluate Random Forest  with best hyperparameters
svm_best = randomf_grid_search.best_estimator_
print('Random Forest classification report:')
print(evaluate_hyperparams(svm_best,X_train, y_train, X_test, y_test))


Best Hyperparameters: {'max_features': 'log2', 'n_estimators': 1000}
Random Forest classification report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.81       151
           1       0.64      0.68      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231



In [77]:
# Tune Ridge Classifier
from tuning import tune_ridge
randomf_grid_search = tune_ridge(X_train, y_train) 
# Get best hyperparameters
best_params = randomf_grid_search.best_params_
print("Best Hyperparameters:", best_params)#Best Hyperparameters: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}

# Re-evaluate RidgeClassifier  with best hyperparameters
svm_best = randomf_grid_search.best_estimator_
print('Ridge Classifier Classification Report:')
print(evaluate_hyperparams(svm_best,X_train, y_train, X_test, y_test))


Best Hyperparameters: {'alpha': 0.1}
Ridge Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       151
           1       0.62      0.60      0.61        80

    accuracy                           0.73       231
   macro avg       0.70      0.70      0.70       231
weighted avg       0.73      0.73      0.73       231



In [92]:
# Tune logistic regressor

from tuning import tune_logr
randomf_grid_search = tune_logr(X_train, y_train) 
# Get best hyperparameters
best_params = randomf_grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Re-evaluate LogisticRegression  with best hyperparameters
svm_best = randomf_grid_search.best_estimator_
print('LogisticRegression classification report:')
print(evaluate_hyperparams(svm_best,X_train, y_train, X_test, y_test))


750 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ADMIN\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ADMIN\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ADMIN\AppData\Roaming\Python\Python311\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.du

Best Hyperparameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
LogisticRegression classification report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       151
           1       0.66      0.57      0.61        80

    accuracy                           0.75       231
   macro avg       0.72      0.71      0.71       231
weighted avg       0.74      0.75      0.74       231



In [79]:
# Tune knn
from tuning import tune_knn
randomf_grid_search = tune_knn(X_train, y_train) 
# Get best hyperparameters
best_params = randomf_grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Re-evaluate KNeighborsClassifier  with best hyperparameters
svm_best = randomf_grid_search.best_estimator_
print('Evaluating KNeighborsClassifier on tuned hyperparameters ')
print(evaluate_hyperparams(svm_best,X_train, y_train, X_test, y_test))


Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 17, 'weights': 'distance'}
Evaluating KNeighborsClassifier on tuned hyperparameters 
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       151
           1       0.61      0.54      0.57        80

    accuracy                           0.72       231
   macro avg       0.69      0.68      0.68       231
weighted avg       0.71      0.72      0.71       231

