# Dependency

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Load dataset

In [5]:
df = pd.read_csv('E:\Projects\AI_playground\datasets\diabetes.csv')
df.head()

  df = pd.read_csv('E:\Projects\AI_playground\datasets\diabetes.csv')


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data preprocessing

In [7]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [11]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [12]:
noisy_values = [ '?', 'NA', 'NaN', 'nan', '', ' ', 'null', 'NULL', None, '!', '#', '$', '%', '&','*']
df.replace(noisy_values, np.nan, inplace=True)

In [14]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# dividing Data set in to dependent and independent variable

In [16]:
x = df.drop('Outcome', axis = 1)
y =df['Outcome']

# Standard Scaling of Data

In [17]:
sc = StandardScaler()
x_scaled = sc.fit_transform(x)

# Spliting Data in to training and testing set

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

# Define Models (Same Dataset, Fair Comparison)

In [21]:
model = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Cross Validation

In [22]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [23]:
result = []

for name, model in model.items():
    score = cross_val_score(model, x_train, y_train, cv=cv, scoring='accuracy')
    result.append({
        "Model": name,
        "Accuracy Mean": score.mean(),  
        "Std Dev": score.std()
    })

In [24]:
results_df = pd.DataFrame(result).sort_values(by='Accuracy Mean', ascending=False)
results_df

Unnamed: 0,Model,Accuracy Mean,Std Dev
2,Random Forest,0.768772,0.019184
0,Logistic Regression,0.76544,0.01453
3,SVM,0.763788,0.031743
5,KNN,0.747514,0.021735
4,Naive Bayes,0.737692,0.04588
1,Decision Tree,0.713421,0.036412


# Training model using best performing algorithm

In [25]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [27]:
y_pred = model.predict(x_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [28]:

from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred)

0.7532467532467533

In [29]:
confusion_matrix(y_test, y_pred)

array([[79, 20],
       [18, 37]])

# Saving Model

In [32]:
import joblib
joblib.dump(model, "E:\Projects\AI_playground\models\diabetes_model.pkl")

  joblib.dump(model, "E:\Projects\AI_playground\models\diabetes_model.pkl")


['E:\\Projects\\AI_playground\\models\\diabetes_model.pkl']

In [None]:
print('hello word')