In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/content/drive/MyDrive/AI/diabetes.csv')


In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
features_of_interest = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'Age', 'BMI']

# Count occurrences of "0" for each feature
zero_counts = df[features_of_interest].eq(0).sum()

# Display the counts
print(zero_counts)

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
Age                0
BMI               11
dtype: int64


In [6]:
import pandas as pd

# Assuming df is your original DataFrame
features_of_interest = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'Age', 'BMI']

# Create a new DataFrame to store the imputed values
df_imputed = df.copy()

# Impute missing values with the mean of their respective features
for feature in features_of_interest:
    mean_value = df[feature].mean()
    df_imputed[feature] = df_imputed[feature].replace(0, mean_value)

In [7]:
features_of_interest = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'Age', 'BMI']

# Count occurrences of "0" for each feature
zero_counts = df_imputed[features_of_interest].eq(0).sum()

# Display the counts
print(zero_counts)

Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
Age              0
BMI              0
dtype: int64


In [8]:
df_imputed.head(30)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,20.536458,79.799479,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,69.105469,20.536458,79.799479,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,20.536458,79.799479,31.992578,0.232,54,1


In [11]:
from sklearn.preprocessing import MinMaxScaler

# Assuming df_imputed is your DataFrame with imputed values
features_to_scale = [col for col in df_imputed.columns if col != 'Outcome']

# Separate features and target variable
X = df_imputed[features_to_scale]
y = df_imputed['Outcome']

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the scaler on features (excluding 'Outcome')
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with the scaled features and 'Outcome'
df_scaled = pd.DataFrame(X_scaled, columns=features_to_scale)
df_scaled['Outcome'] = y

# Display the scaled DataFrame
print(df_scaled.head(30))

    Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0      0.352941  0.670968       0.489796       0.304348  0.079086  0.314928   
1      0.058824  0.264516       0.428571       0.239130  0.079086  0.171779   
2      0.470588  0.896774       0.408163       0.147135  0.079086  0.104294   
3      0.058824  0.290323       0.428571       0.173913  0.096154  0.202454   
4      0.000000  0.600000       0.163265       0.304348  0.185096  0.509202   
5      0.294118  0.464516       0.510204       0.147135  0.079086  0.151329   
6      0.176471  0.219355       0.265306       0.271739  0.088942  0.261759   
7      0.588235  0.458065       0.460260       0.147135  0.079086  0.349693   
8      0.117647  0.987097       0.469388       0.413043  0.635817  0.251534   
9      0.470588  0.522581       0.734694       0.147135  0.079086  0.282057   
10     0.235294  0.425806       0.693878       0.147135  0.079086  0.396728   
11     0.588235  0.800000       0.510204       0.147

In [13]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Assuming df_scaled is your DataFrame with scaled and imputed values
features_to_scale = [col for col in df_scaled.columns if col != 'Outcome']

# Separate features and target variable
X = df_scaled[features_to_scale]
y = df_scaled['Outcome']

# Create an instance of RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Fit and apply oversampling to the data
X_resampled, y_resampled = ros.fit_resample(X, y)

# Create a DataFrame with the oversampled data
df_overfitted = pd.concat([pd.DataFrame(X_resampled, columns=features_to_scale), pd.Series(y_resampled, name='Outcome')], axis=1)

# Display the class distribution after oversampling
print("Class Distribution After Oversampling:")
print(df_overfitted['Outcome'].value_counts())

Class Distribution After Oversampling:
1    500
0    500
Name: Outcome, dtype: int64


In [16]:
df_overfitted.shape

(1000, 9)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming df_overfitted is your DataFrame with scaled, imputed, and oversampled values
features_to_scale = [col for col in df_overfitted.columns if col != 'Outcome']

# Separate features and target variable
X = df_overfitted[features_to_scale]
y = df_overfitted['Outcome']

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of KNeighborsClassifier with n_neighbors=13
knn_clf = KNeighborsClassifier(n_neighbors=13)

# Fit the model on the training data
knn_clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = knn_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.795

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78        99
           1       0.76      0.86      0.81       101

    accuracy                           0.80       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.80      0.80      0.79       200


Confusion Matrix:
[[72 27]
 [14 87]]
