## **1 Data Structuring and Cleaning.**

### 1.1 Data availability

In [4]:
import pandas as pd
# Load the diabetes prediction dataset from a CSV file in the working directory.
df=pd.read_csv("diabetes_prediction_dataset.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [5]:
# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(100000, 9)

In [7]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [8]:
# Missing values per column (isna() and isnull() are aliases of each other).
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [9]:
# Number of rows that exactly repeat an earlier row
# (duplicated() defaults to keep="first", so the first occurrence is not counted).
df.duplicated().sum()

np.int64(3854)

In [10]:
# Collect the repeated rows for inspection. duplicated() flags only the second
# and later occurrences of each row, so the first occurrence is not included.
duplicate_rows = df[df.duplicated()]

# End on the bare name instead of print(): the notebook then renders one
# table rather than a text dump that wraps and splits the columns.
duplicate_rows

       gender   age  hypertension  heart_disease smoking_history    bmi  \
2756     Male  80.0             0              0         No Info  27.32   
3272   Female  80.0             0              0         No Info  27.32   
3418   Female  19.0             0              0         No Info  27.32   
3939   Female  78.0             1              0          former  27.32   
3960     Male  47.0             0              0         No Info  27.32   
...       ...   ...           ...            ...             ...    ...   
99980  Female  52.0             0              0           never  27.32   
99985    Male  25.0             0              0         No Info  27.32   
99989  Female  26.0             0              0         No Info  27.32   
99990    Male  39.0             0              0         No Info  27.32   
99995  Female  80.0             0              0         No Info  27.32   

       HbA1c_level  blood_glucose_level  diabetes  
2756           6.6                  159        

In [11]:
# Keep an untouched snapshot of the loaded data before any cleaning starts.
# .copy() builds an independent object, so later changes to `df` do not show
# up in `df1`; a plain `df1 = df` would only create a second name for the
# same DataFrame.
df1 = df.copy(deep=True)

In [12]:
# Remove the exact-duplicate rows, keeping the first occurrence of each.
# inplace=True mutates `df` itself; the surviving rows keep their original
# index labels (the index is not renumbered).
df.drop_duplicates(inplace=True)

In [13]:
# Re-check after the drop: the count of repeated rows should now be zero.
df.duplicated().sum()

np.int64(0)

In [14]:
# Distinct smoking_history categories, in order of first appearance.
Unique_smoking_history = df.smoking_history.unique()
Unique_smoking_history

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [15]:
# Distinct gender categories present before encoding.
df["gender"].unique()

array(['Female', 'Male', 'Other'], dtype=object)

## **2 Data Transformation**

In [16]:
# Label-encode the two text columns into integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column. Re-fitting a single shared encoder discards the
# mapping learned for the first column, which is needed later to encode new
# inputs consistently or to decode predictions back to text.
gender_encoder = LabelEncoder()
smoking_encoder = LabelEncoder()

# LabelEncoder assigns codes in sorted order of the class labels
# (inspect `gender_encoder.classes_` / `smoking_encoder.classes_`).
df["gender"] = gender_encoder.fit_transform(df["gender"])
df["smoking_history"] = smoking_encoder.fit_transform(df["smoking_history"])

# `lb` stays bound to the encoder fitted last (smoking_history), as before,
# for any code that still refers to that name.
lb = smoking_encoder

In [17]:
# Integer codes now stored in smoking_history after label encoding.
df["smoking_history"].unique()

array([4, 0, 1, 3, 2, 5])

In [18]:
# Integer codes now stored in gender after label encoding.
df["gender"].unique()

array([0, 1, 2])

# **3 Train/Test Split**

In [19]:
# Separate the predictors from the label.
# `diabetes` is the last column of the frame and is the dependent variable;
# the eight columns before it are the independent features.
# (Positional equivalent: x = df.iloc[:, :-1]; y = df.iloc[:, -1])

x = df.drop(columns="diabetes")
y = df["diabetes"]



In [20]:
# value_counts() tallies how many rows carry each distinct value of the target.
# This is meaningful here because `diabetes` is categorical (0 / 1); on a continuous column the tally is rarely useful.
y.value_counts()

diabetes
0    87664
1     8482
Name: count, dtype: int64

In [21]:
# Count how often each distinct combination of feature values occurs.
# The result is a Series, so it gets its own name: `df` must keep referring
# to the cleaned DataFrame for any cell that is run (or re-run) afterwards.
feature_row_counts = x.value_counts()
feature_row_counts

gender  age   hypertension  heart_disease  smoking_history  bmi    HbA1c_level  blood_glucose_level
0       61.0  0             0              5                27.32  5.7          200                    2
        48.0  0             0              1                27.32  6.5          200                    2
        62.0  0             0              4                27.32  6.0          126                    2
        45.0  0             0              0                27.32  6.0          126                    2
        56.0  0             0              0                27.32  6.2          126                    2
                                                                                                      ..
1       80.0  1             1              4                28.29  6.2          140                    1
                                                            29.47  6.2          240                    1
                                                            

In [22]:
# Hold out 20% of the rows for evaluation; the model is developed on the
# remaining 80% and then scored on the held-out part.
# stratify=y keeps the 0/1 ratio of the target the same in both parts, and
# random_state pins the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaling statistics are learned from the
# training rows only and then reused, unchanged, on the test rows.
scaler = StandardScaler()
scaler.fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_test_scaled = scaler.transform(x_test)

print(f"Training set shape: {x_train.shape}, Test set shape: {x_test.shape}")

Training set shape: (76916, 8), Test set shape: (19230, 8)


In [23]:
# Class balance of the training target. The split was stratified on y, so the 0/1 ratio mirrors the full dataset.
# x_train and y_train come from the same split call, so each feature row stays paired with its own label.
y_train.value_counts()

diabetes
0    70130
1     6786
Name: count, dtype: int64

# **4 Model Training**

In [24]:
from imblearn.over_sampling import SMOTE

# The split cell already fitted `scaler` on x_train and produced
# X_train_scaled / X_test_scaled, so the scaler is not re-imported,
# re-created and re-fitted here; a second fit on the same rows would only
# reproduce the same arrays.

# Oversample the minority class on the training data only, so the test set
# keeps the real-world class balance.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# NOTE(review): the classifier cells in this notebook call
# fit(x_train, y_train), so the resampled set built here does not appear to be
# consumed by any model — confirm whether that is intended.
# NOTE(review): SMOTE interpolates between rows numerically, including the
# integer codes of gender / smoking_history; SMOTENC may be a better fit for
# mixed categorical data — verify.

# Display original and resampled class distributions
print(f"Original training set size:\n{y_train.value_counts()}")
print(f"Resampled training set size:\n{y_train_resampled.value_counts()}")

Original training set size:
diabetes
0    70130
1     6786
Name: count, dtype: int64
Resampled training set size:
diabetes
0    70130
1    70130
Name: count, dtype: int64


In [25]:
# The target `diabetes` is categorical (0 / 1), so this is a classification
# problem: decision trees and random forests are suitable. Regression models
# are not, with the exception of logistic regression, which is a classifier
# despite its name.

# Algorithm 1: decision tree
from sklearn.tree import DecisionTreeClassifier

# random_state fixes the feature permutation used when searching for splits,
# so restarting the kernel and re-running yields the same tree and the same
# metrics.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)


In [26]:
# Score the fitted decision tree on the held-out test rows.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# y_predict holds one predicted label per row of x_test
# (19,230 rows according to the split cell's printed shape).
y_predict = model.predict(x_test)

# Confusion matrix, overall accuracy, then per-class precision / recall / F1.
print(
    confusion_matrix(y_true=y_test, y_pred=y_predict),
    accuracy_score(y_true=y_test, y_pred=y_predict),
    classification_report(y_true=y_test, y_pred=y_predict),
    sep="\n",
)

[[16952   582]
 [  436  1260]]
0.947061882475299
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     17534
           1       0.68      0.74      0.71      1696

    accuracy                           0.95     19230
   macro avg       0.83      0.85      0.84     19230
weighted avg       0.95      0.95      0.95     19230



In [27]:
# Algorithm 2: random forest with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier

# Bootstrap sampling and per-split feature selection are random; fixing
# random_state makes the forest, and the metrics below, repeatable.
model1 = RandomForestClassifier(random_state=42).fit(x_train,y_train)
y_predict = model1.predict(x_test)   # one predicted label per test row (19,230 rows)
# accuracy_score, confusion_matrix and classification_report are already in
# scope from the decision-tree evaluation cell.
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))


[[17474    60]
 [  524  1172]]
0.9696307852314092
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     17534
           1       0.95      0.69      0.80      1696

    accuracy                           0.97     19230
   macro avg       0.96      0.84      0.89     19230
weighted avg       0.97      0.97      0.97     19230



In [28]:
# Algorithm 3: a second random forest, restricted to 4 trees of depth <= 15.
# Rebinding `model1` replaces the default forest fitted in the previous cell.
from sklearn.ensemble import RandomForestClassifier

# With only four trees the result is especially sensitive to the random
# draws, so random_state is fixed to make the metrics repeatable.
model1 = RandomForestClassifier(n_estimators=4,max_depth=15,random_state=42).fit(x_train,y_train)
y_predict = model1.predict(x_test)   # one predicted label per test row (19,230 rows)
# accuracy_score, confusion_matrix and classification_report are already in
# scope from the decision-tree evaluation cell.
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))


[[17483    51]
 [  521  1175]]
0.9702548101924077
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     17534
           1       0.96      0.69      0.80      1696

    accuracy                           0.97     19230
   macro avg       0.96      0.84      0.89     19230
weighted avg       0.97      0.97      0.97     19230



# **5 Testing the model on new data and saving the model**

In [29]:
# One hand-written patient record for a smoke test of the fitted model.
import numpy as np

# Values follow the training column order:
#   gender, age, hypertension, heart_disease, smoking_history, bmi,
#   HbA1c_level, blood_glucose_level
# gender and smoking_history must use the label-encoded integers
# (here 1 = Male and 4 = never, per the encoded unique() outputs).
aravind = np.array([[1,25,0,0,4,25.6,4.2,72]])

# `model` was fitted on a DataFrame and therefore recorded its column names.
# Wrapping the sample with those names makes predict() receive matching
# features instead of emitting sklearn's "X does not have valid feature
# names" warning for a bare array.
aravind_preditct = model.predict(
    pd.DataFrame(aravind, columns=model.feature_names_in_)
)
aravind_preditct



array([0])

In [30]:
# Persist the fitted model so it can be shared or deployed (e.g. uploaded to
# a cloud service such as AWS or Azure).
import pickle

# `model` is the decision tree fitted earlier. The context manager guarantees
# the file is flushed and closed; passing a bare open(...) to pickle.dump
# leaves the handle open until garbage collection.
with open("diabetes_model.pkl","wb") as file:
    pickle.dump(model, file)

# Round-trip check: reload the file and predict on the same sample.
# Only unpickle files from a trusted source — pickle.load can execute
# arbitrary code embedded in the file.
with open("diabetes_model.pkl","rb") as file:
  Brain_model = pickle.load(file)

aravind_preditct = Brain_model.predict(aravind)
aravind_preditct



array([0])