In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/cleaned_diabetes_data.csv')
df

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...
995,M,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,Y
996,M,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,Y
997,M,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,Y
998,M,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,Y


In [3]:
# age_range_buckkets=["[{0}-{1}]".format(age,age+10) for age in range(20,100,10)]
# age_range_buckkets

# df['age_range'] = pd.cut(x=df['AGE'], bins=8,labels=age_range_buckkets)  # 5 automatic equal-width bins
# df['age_range'] 
# df['age_range'].value_counts()

#### 3. Handle Categorical Features:

The 'Gender' column is categorical.
Machine learning models require numerical input, so we will convert it using one-hot encoding.
We'll also drop the first category to prevent multicollinearity."

In [4]:
#get_dummies: cant used in pipeline(can use onehotencoder), output:dataframe
df=pd.get_dummies(df,columns=['Gender'],dtype=int)
df

Unnamed: 0,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS,Gender_F,Gender_M
0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N,1,0
1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N,0,1
2,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N,1,0
3,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N,1,0
4,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,Y,0,1
996,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,Y,0,1
997,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,Y,0,1
998,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,Y,0,1


In [5]:
df.drop('Gender_F',inplace=True,axis=1)
df

Unnamed: 0,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS,Gender_M
0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N,0
1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N,1
2,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N,0
3,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N,0
4,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,Y,1
996,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,Y,1
997,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,Y,1
998,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,Y,1


#### 4. Target Variable Encoding:

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['CLASS']=le.fit_transform(df['CLASS'])

In [7]:
df['CLASS'].value_counts()

CLASS
2    844
0    103
1     53
Name: count, dtype: int64

In [8]:
df

Unnamed: 0,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS,Gender_M
0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0,0
1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0,1
2,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0,0
3,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0,0
4,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,2,1
996,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,2,1
997,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,2,1
998,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,2,1


#### 5. Splitting the Data:

In [9]:
X=df.drop('CLASS',axis=1)
y=pd.DataFrame(df['CLASS'])


In [10]:
X.shape,y.shape

((1000, 11), (1000, 1))

We will now split the data into training and testing sets.
Because the target variable ('CLASS') is imbalanced, we use the stratify parameter to ensure that both the train and test sets have a proportional representation of each class

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)



In [12]:
X_train.shape, X_test.shape, y_train.shape,

((800, 11), (200, 11), (800, 1))

#### 6.Feature Scaing

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on training, transform both
X_train_scaled = scaler.fit_transform(X_train)  # fit + transform training data
X_test_scaled = scaler.transform(X_test)        # only transform test data


In [14]:
import joblib
joblib.dump(X_train_scaled, '../saved_models/X_train_scaled.pkl')
joblib.dump(X_test_scaled, '../saved_models/X_test_scaled.pkl')
joblib.dump(y_train, '../saved_models/y_train.pkl')
joblib.dump(y_test, '../saved_models/y_test.pkl')
joblib.dump(scaler, '../saved_models/scaler.pkl')  #
#joblib.dump(X_train_scaled, './data/X_train.pkl')


['../saved_models/scaler.pkl']