In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

diabetes_DF = pd.read_csv('../Data/diabetes.csv')
diabetes_DF

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Handling Outliers & Missing Values

Some rows with outliers are dropped while others are changed to the mean value of their respective column.

In [2]:
Error_BMI = diabetes_DF[diabetes_DF['BMI'] < 10].index 
Error_BloodPressure = diabetes_DF[diabetes_DF['BloodPressure'] < 5].index
Error_Glucose = diabetes_DF[diabetes_DF['Glucose'] < 1].index

In [3]:
Error_List = [Error_BloodPressure, Error_BMI, Error_Glucose] #Rows with values below a certain threshold are dropped for each listed column

for Error in Error_List:
    try:
        diabetes_DF.drop(Error, inplace = True)
    except KeyError:
        continue

In [4]:
Mean_SkinThickness = diabetes_DF['SkinThickness'].mean() # The mean is used to replace values below 1 and above 75
diabetes_DF.loc[diabetes_DF['SkinThickness'] < 1, 'SkinThickness'] = Mean_SkinThickness
diabetes_DF.loc[diabetes_DF['SkinThickness'] > 75, 'SkinThickness'] = Mean_SkinThickness
diabetes_DF = diabetes_DF.reset_index()
diabetes_DF

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148,72,35.000000,0,33.6,0.627,50,1
1,1,1,85,66,29.000000,0,26.6,0.351,31,0
2,2,8,183,64,21.388736,0,23.3,0.672,32,1
3,3,1,89,66,23.000000,94,28.1,0.167,21,0
4,4,0,137,40,35.000000,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...,...
723,763,10,101,76,48.000000,180,32.9,0.171,63,0
724,764,2,122,70,27.000000,0,36.8,0.340,27,0
725,765,5,121,72,23.000000,112,26.2,0.245,30,0
726,766,1,126,60,21.388736,0,30.1,0.349,47,1


## Scaling and Normalization

Separating the Outcome column and converting the DataFrame to an array to allow the features to be standardized.

In [5]:
column_list = diabetes_DF.columns
scaler = StandardScaler()

Outcome = diabetes_DF['Outcome']

Features = diabetes_DF.drop(labels = 'Outcome', axis = 1)

Features = Features.to_numpy()

In [6]:
Features = scaler.fit_transform(Features) # Numeric features are scaled in order to handle the imbalanced data

Features = pd.DataFrame(Features)

Features = pd.concat(objs = [Features, Outcome], axis = 1)

Features.columns = column_list

Features

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,-1.726305,0.635910,0.852184,-0.035402,0.866539,-0.720856,0.180300,0.449710,1.402331,1
1,-1.721813,-0.851694,-1.202736,-0.520148,0.218104,-0.720856,-0.782209,-0.375105,-0.202189,0
2,-1.717322,1.230951,1.993806,-0.681730,-0.604464,-0.720856,-1.235963,0.584190,-0.117741,1
3,-1.712830,-0.851694,-1.072265,-0.520148,-0.430331,0.084355,-0.575957,-0.924982,-1.046673,0
4,-1.708338,-1.149215,0.493389,-2.620717,0.866539,0.718244,1.486563,5.413541,-0.033292,1
...,...,...,...,...,...,...,...,...,...,...
723,1.700897,1.825992,-0.680851,0.287763,2.271481,0.821036,0.084050,-0.913028,2.500160,0
724,1.705389,-0.554173,0.004122,-0.196984,0.001959,-0.720856,0.620305,-0.407978,-0.539983,0
725,1.709881,0.338389,-0.028496,-0.035402,-0.430331,0.238544,-0.837209,-0.691882,-0.286637,0
726,1.714372,-0.851694,0.134593,-1.004895,-0.604464,-0.720856,-0.300954,-0.381082,1.148985,1


In [7]:
Features.to_csv('../Data/model_diabetes.csv')