In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px

In [18]:
# Read the raw diabetes table (path is relative to the working directory)
# and preview its first five rows.
diabetes = pd.read_csv(filepath_or_buffer='data/diabetes_data.csv')
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [19]:
# Flag every row that fully repeats an earlier row (the first occurrence is
# not flagged), then count the flagged rows.
mask = diabetes.duplicated(keep='first')
diabetes_duplicates = diabetes.loc[mask]
len(diabetes_duplicates)

10

In [20]:
# Keep only the first occurrence of each fully duplicated row.
diabetes_data = diabetes.drop_duplicates(keep='first')

In [21]:
# Number of rows left in the working frame.
len(diabetes_data)

768

In [22]:
# A column is treated as low-information when a single value fills more than
# 95% of its non-null rows, or when more than 95% of its non-null rows hold
# distinct values.
low_information_cols = []
for col in diabetes_data.columns:
    top_freq = diabetes_data[col].value_counts(normalize=True).max()
    nunique_ratio = diabetes_data[col].nunique() / diabetes_data[col].count()

    # One combined test: a column that satisfies both criteria (e.g. a column
    # with a single non-null value) must be listed once, not twice.
    if top_freq > 0.95 or nunique_ratio > 0.95:
        low_information_cols.append(col)

low_information_cols

['Gender']

In [24]:
# 'Gender' is the only column the low-information check flagged.
diabetes_data = diabetes_data.drop(columns=['Gender'])

In [25]:
# Preview the first five rows of the working frame.
diabetes_data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,33,190,34.0,0.43,43,0
1,2,112,75,32,0,35.7,0.148,21,0
2,2,108,64,0,0,30.8,0.158,21,0
3,8,107,80,0,0,24.6,0.856,34,0
4,7,136,90,0,0,29.9,0.21,50,0


In [26]:
# Fraction of missing values in each column.
diabetes_data.isna().mean()

Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64

In [29]:
# In these columns a zero is replaced with NaN so that it is counted as a
# missing value from here on.
cols_list = ['Glucose', 'BloodPressure','SkinThickness', 'Insulin', 'BMI']
for col in diabetes_data.columns[diabetes_data.columns.isin(cols_list)]:
    diabetes_data[col] = diabetes_data[col].replace(to_replace=0, value=np.nan)

# Fraction of missing values per column after the replacement.
diabetes_data.isna().mean()

Pregnancies                 0.000000
Glucose                     0.006510
BloodPressure               0.045573
SkinThickness               0.295573
Insulin                     0.486979
BMI                         0.014323
DiabetesPedigreeFunction    0.000000
Age                         0.000000
Outcome                     0.000000
dtype: float64

In [30]:
# Preview the first five rows of the working frame.
diabetes_data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,190.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,,35.7,0.148,21,0
2,2,108.0,64.0,,,30.8,0.158,21,0
3,8,107.0,80.0,,,24.6,0.856,34,0
4,7,136.0,90.0,,,29.9,0.21,50,0


In [33]:
# Relative frequency of each non-missing Insulin value.
diabetes_data.loc[:, 'Insulin'].value_counts(normalize=True)

105.0    0.027919
140.0    0.022843
130.0    0.022843
120.0    0.020305
180.0    0.017766
           ...   
485.0    0.002538
370.0    0.002538
342.0    0.002538
65.0     0.002538
235.0    0.002538
Name: Insulin, Length: 185, dtype: float64

In [34]:
# Keep a column only if at least 70% of its rows are non-null.
thresh = 0.7 * len(diabetes_data)
diabetes_data = diabetes_data.dropna(axis='columns', thresh=thresh)

In [35]:
# Number of columns left in the working frame.
len(diabetes_data.columns)

8

In [36]:
# Keep a row only if at least m-2 of its m columns are non-null,
# i.e. drop rows with more than two missing values.
m = len(diabetes_data.columns)
diabetes_data = diabetes_data.dropna(axis='index', thresh=m - 2)

In [37]:
# Number of rows left in the working frame.
len(diabetes_data)

761

In [38]:
# Fraction of missing values in each column.
diabetes_data.isna().mean()

Pregnancies                 0.000000
Glucose                     0.006570
BloodPressure               0.036794
SkinThickness               0.289093
BMI                         0.005256
DiabetesPedigreeFunction    0.000000
Age                         0.000000
Outcome                     0.000000
dtype: float64

In [40]:
# Fill the remaining gaps in each listed column with that column's median.
values = {
    name: diabetes_data[name].median()
    for name in ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI')
}
diabetes_data = diabetes_data.fillna(value=values)

# Fraction of missing values per column after imputation.
diabetes_data.isna().mean()

Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64

In [41]:
# Mean of SkinThickness in the working frame.
diabetes_data.loc[:, 'SkinThickness'].mean()

29.109067017082786

In [42]:
def outliers_iqr_mod(data, feature, left=1.5, right=1.5, log_scale=False):
    """Split ``data`` into outliers and non-outliers of ``feature`` by the IQR rule.

    The fences are ``Q1 - left * IQR`` and ``Q3 + right * IQR``, where Q1 and Q3
    are the 0.25 and 0.75 quantiles of the (optionally log-transformed) feature.

    Args:
        data: DataFrame holding the observations.
        feature: Name of the column in ``data`` to test.
        left: Number of IQRs below Q1 at which the lower fence sits.
        right: Number of IQRs above Q3 at which the upper fence sits.
        log_scale: If True, fences and comparisons use ``np.log(data[feature])``.
            No offset is added, so a zero maps to ``-inf``.

    Returns:
        Tuple ``(outliers, cleaned)``: rows strictly outside the fences, and
        rows inside them with the fences included. Rows whose feature value is
        NaN fail every comparison and appear in neither frame.
    """
    x = np.log(data[feature]) if log_scale else data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - iqr * left
    upper_bound = quartile_3 + iqr * right
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive comparisons: a value lying exactly on a fence is not an
    # outlier, so it belongs in `cleaned` instead of vanishing from both frames.
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [43]:
# IQR rule with the default 1.5 * IQR fences on SkinThickness; show the outlier count.
outliers, cleaned = outliers_iqr_mod(data=diabetes_data, feature='SkinThickness')
len(outliers)

87

In [44]:
def outliers_z_score_mod(data, feature, log_scale=False, left=3, right=3):
    """Split ``data`` into outliers and non-outliers of ``feature`` by the sigma rule.

    The bounds are ``mean - left * std`` and ``mean + right * std`` of the
    (optionally log-transformed) feature.

    Args:
        data: DataFrame holding the observations.
        feature: Name of the column in ``data`` to test.
        log_scale: If True, bounds and comparisons use ``np.log(data[feature] + 1)``.
        left: Number of standard deviations below the mean for the lower bound.
        right: Number of standard deviations above the mean for the upper bound.

    Returns:
        Tuple ``(outliers, cleaned)``: rows strictly outside the bounds, and
        rows inside them with the bounds included. Rows whose feature value is
        NaN fail every comparison and appear in neither frame.
    """
    x = np.log(data[feature] + 1) if log_scale else data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive comparisons: a value lying exactly on a bound is not an
    # outlier, so it belongs in `cleaned` instead of vanishing from both frames.
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [45]:
# Sigma rule with the default 3-sigma bounds on SkinThickness; show the outlier count.
outliers, cleaned = outliers_z_score_mod(data=diabetes_data, feature='SkinThickness')
len(outliers)

4

In [46]:
# IQR rule on the raw DiabetesPedigreeFunction values; show the outlier count.
outliers, cleaned = outliers_iqr_mod(
    data=diabetes_data,
    feature='DiabetesPedigreeFunction',
)
len(outliers)

29

In [47]:
# Same IQR check, with the fences computed on the log of the feature.
outliers, cleaned = outliers_iqr_mod(
    data=diabetes_data,
    feature='DiabetesPedigreeFunction',
    log_scale=True,
)
len(outliers)

0