In [208]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px

In [209]:
diabetes = pd.read_csv('data/diabetes_data.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [210]:
dupl_columns = list(diabetes.columns)

#mask = diabetes.duplicated(subset=dupl_columns)
#diabetes_dupl = diabetes[mask]

diabetes = diabetes.drop_duplicates(subset=dupl_columns)
print(diabetes.shape[0])

768


In [211]:
def low_informative_feature(data):
    low_information_cols = []
    for col in data.columns:
        top_freq = data[col].value_counts(normalize=True).max()
        nunique_ratio = data[col].nunique() / data[col].count()
        if top_freq > 0.95:
            low_information_cols.append(col)
        if nunique_ratio > 0.95:
            low_information_cols.append(col)
    return low_information_cols
low_information_cols = low_informative_feature(diabetes)
print(low_information_cols)

['Gender']


In [212]:
diabetes = diabetes.drop(low_information_cols, axis=1)
display(diabetes.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 60.0 KB


None

In [213]:
diabetes['Glucose'] = diabetes['Glucose'].apply(lambda x: np.nan if x == 0 else x)
diabetes['BloodPressure'] = diabetes['BloodPressure'].apply(lambda x: np.nan if x == 0 else x)
diabetes['SkinThickness'] = diabetes['SkinThickness'].apply(lambda x: np.nan if x == 0 else x)
diabetes['Insulin'] = diabetes['Insulin'].apply(lambda x: np.nan if x == 0 else x)
diabetes['BMI'] = diabetes['BMI'].apply(lambda x: np.nan if x == 0 else x)

In [214]:
round(diabetes['Insulin'].isnull().mean(), 2)

0.49

In [215]:
thresh = diabetes.shape[0] * 0.7
diabetes = diabetes.dropna(how='any', thresh=thresh, axis=1)
display(diabetes.shape[1])

8

In [216]:
thresh = diabetes.shape[1] - 2
diabetes = diabetes.dropna(how='any', thresh=thresh, axis=0)
display(diabetes.shape[0])

761

In [217]:
diabetes.isnull().mean()

Pregnancies                 0.000000
Glucose                     0.006570
BloodPressure               0.036794
SkinThickness               0.289093
BMI                         0.005256
DiabetesPedigreeFunction    0.000000
Age                         0.000000
Outcome                     0.000000
dtype: float64

In [218]:
values = {
    'Glucose': diabetes['Glucose'].median(),
    'BloodPressure': diabetes['BloodPressure'].median(),
    'SkinThickness': diabetes['SkinThickness'].median(),
    'BMI': diabetes['BMI'].median()
}
inf_diabetes = diabetes.fillna(values)

In [219]:
round(diabetes['SkinThickness'].mean(), 1)

29.2

In [220]:
def outliers_iqr(data, feature, left=1.5, right=1.5, log_scale=False):
    if log_scale:
        x = np.log(data[feature])
    else:
        x= data[feature]
    q_1, q_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = q_3 - q_1
    lower_bound = q_1 - left*iqr
    upper_bound = q_3 + right*iqr
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

outliers, cleaned_diabetes = outliers_iqr(diabetes, 'SkinThickness')
print(outliers.shape[0])


3


In [221]:
def outliers_z_score(data, feature, left=3, right=3, log_scale=False):
    if log_scale:
        x = np.log(data[feature]+1)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

outliers, clean_diab = outliers_z_score_mod(diabetes, 'SkinThickness')
print(outliers.shape[0])


NameError: name 'outliers_z_score_mod' is not defined

In [None]:
outliers, _ = outliers_iqr(clean_diab, 'DiabetesPedigreeFunction')
outliers_log, _ = outliers_iqr(clean_diab, 'DiabetesPedigreeFunction', log_scale=True)
print(outliers.shape[0], outliers_log.shape[0])
print(outliers.shape[0] - outliers_log.shape[0])

28 0
28


In [None]:
outliers, _ = outliers_iqr(diabetes, 'DiabetesPedigreeFunction')
outliers_log, _ = outliers_iqr(diabetes, 'DiabetesPedigreeFunction', log_scale=True)
print(outliers.shape[0], outliers_log.shape[0])
print(outliers.shape[0] - outliers_log.shape[0])

29 0
29


In [None]:
diab_df = pd.read_csv('data/diabetes_data.csv')
diab_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [None]:
diab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 778 entries, 0 to 777
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               778 non-null    int64  
 1   Glucose                   778 non-null    int64  
 2   BloodPressure             778 non-null    int64  
 3   SkinThickness             778 non-null    int64  
 4   Insulin                   778 non-null    int64  
 5   BMI                       778 non-null    float64
 6   DiabetesPedigreeFunction  778 non-null    float64
 7   Age                       778 non-null    int64  
 8   Outcome                   778 non-null    int64  
 9   Gender                    778 non-null    object 
dtypes: float64(2), int64(7), object(1)
memory usage: 60.9+ KB


In [None]:
dupl = diab_df[diab_df.duplicated()]
dupl.shape[0]

10

In [None]:
diab_df = diab_df.drop_duplicates()
diab_df.shape[0]

768

In [None]:
low_information_cols = []

for col in diab_df.columns:
    top_freq = diab_df[col].value_counts(normalize=True).max()
    nunique_ratio = diab_df[col].nunique() / diab_df[col].count()
    if (top_freq > 0.95) or (nunique_ratio > 0.95):
        low_information_cols.append(col)
        print(f'Column {col}: top_freq is {top_freq}, nunique_ratio is {nunique_ratio}')
print(*low_information_cols)

Column Gender: top_freq is 1.0, nunique_ratio is 0.0013020833333333333
Gender


In [None]:
diab_df = diab_df.drop(low_information_cols, axis=1)
diab_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 60.0 KB


In [None]:
new_func = lambda x: np.nan if x == 0 else x

diab_df['Glucose'] = diab_df['Glucose'].apply(new_func)
diab_df['BloodPressure'] = diab_df['BloodPressure'].apply(new_func)
diab_df['SkinThickness'] = diab_df['SkinThickness'].apply(new_func)
diab_df['Insulin'] = diab_df['Insulin'].apply(new_func)
diab_df['BMI'] = diab_df['BMI'].apply(new_func)

round(diab_df['Insulin'].isnull().mean(), 2)

0.49

In [None]:
diab_df.isnull().mean().round(2).sort_values(ascending=False)

Insulin                     0.49
SkinThickness               0.30
BloodPressure               0.05
Glucose                     0.01
BMI                         0.01
Pregnancies                 0.00
DiabetesPedigreeFunction    0.00
Age                         0.00
Outcome                     0.00
dtype: float64

In [None]:
thresh = diab_df.shape[0] * 0.7
diab_df = diab_df.dropna(thresh=thresh, axis=1)

diab_df.shape[1]

8

In [None]:
thresh = diab_df.shape[1] - 2
diab_df = diab_df.dropna(how='any', thresh=thresh, axis=0)
display(diab_df.shape[0])

761

In [None]:
diab_df.isnull().mean().round(2).sort_values(ascending=False)

SkinThickness               0.29
BloodPressure               0.04
Glucose                     0.01
BMI                         0.01
Pregnancies                 0.00
DiabetesPedigreeFunction    0.00
Age                         0.00
Outcome                     0.00
dtype: float64

In [None]:
values = {
    'SkinThickness': diab_df['SkinThickness'].median(),
    'BloodPressure': diab_df['BloodPressure'].median(),
    'Glucose': diab_df['Glucose'].median(),
    'BMI': diab_df['BMI'].median()
}

diab_df = diab_df.fillna(values)

round(diab_df['SkinThickness'].mean(), 1)

29.1

In [None]:
def outliers_iqr(data, feature, left=1.5, right=1.5, log_scale=None):
    if log_scale:
        x = np.log(data[feature] + 1)
    else:
        x = data[feature]
    q1, q3 = x.quantile(0.25), x.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - left * iqr
    upper_bound = q3 + right * iqr
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

outliers, diab_data = outliers_iqr(diab_df, 'SkinThickness')

outliers.shape[0]
    

87

In [None]:
def outliers_iqr(data, feature, left=3, right=3, log_scale=None):
    if log_scale:
        x = np.log(data[feature] + 1)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()

    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

outliers1, diab_data = outliers_iqr(diab_df, 'SkinThickness')

outliers1.shape[0]

4

In [None]:
outliers1, diab_data = outliers_iqr(diab_data, 'SkinThickness')

outliers1.shape[0]

0

In [222]:
outliers, diab_data = outliers_iqr(diab_df, 'DiabetesPedigreeFunction')

outliers.shape[0]

29

In [223]:
outliers, diab_data = outliers_iqr(diab_df, 'DiabetesPedigreeFunction', log_scale=True)

outliers.shape[0]

0