In [98]:
import pandas as pd
import numpy as np

diabetes = pd.read_csv('data/diabetes_data.csv')

In [99]:
diabetes_un = diabetes.drop_duplicates()
print(diabetes.shape, diabetes_un.shape)

(778, 10) (768, 10)


In [100]:
def get_low_information_cols(data, porog=0.95):
    low_information_cols = []
    for col in data.columns:
        top_freq = data[col].value_counts(normalize=True).max()
        nunique_ratio = data[col].nunique() / data[col].count()
        if top_freq > porog:
            low_information_cols.append(col)
            # print(f'{col}: {round(top_freq*100,2)}% одинаковых значений')
        if nunique_ratio > porog:
            low_information_cols.append(col)
            # print(f'{col}: {round(nunique_ratio*100,2)}% уникальных значений')
    return low_information_cols

print(get_low_information_cols(diabetes_un))

['Gender']


In [101]:
diabetes_un2 = diabetes_un.drop('Gender', axis=1)
print(diabetes_un.shape, diabetes_un2.shape)

(768, 10) (768, 9)


In [102]:
print(diabetes_un2[diabetes_un2['Insulin'].isnull()].shape)

(0, 9)


In [112]:

def nan_func(x):
    return np.nan if x == 0 else x

diabetes_un3 = diabetes_un2.copy()
cols_0_to_NUN = ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI')

for col in cols_0_to_NUN:
    diabetes_un3[col] = diabetes_un3[col].apply(nan_func)

print(diabetes_un3[diabetes_un3['Insulin'].isnull()].shape, diabetes_un3.shape, round(diabetes_un3[diabetes_un3['Insulin'].isnull()].shape[0] / diabetes_un3.shape[0], 2))
print(diabetes_un3.isnull().mean().round(2).sort_values(ascending=False))

(374, 9) (768, 9) 0.49
Insulin                     0.49
SkinThickness               0.30
BloodPressure               0.05
Glucose                     0.01
BMI                         0.01
Pregnancies                 0.00
DiabetesPedigreeFunction    0.00
Age                         0.00
Outcome                     0.00
dtype: float64


In [113]:
thresh = diabetes_un3.shape[0] * 0.7
print(thresh)
diabetes_un4 = diabetes_un3.dropna(thresh=thresh, axis=1)
print(diabetes_un3.shape, diabetes_un4.shape)

537.5999999999999
(768, 9) (768, 8)


In [114]:
m = diabetes_un4.shape[1]
diabetes_un4 = diabetes_un4.dropna(thresh=m-2, axis=0)
print(diabetes_un4.shape)

(761, 8)


In [121]:
diabetes_un5 = diabetes_un4.copy() 
values = {}
for col in cols_0_to_NUN:
    try:
        values[col] = diabetes_un5[col].median()
    except:
        continue
    
print(values)
diabetes_un5 = diabetes_un5.fillna(values)
print(diabetes_un5['SkinThickness'].mean(), diabetes_un5.shape)

{'Glucose': 117.0, 'BloodPressure': 72.0, 'SkinThickness': 29.0, 'BMI': 32.3}
29.109067017082786 (761, 8)


In [136]:
def outliers_iqr_mod(data, feature, left=1.5, right=1.5):
    x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - iqr * left
    upper_bound = quartile_3 + iqr * right
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

o, c = outliers_iqr_mod(diabetes_un5, 'SkinThickness')
print(o.shape, c.shape)

(87, 8) (674, 8)


In [137]:
def outliers_z_score_mod(data, feature, left=3, right=3, log_scale=False):
    if log_scale:
        x = np.log(data[feature])
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

o, c = outliers_iqr_mod(diabetes_un5, 'SkinThickness', log_scale=False)
print(o.shape, c.shape)
o, c = outliers_iqr_mod(diabetes_un5, 'SkinThickness', log_scale=True)
print(o.shape, c.shape)

TypeError: outliers_iqr_mod() got an unexpected keyword argument 'log_scale'

In [135]:
o1, c1 = outliers_iqr_mod(diabetes_un5, 'DiabetesPedigreeFunction')
print(o1.shape, c1.shape)
o2, c2 = outliers_z_score_mod(diabetes_un5, 'DiabetesPedigreeFunction', log_scale=False)
print(o2.shape, c2.shape)

(29, 8) (732, 8)
(11, 8) (750, 8)
