In [45]:
import pandas as pd
# Load the raw diabetes dataset.
# A forward slash is used as the path separator because it is accepted on
# Windows, Linux and macOS alike; a backslash only resolves on Windows.
data = pd.read_csv('data/diabetes_data.csv')

In [46]:
# Remove fully duplicated rows (the first occurrence of each is kept),
# then preview the first five rows of the result.
data = data.drop_duplicates(keep='first')
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [47]:
# Columns judged uninformative and scheduled for removal.
# TODO: work out why the share of non-unique values differs from the share of identical values.
low_information_cols = []

# Inspect every column of the frame.
for col in data.columns:
    column_values = data[col]
    # Relative frequency of the single most common value in the column.
    top_freq = column_values.value_counts(normalize=True).max()
    # Number of distinct values divided by the number of non-missing entries.
    nunique_ratio = column_values.nunique() / column_values.count()
    # Flag the column when one value accounts for more than 95% of it.
    if top_freq > 0.95:
        low_information_cols.append(col)
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')
    # Flag the column when more than 95% of its values are distinct.
    if nunique_ratio > 0.95:
        low_information_cols.append(col)
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

# Remove the flagged columns and preview what is left.
data.drop(columns=low_information_cols, inplace=True)
data.head()

Gender: 100.0% одинаковых значений


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,33,190,34.0,0.43,43,0
1,2,112,75,32,0,35.7,0.148,21,0
2,2,108,64,0,0,30.8,0.158,21,0
3,8,107,80,0,0,24.6,0.856,34,0
4,7,136,90,0,0,29.9,0.21,50,0


In [48]:
import numpy as np
# Treat zeros in these measurement columns as missing values
# (presumably a zero reading is not a plausible measurement — confirm with the data source).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data[col]`: that chained in-place form is
# deprecated by pandas and leaves `data` unchanged once Copy-on-Write is active.
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    data[col] = data[col].replace(to_replace=0.0, value=np.nan)

In [49]:
# Share of present (False) versus missing (True) values in the Insulin column.
data['Insulin'].isna().value_counts(normalize=True).round(2)

False    0.51
True     0.49
Name: Insulin, dtype: float64

In [50]:
# Drop every column in which more than 30% of the values are missing:
# compute the missing share of all columns at once, select the offenders,
# and remove them in a single call.
missing_share = data.isna().mean()
sparse_cols = missing_share[missing_share > 0.3].index
data.drop(columns=sparse_cols, inplace=True)

# Number of columns that remain.
data.columns.shape

(8,)

In [51]:
# Keep only the rows that have at most two missing values.
# A single vectorised mask replaces a row-by-row loop that called
# `drop(inplace=True)` once per offending row (rebuilding the frame each time)
# and that raised on duplicated index labels, where `data.loc[index]` yields a
# frame rather than a single row.
rows_to_keep = data.isna().sum(axis=1) <= 2
# `.copy()` gives an independent frame so later column assignments are unambiguous.
data = data.loc[rows_to_keep].copy()

In [52]:
# Frame dimensions as (rows, columns) at this point in the notebook.
data.shape

(761, 8)

In [53]:
# Fill the remaining missing values in each column with that column's median.
# The filled Series is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data[col]`: that chained in-place form is
# deprecated by pandas and leaves `data` unchanged once Copy-on-Write is active.
for col in data.columns:
    data[col] = data[col].fillna(data[col].median())
# Mean of SkinThickness after imputation, shown as a sanity check.
data['SkinThickness'].mean()

29.109067017082786

In [58]:
def outliers_iqr_mod(data, feature, left=1.5, right=1.5, log_scale=False, increment=0):
    """Split ``data`` into outliers and regular rows using interquartile-range fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the column to analyse.
    feature : str
        Name of the column whose values are tested.
    left : float, default 1.5
        Number of IQRs subtracted from the first quartile to get the lower fence.
    right : float, default 1.5
        Number of IQRs added to the third quartile to get the upper fence.
    log_scale : bool, default False
        When True, the fences are computed on ``log(data[feature] + increment)``
        instead of on the raw values.
    increment : float, default 0
        Offset added to the values before the logarithm is taken; only used
        when ``log_scale`` is True.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(outliers, cleaned)``. ``outliers`` holds the rows strictly outside
        the fences; ``cleaned`` holds the rows on or between the fences. Rows
        whose tested value is missing compare as False on both sides and
        therefore appear in neither frame.
    """
    if log_scale:
        import numpy as np
        x = np.log(data[feature] + increment)
    else:
        x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * left)
    upper_bound = quartile_3 + (iqr * right)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive comparisons: a value lying exactly on a fence is not an outlier,
    # so it must stay in `cleaned`. With strict comparisons such rows vanished
    # from both results (and with IQR == 0 every typical row was lost).
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

# Number of rows the IQR rule flags as SkinThickness outliers (default fences).
skin_outliers_iqr = outliers_iqr_mod(data, 'SkinThickness')[0]
skin_outliers_iqr.shape[0]

87

In [59]:
def outliers_z_score_mod(data, feature, log_scale=False, left=3.0, right=3.0, increment=1):
    """Split ``data`` into outliers and regular rows using mean ± k·sigma bounds.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the column to analyse.
    feature : str
        Name of the column whose values are tested.
    log_scale : bool, default False
        When True, the bounds are computed on ``log(data[feature] + increment)``
        instead of on the raw values.
    left : float, default 3.0
        Number of standard deviations below the mean for the lower bound.
    right : float, default 3.0
        Number of standard deviations above the mean for the upper bound.
    increment : float, default 1
        Offset added to the values before the logarithm is taken; only used
        when ``log_scale`` is True. The default of 1 matches the offset this
        function previously hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(outliers, cleaned)``. ``outliers`` holds the rows strictly outside
        the bounds; ``cleaned`` holds the rows on or between the bounds. Rows
        whose tested value is missing compare as False on both sides and
        therefore appear in neither frame.
    """
    if log_scale:
        import numpy as np
        x = np.log(data[feature] + increment)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive comparisons: a value lying exactly on a bound is not an outlier,
    # so it must stay in `cleaned`. With strict comparisons such rows vanished
    # from both results.
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

# Number of rows the z-score rule flags as SkinThickness outliers (default 3-sigma bounds).
skin_outliers_z = outliers_z_score_mod(data, 'SkinThickness')[0]
skin_outliers_z.shape[0]

4

In [61]:
# How many more DiabetesPedigreeFunction rows the IQR rule flags on the raw
# scale than on the log scale.
dpf_outliers_raw = outliers_iqr_mod(data, 'DiabetesPedigreeFunction')[0]
dpf_outliers_log = outliers_iqr_mod(data, 'DiabetesPedigreeFunction', log_scale=True)[0]
dpf_outliers_raw.shape[0] - dpf_outliers_log.shape[0]

29