In [73]:
import pandas as pd
import plotly.express as px
import numpy as np

# Column glossary for the dataset loaded below:
# Pregnancies — number of pregnancies.
# Glucose — plasma glucose concentration two hours into an oral glucose tolerance test.
# BloodPressure — diastolic blood pressure (mm Hg).
# SkinThickness — triceps skin fold thickness (mm).
# Insulin — two-hour serum insulin (mu U/ml).
# BMI — body mass index.
# DiabetesPedigreeFunction — diabetes pedigree function (the higher it is, the higher the chance of hereditary disease).
# Age — age.
# Outcome — presence of diabetes (0 — no, 1 — yes).

# Load the raw table; the bare name on the last line makes the notebook display it.
diabetes = pd.read_csv('data/diabetes_data.csv')
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.430,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.210,50,0,Female
...,...,...,...,...,...,...,...,...,...,...
773,6,103,72,32,190,37.7,0.324,55,0,Female
774,1,71,48,18,76,20.4,0.323,22,0,Female
775,0,117,0,0,0,33.8,0.932,44,0,Female
776,4,154,72,29,126,31.3,0.338,37,0,Female


In [74]:
# Work on a de-duplicated frame. drop_duplicates() returns a new DataFrame,
# so the loaded `diabetes` frame is left untouched without an explicit copy.
diabetes_df = diabetes.drop_duplicates()
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() would emit a stray "None" after the report.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
 9   Gender                    768 non-null    object 
dtypes: float64(2), int64(7), object(1)
memory usage: 66.0+ KB
None


In [75]:
# A column is flagged as non-informative when a single value accounts for more
# than 95% of its rows, or when more than 95% of its non-null values are unique.
non_inf_list = []
for col in diabetes_df.columns:
    top_freq = diabetes_df[col].value_counts(normalize=True).max()
    nunique_ratio = diabetes_df[col].nunique() / diabetes_df[col].count()
    # One combined condition, so a column can never be appended twice.
    if top_freq > 0.95 or nunique_ratio > 0.95:
        non_inf_list.append(col)
print(non_inf_list)

# Drop exactly the columns that were flagged instead of a hard-coded name.
# This keeps the cell consistent with its own check and makes it safe to
# re-run: on a second run the list is empty and drop() is a no-op.
diabetes_df = diabetes_df.drop(columns=non_inf_list)

['Gender']


In [76]:
# Columns in which a stored 0 is treated as a missing-value marker.
null_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Turn those zeros into NaN so that pandas' missing-data tools see them.
diabetes_df[null_cols] = diabetes_df[null_cols].replace(to_replace=0, value=np.nan)
# Share of rows where Insulin is now missing.
diabetes_df['Insulin'].isna().mean()

0.4869791666666667

In [78]:
# Minimum number of non-null values a column needs to be kept: 70% of the rows.
thresh = len(diabetes_df) * 0.7
# Drop every column that does not reach that non-null count.
diabetes_df = diabetes_df.dropna(axis='columns', thresh=thresh)
# Number of columns that survived.
print(len(diabetes_df.columns))

8


In [82]:
# Show the rows that have at least 6 non-null values.
# NOTE(review): the result is only displayed, never assigned back, so
# diabetes_df itself still contains every row — confirm this is intended.
diabetes_df.dropna(axis='index', thresh=6)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.430,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,,30.8,0.158,21,0
3,8,107.0,80.0,,24.6,0.856,34,0
4,7,136.0,90.0,,29.9,0.210,50,0
...,...,...,...,...,...,...,...,...
763,5,139.0,64.0,35.0,28.6,0.411,26,0
764,1,96.0,122.0,,22.4,0.207,27,0
765,10,101.0,86.0,37.0,45.6,1.136,38,1
766,0,141.0,,,42.4,0.205,29,1


In [85]:
# Fill every remaining NaN with the median of its own column.
diabetes_df = diabetes_df.fillna(value=diabetes_df.median())

# Summary statistics of SkinThickness after the fill.
diabetes_df['SkinThickness'].describe()

count    768.000000
mean      29.108073
std        8.791221
min        7.000000
25%       25.000000
50%       29.000000
75%       32.000000
max       99.000000
Name: SkinThickness, dtype: float64

In [109]:
def outliers_iqr_mod(data, feature, left=1.5, right=1.5, log_scale=True):
    """Return the rows of ``data`` whose ``feature`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - left * IQR`` or above
    ``Q3 + right * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    (optionally log-transformed) feature and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect; it is not modified.
    feature : str
        Name of the column to test.
    left, right : float, default 1.5
        Number of IQRs allowed below Q1 and above Q3 respectively.
    log_scale : bool, default True
        If True, the bounds and the comparison are computed on
        ``np.log(data[feature])`` instead of the raw values. ``np.log`` maps
        zeros to ``-inf`` and negative values to ``NaN``, so this mode is
        only meaningful for strictly positive features.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original, untransformed values) flagged as
        outliers; empty if there are none.
    """
    x = np.log(data[feature]) if log_scale else data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * left)
    upper_bound = quartile_3 + (iqr * right)
    # Only the outlier rows are returned, so no complementary "cleaned"
    # frame is built here.
    return data[(x < lower_bound) | (x > upper_bound)]


# Outliers of DiabetesPedigreeFunction with the function's default settings
# (1.5 IQR on both sides, log scale).
outliers_1 = outliers_iqr_mod(diabetes_df, 'DiabetesPedigreeFunction')
# Number of rows flagged as outliers.
print(len(outliers_1))


0
