In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

## Read data 

In [2]:
# Load the raw diabetes dataset from the CSV file next to the notebook.
source_csv = 'diabetes_dataset.csv'
data = pd.read_csv(source_csv)

In [3]:
# Preview the freshly loaded DataFrame.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
4,5,116.0,74.0,,,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
567,9,89.0,62.0,,,22.5,0.142,33,0
568,2,122.0,70.0,27.0,,36.8,0.340,27,0
569,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
570,1,126.0,60.0,,,30.1,0.349,47,1


### Check missing values 

In [4]:
# Count the missing entries per column and list the most affected columns first.
missing_counts = data.isna().sum()
missing_counts.sort_values(ascending=False)

Insulin                     374
SkinThickness               227
BloodPressure                35
BMI                          11
Glucose                       5
Outcome                       0
Age                           0
DiabetesPedigreeFunction      0
Pregnancies                   0
dtype: int64

## Data pre-processing 

### Fill missing values 

#### Median imputation (Glucose, BMI, BloodPressure)

In [5]:
# Fill the missing Glucose values with the column median.
#
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on `data['Glucose']`: the in-place form is chained
# assignment on a selected column, which emits a FutureWarning on pandas 2.x
# and does not update `data` once Copy-on-Write is enabled.

Glucose_median = data.Glucose.median()
data['Glucose'] = data['Glucose'].fillna(Glucose_median)

In [6]:
# Fill the missing BMI values with the column median.
#
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on `data['BMI']`: the in-place form is chained
# assignment on a selected column, which emits a FutureWarning on pandas 2.x
# and does not update `data` once Copy-on-Write is enabled.

BMI_median = data.BMI.median()
data['BMI'] = data['BMI'].fillna(BMI_median)

In [7]:
# Fill the missing BloodPressure values with the column median.
#
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on `data['BloodPressure']`: the in-place form is chained
# assignment on a selected column, which emits a FutureWarning on pandas 2.x
# and does not update `data` once Copy-on-Write is enabled.

BloodPressure_median = data.BloodPressure.median()
data['BloodPressure'] = data['BloodPressure'].fillna(BloodPressure_median)

In [8]:
# Re-count the missing entries per column after the median fills.
missing_counts = data.isna().sum()
missing_counts.sort_values(ascending=False)

Insulin                     374
SkinThickness               227
Outcome                       0
Age                           0
DiabetesPedigreeFunction      0
BMI                           0
BloodPressure                 0
Glucose                       0
Pregnancies                   0
dtype: int64

#### MICE (Multiple Imputation by Chained Equations)

In [9]:
from impyute.imputation.cs import mice

In [10]:
# Run impyute's MICE imputation over the full value matrix of `data`;
# the result is indexed positionally as a 2-D array in the next cell.
# NOTE(review): every column of `data`, including the Outcome label, is passed
# to the imputer — confirm that using the label to impute features is intended.
imputed_training = mice(data.values)

In [11]:
# Copy the MICE-imputed values back into the two columns that still had gaps.
#
# `imputed_training` was built from `data.values`, so its column positions
# match `data.columns`. Looking each position up by name avoids the hard-coded
# indices 4 and 3, which would silently write the wrong column if the column
# order ever changed. A plain 1-D slice is assigned instead of an (n, 1) block.
for column_name in ('Insulin', 'SkinThickness'):
    column_position = data.columns.get_loc(column_name)
    data[column_name] = imputed_training[:, column_position]

In [12]:
# Final check of the missing entries per column after the MICE write-back.
missing_counts = data.isna().sum()
missing_counts.sort_values(ascending=False)

Outcome                     0
Age                         0
DiabetesPedigreeFunction    0
BMI                         0
Insulin                     0
SkinThickness               0
BloodPressure               0
Glucose                     0
Pregnancies                 0
dtype: int64

In [13]:
# Display the DataFrame after imputation to inspect the filled SkinThickness and Insulin values.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.000000,194.795981,33.6,0.627,50,1
1,1,85.0,66.0,29.000000,37.699964,26.6,0.351,31,0
2,8,183.0,64.0,21.525391,259.030312,23.3,0.672,32,1
3,0,137.0,40.0,35.000000,168.000000,43.1,2.288,33,1
4,5,116.0,74.0,21.797967,104.783953,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
567,9,89.0,62.0,18.777223,22.238692,22.5,0.142,33,0
568,2,122.0,70.0,27.000000,168.898936,36.8,0.340,27,0
569,5,121.0,72.0,23.000000,112.000000,26.2,0.245,30,0
570,1,126.0,60.0,29.438468,123.062470,30.1,0.349,47,1


In [15]:
# Persist the imputed dataset to a new CSV file.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an extra unnamed first column — confirm downstream readers expect it.
output_csv = 'diabetes_dataset2.csv'
data.to_csv(output_csv, encoding="utf-8")