[Reference](https://medium.com/@hhuseyincosgun/dealing-with-missing-data-from-zero-to-advanced-4fb734ee5998)

# Types of Missing Data
- Missing completely at random (MCAR)
- Missing at random (MAR)
- Not missing at random (NMAR)

# Identify missing data

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1]:
def load(path: str = "/kaggle/input/diabetes-data-set/diabetes.csv") -> pd.DataFrame:
    """Read the diabetes dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the Kaggle input path
            the notebook was written against, so a bare ``load()`` call
            behaves exactly as before.

    Returns:
        The file contents as a DataFrame, parsed with the default
        ``pd.read_csv`` settings.
    """
    return pd.read_csv(path)
# Load the dataset and preview its first rows.
df = load()
df.head()

In [3]:
import missingno as msno
# Draw missingno's matrix plot for df to show where values are missing.
msno.matrix(df)

In [4]:
# Analysis of missing values
# NOTE(review): missing_plot is not defined in this file; presumably a helper
# that plots missing values of df in relation to the 'Outcome' column —
# confirm it is defined before this cell runs.
missing_plot(df, 'Outcome')

# Handling Missing Data

In [5]:
# Summarize the missing data; the result is used below through its
# 'Percent of Total Values' column, with its index taken as column names.
# NOTE(review): neither missing_percent nor data is defined in this file
# (the frame loaded above is named df) — confirm both exist before this cell.
miss_cols_info = missing_percent(data)
miss_cols_info

## 1. Data Dropping

In [7]:
# Keep only the summary rows whose missing share is above 25%.
exceeds_quarter = miss_cols_info['Percent of Total Values'] > 25
drop_cols = miss_cols_info[exceeds_quarter]
drop_cols

In [8]:
# The index labels of the filtered summary are the column names to drop.
col_names = drop_cols.index.to_list()
col_names

In [9]:
# Remove the flagged columns from data in place, then preview the result.
data.drop(columns=col_names, inplace=True)
data.head()

## 2. Simple Imputation Methods
### 2.1 Mean Imputation

In [11]:
from sklearn.impute import SimpleImputer

# Show the Insulin mean (rounded to 2 decimals) before imputing.
insulin_mean = data_mean['Insulin'].mean()
print(round(insulin_mean, 2))

# Imputer that replaces NaN entries with the column mean.
mean_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# The imputer works on 2-D input, hence the single-column reshape.
insulin_2d = data_mean['Insulin'].values.reshape(-1, 1)
data_mean['Insulin'] = mean_imputer.fit_transform(insulin_2d)

In [12]:
# Analyze visually with a scatter plot of Insulin against BMI.
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and later
# removed; use the renamed 'seaborn-v0_8' style when it exists and fall back
# to the old name on older Matplotlib releases.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Color each point by whether Insulin is null in `data`
# (presumably the frame before imputation — verify).
null_values = data['Insulin'].isnull()

# DataFrame.plot creates its own figure and returns the Axes, so no separate
# Figure object is built beforehand; the name `fig` is kept for later cells.
fig = data_mean.plot(x="BMI", y='Insulin', kind='scatter',
                     c=null_values, cmap='plasma', s=10,
                     title='Mean Imputation', colorbar=False)

### 2.2 Median Imputation

In [13]:
# Show the Insulin median (rounded to 2 decimals) before imputing.
print(round(data_median['Insulin'].median(), 2))

# Imputer that replaces NaN entries with the column median.
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# The imputer works on 2-D input, hence the single-column reshape.
# The call previously ended with an unmatched ")" (a SyntaxError); the
# parentheses are now balanced.
data_median['Insulin'] = median_imputer.fit_transform(
    data_median['Insulin'].values.reshape(-1, 1))

In [14]:
# Analyze visually with a scatter plot of Insulin against BMI.
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and later
# removed; use the renamed 'seaborn-v0_8' style when it exists and fall back
# to the old name on older Matplotlib releases.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Color each point by whether Insulin is null in `data`
# (presumably the frame before imputation — verify).
null_values = data['Insulin'].isnull()

# DataFrame.plot creates its own figure and returns the Axes, so no separate
# Figure object is built beforehand; the name `fig` is kept for later cells.
fig = data_median.plot(x="BMI", y='Insulin', kind='scatter',
                       c=null_values, cmap='winter', s=10,
                       title='Median Imputation', colorbar=False)

## 3. Advanced Imputation Methods
### 3.1 K-Nearest Neighbour (KNN) Imputation

In [15]:
# Imputing with KNNImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Rescale every column into the [0, 1] range before imputing.
# NOTE(review): df_knn is not defined earlier in this file — confirm it is
# created before this cell runs.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(df_knn)
df_knn = pd.DataFrame(scaled_values, columns=df_knn.columns)

# Fill missing entries from the 5 nearest rows, weighted uniformly and
# measured with the NaN-aware Euclidean metric.
knn_imputer = KNNImputer(metric='nan_euclidean', weights='uniform', n_neighbors=5)
imputed_values = knn_imputer.fit_transform(df_knn)
df_knn_imputed = pd.DataFrame(imputed_values, columns=df_knn.columns)

# Undo the scaling so the imputed values are back on the original scale.
original_data = scaler.inverse_transform(df_knn_imputed)

# Wrap the restored array in a DataFrame with the same column labels.
df_original = pd.DataFrame(original_data, columns=df_knn.columns)

### 3.2 Multivariate Imputation by Chained Equation — MICE

In [16]:
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model

# Work on a copy of data restricted to the listed columns.
mice_columns = ['Pregnancies','Glucose','BloodPressure','BMI','DPF','Age','Insulin']
df_mice = data.filter(mice_columns, axis=1).copy()

# Iterative imputer driven by a Bayesian ridge regressor; every other
# feature may be used (n_nearest_features=None), visiting features in
# ascending order.
mice_imputer = IterativeImputer(
    estimator=linear_model.BayesianRidge(),
    n_nearest_features=None,
    imputation_order='ascending',
)

# Fit, impute, and rebuild a DataFrame with the same column labels.
mice_values = mice_imputer.fit_transform(df_mice)
df_mice_imputed = pd.DataFrame(mice_values, columns=df_mice.columns)