In [2]:
import numpy as np
import pandas as pd
from fancyimpute import  IterativeImputer

In [3]:
data ={
        'A':[1,2,np.nan,4,5],
        'B':[3,np.nan,5,np.nan,7],
        'C':[np.nan,2,3,4,np.nan],
        'D':[1,np.nan,3,np.nan,5]
}

In [4]:
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,A,B,C,D
0,1.0,3.0,,1.0
1,2.0,,2.0,
2,,5.0,3.0,3.0
3,4.0,,4.0,
4,5.0,7.0,,5.0


# MICE

In [6]:
mice_imputer = IterativeImputer()

In [7]:
imputed_data = mice_imputer.fit_transform(df)

In [8]:
df_imputed = pd.DataFrame(imputed_data,columns=df.columns)

In [9]:
df_imputed

Unnamed: 0,A,B,C,D
0,1.0,3.0,0.995794,1.0
1,2.0,4.002739,2.0,2.001613
2,3.0,5.0,3.0,3.0
3,4.0,5.997261,4.0,3.998387
4,5.0,7.0,5.004206,5.0


# 2. KNN Imputation

In [10]:
from sklearn.impute import KNNImputer

In [11]:
data = {
    'Day': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Sales': [50, 55, np.nan, 65, np.nan, 75, 80, np.nan, 90, 95],
    'Customers': [200, 210, 220, np.nan, 230, 240, np.nan, 250, 260, 270]
}

In [12]:
# Create a DataFrame
df = pd.DataFrame(data)

In [16]:
df

Unnamed: 0,Day,Sales,Customers
0,1,50.0,200.0
1,2,55.0,210.0
2,3,,220.0
3,4,65.0,
4,5,,230.0
5,6,75.0,240.0
6,7,80.0,
7,8,,250.0
8,9,90.0,260.0
9,10,95.0,270.0


In [13]:
# Initialize the KNN Imputer with a specified number of neighbors (e.g., 2)
imputer = KNNImputer(n_neighbors=2)

In [14]:
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [15]:
df_imputed

Unnamed: 0,Day,Sales,Customers
0,1.0,50.0,200.0
1,2.0,55.0,210.0
2,3.0,72.5,220.0
3,4.0,65.0,225.0
4,5.0,72.5,230.0
5,6.0,75.0,240.0
6,7.0,80.0,240.0
7,8.0,72.5,250.0
8,9.0,90.0,260.0
9,10.0,95.0,270.0


# 3. Regression Imputation

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [21]:
def regression_imputer(df, target_column):
    # Split the data into train (non-missing target) and test (missing target)
    train_df = df[df[target_column].notna()]
    test_df = df[df[target_column].isna()]

    # Separate independent (X) and dependent (y) variables for training
    x_train = train_df.drop(columns=[target_column])
    y_train = train_df[target_column]

    # Extract independent variables for the test set (where target is missing)
    x_test = test_df.drop(columns=[target_column])

    # Handle any missing values in the independent variables (optional)
    imputer = SimpleImputer(strategy='mean')
    x_train_imputed = imputer.fit_transform(x_train)
    x_test_imputed = imputer.transform(x_test)

    # Train the regression model
    model = LinearRegression()
    model.fit(x_train_imputed, y_train)

    # Predict the missing values in the target column
    predicted_values = model.predict(x_test_imputed)

    # Impute the missing values in the original DataFrame
    df.loc[df[target_column].isna(), target_column] = predicted_values

    return df

In [22]:
data = {
    'Day': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Sales': [50, 55, np.nan, 65, np.nan, 75, 80, np.nan, 90, 95],
    'Customers': [200, 210, 220, np.nan, 230, 240, np.nan, 250, 260, 270]
}

df = pd.DataFrame(data)

In [25]:
df

Unnamed: 0,Day,Sales,Customers
0,1,50.0,200.0
1,2,55.0,210.0
2,3,60.0,220.0
3,4,65.0,
4,5,70.0,230.0
5,6,75.0,240.0
6,7,80.0,
7,8,85.0,250.0
8,9,90.0,260.0
9,10,95.0,270.0


In [23]:
df_imputed = regression_imputer(df, 'Sales')

In [24]:
df_imputed 

Unnamed: 0,Day,Sales,Customers
0,1,50.0,200.0
1,2,55.0,210.0
2,3,60.0,220.0
3,4,65.0,
4,5,70.0,230.0
5,6,75.0,240.0
6,7,80.0,
7,8,85.0,250.0
8,9,90.0,260.0
9,10,95.0,270.0
