[Reference](https://medium.com/@WojtekFulmyk/handling-missing-values-in-dataset-preprocessing-8f7f5a210f47)

In [1]:
import numpy as np
import pandas as pd

# example df of 10 rows and 5 columns
df = pd.DataFrame(np.random.randn(10, 5))

# Sets random 20% of values to NaN
num_nan = int(df.size * 0.2)

# Loop to randomize the NaN values
for _ in range(num_nan):
    i = np.random.randint(0, df.shape[0])
    j = np.random.randint(0, df.shape[1])
    df.iloc[i, j] = np.nan
    # ensures all values are positive
    df = df.abs()

print(df)

          0         1         2         3         4
0       NaN  0.754366  1.507465  0.950219  0.510642
1  1.001304  0.664201  0.349355       NaN  0.101694
2  0.365116  0.150684  0.428704  0.585764  0.849133
3       NaN  0.593344       NaN  0.551494  0.919409
4  0.416361  0.344855  1.297222  0.402531  0.294946
5  0.075290  0.006619  0.351888       NaN       NaN
6  0.326784  1.148735  2.343642  0.647455  0.648783
7       NaN       NaN  1.702416  0.129627  1.434416
8  0.562866  1.003060  0.793655  0.967886  0.717289
9  0.011593  0.288280       NaN  2.045615  0.287391


## Mean Imputation (column-wise):

In [2]:
# Impute missing values with column-wise mean values
df.fillna(df.mean(), inplace=True)

# Print updated dataset
print(df)

          0         1         2         3         4
0  0.394188  0.754366  1.507465  0.950219  0.510642
1  1.001304  0.664201  0.349355  0.785074  0.101694
2  0.365116  0.150684  0.428704  0.585764  0.849133
3  0.394188  0.593344  1.096793  0.551494  0.919409
4  0.416361  0.344855  1.297222  0.402531  0.294946
5  0.075290  0.006619  0.351888  0.785074  0.640411
6  0.326784  1.148735  2.343642  0.647455  0.648783
7  0.394188  0.550460  1.702416  0.129627  1.434416
8  0.562866  1.003060  0.793655  0.967886  0.717289
9  0.011593  0.288280  1.096793  2.045615  0.287391


## Hot-deck Imputation:

In [3]:
# Impute missing values with hot deck imputation
for col in df.columns:
    for i, val in enumerate(df[col]):
        if pd.isna(val):
            df.at[i, col] = df[col].dropna().sample().iloc[0]

# Print updated dataset
print(df)

          0         1         2         3         4
0  0.394188  0.754366  1.507465  0.950219  0.510642
1  1.001304  0.664201  0.349355  0.785074  0.101694
2  0.365116  0.150684  0.428704  0.585764  0.849133
3  0.394188  0.593344  1.096793  0.551494  0.919409
4  0.416361  0.344855  1.297222  0.402531  0.294946
5  0.075290  0.006619  0.351888  0.785074  0.640411
6  0.326784  1.148735  2.343642  0.647455  0.648783
7  0.394188  0.550460  1.702416  0.129627  1.434416
8  0.562866  1.003060  0.793655  0.967886  0.717289
9  0.011593  0.288280  1.096793  2.045615  0.287391
