[Reference](https://towardsdatascience.com/missing-value-imputation-explained-a-visual-guide-with-code-examples-for-beginners-93e0726284eb)

In [1]:
import pandas as pd
import numpy as np

# Create the dataset as a dictionary: 20 daily records (08-01 .. 08-20).
# Holiday, Temp, Humidity, Wind, Outlook and Crowdedness all contain np.nan
# entries; Date and Weekday are complete.
data = {
    'Date': ['08-01', '08-02', '08-03', '08-04', '08-05', '08-06', '08-07', '08-08', '08-09', '08-10',
             '08-11', '08-12', '08-13', '08-14', '08-15', '08-16', '08-17', '08-18', '08-19', '08-20'],
    'Weekday': [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5],
    'Holiday': [0.0, 0.0, 0.0, 0.0, np.nan, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, np.nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    'Temp': [25.1, 26.4, np.nan, 24.1, 24.7, 26.5, 27.6, 28.2, 27.1, 26.7, np.nan, 24.3, 23.1, 22.4, np.nan, 26.5, 28.6, np.nan, 27.0, 26.9],
    'Humidity': [99.0, np.nan, 96.0, 68.0, 98.0, 98.0, 78.0, np.nan, 70.0, 75.0, np.nan, 77.0, 77.0, 89.0, 80.0, 88.0, 76.0, np.nan, 73.0, 73.0],
    'Wind': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.nan, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, np.nan, 1.0, 0.0],
    'Outlook': ['rainy', 'sunny', 'rainy', 'overcast', 'rainy', np.nan, 'rainy', 'rainy', 'overcast', 'sunny', np.nan, 'overcast', 'sunny', 'rainy', 'sunny', 'rainy', np.nan, 'rainy', 'overcast', 'sunny'],
    'Crowdedness': [0.14, np.nan, 0.21, 0.68, 0.20, 0.32, 0.72, 0.61, np.nan, 0.54, np.nan, 0.67, 0.66, 0.38, 0.46, np.nan, 0.52, np.nan, 0.62, 0.81]
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

# Display basic information about the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
df.info()

# Display the first few rows of the dataset
print(df.head())

# Display the count of missing values in each column
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         20 non-null     object 
 1   Weekday      20 non-null     int64  
 2   Holiday      18 non-null     float64
 3   Temp         16 non-null     float64
 4   Humidity     16 non-null     float64
 5   Wind         18 non-null     float64
 6   Outlook      17 non-null     object 
 7   Crowdedness  15 non-null     float64
dtypes: float64(5), int64(1), object(2)
memory usage: 1.4+ KB
None
    Date  Weekday  Holiday  Temp  Humidity  Wind   Outlook  Crowdedness
0  08-01        0      0.0  25.1      99.0   0.0     rainy         0.14
1  08-02        1      0.0  26.4       NaN   0.0     sunny          NaN
2  08-03        2      0.0   NaN      96.0   0.0     rainy         0.21
3  08-04        3      0.0  24.1      68.0   0.0  overcast         0.68
4  08-05        4      NaN  24.7      98.0   0.0   

# Method 1: Listwise Deletion

In [2]:
# Tally the NaN cells in every row (sum of the null mask across the columns)
missing_count = df.isna().sum(axis='columns')

# Discard rows with 4 or more missing values; .copy() gives df_clean its own
# data so the column assignments that follow never touch df
df_clean = df.loc[missing_count < 4].copy()

# Method 2: Simple Imputation — Mean and Mode

In [3]:
# Fill both columns in one pass using a per-column mapping:
#   Humidity -> mean of the observed Humidity values
#   Holiday  -> most frequent observed Holiday value (first entry of .mode())
df_clean = df_clean.fillna({
    'Humidity': df_clean['Humidity'].mean(),
    'Holiday': df_clean['Holiday'].mode()[0],
})

# Method 3: Linear Interpolation

In [4]:
# Fill each Temp gap on a straight line between the observed values around it.
# NOTE(review): method='linear' treats rows as equally spaced and ignores the
# index, so rows removed by the earlier deletion step are not accounted for —
# confirm that is acceptable here.
df_clean = df_clean.assign(Temp=df_clean['Temp'].interpolate(method='linear'))

# Method 4: Forward/Backward Fill

In [5]:
# Forward-fill each missing Outlook from the previous row, then backward-fill
# whatever is still missing (values at the start with nothing before them).
# Series.ffill()/.bfill() replace fillna(method=...), which is deprecated and
# emits a FutureWarning on recent pandas releases.
df_clean['Outlook'] = df_clean['Outlook'].ffill().bfill()

  df_clean['Outlook'] = df_clean['Outlook'].fillna(method='ffill').fillna(method='bfill')


# Method 5: Constant Value Imputation

In [6]:
# Mark missing Wind readings with the constant -1, a value that does not occur
# among the observed 0.0 / 1.0 entries
df_clean = df_clean.fillna({'Wind': -1})

# Method 6: KNN Imputation

In [7]:
from sklearn.impute import KNNImputer

# One-hot encode the 'Outlook' column so the categorical feature can be fed to
# the imputer alongside the numeric ones
outlook_encoded = pd.get_dummies(df_clean['Outlook'], prefix='Outlook')

# Prepare features for KNN imputation; Crowdedness is appended last as the
# column whose gaps are to be filled
features_for_knn = ['Weekday', 'Holiday', 'Temp', 'Humidity', 'Wind']
knn_features = pd.concat([df_clean[features_for_knn], outlook_encoded], axis=1)
knn_input = pd.concat([knn_features, df_clean[['Crowdedness']]], axis=1)

# Apply KNN imputation (each missing value is estimated from the 3 nearest rows)
knn_imputer = KNNImputer(n_neighbors=3)
df_imputed = pd.DataFrame(
    knn_imputer.fit_transform(knn_input),
    columns=knn_input.columns,
    # fit_transform returns a bare array. Reattach df_clean's row labels:
    # df_clean's index has gaps from the listwise deletion, and the
    # assignment below aligns on index. With a default 0..n-1 index the
    # imputed values would land on the wrong rows and the last rows would
    # be left as NaN.
    index=df_clean.index,
)

# Update the original dataframe with the imputed Crowdedness values
df_clean['Crowdedness'] = df_imputed['Crowdedness']

In [8]:
# Print the raw frame and the imputed frame one after the other; sep="\n"
# puts each label and each frame on its own line, as separate print() calls
# would
print("Before:", df, "\n\nAfter:", df_clean, sep="\n")

Before:
     Date  Weekday  Holiday  Temp  Humidity  Wind   Outlook  Crowdedness
0   08-01        0      0.0  25.1      99.0   0.0     rainy         0.14
1   08-02        1      0.0  26.4       NaN   0.0     sunny          NaN
2   08-03        2      0.0   NaN      96.0   0.0     rainy         0.21
3   08-04        3      0.0  24.1      68.0   0.0  overcast         0.68
4   08-05        4      NaN  24.7      98.0   0.0     rainy         0.20
5   08-06        5      0.0  26.5      98.0   0.0       NaN         0.32
6   08-07        6      0.0  27.6      78.0   0.0     rainy         0.72
7   08-08        0      0.0  28.2       NaN   0.0     rainy         0.61
8   08-09        1      0.0  27.1      70.0   0.0  overcast          NaN
9   08-10        2      1.0  26.7      75.0   NaN     sunny         0.54
10  08-11        3      0.0   NaN       NaN   0.0       NaN          NaN
11  08-12        4      NaN  24.3      77.0   0.0  overcast         0.67
12  08-13        5      0.0  23.1      77.0