In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Sample dataset: a tiny frame with gaps in every column so that each
# imputation technique below has something to fill.
sample_columns = {
    'Age': [25, np.nan, 35, 40, np.nan],                      # numeric, 2 missing
    'Gender': ['Male', 'Female', np.nan, 'Female', 'Male'],   # categorical, 1 missing
    'Income': [50000, 60000, 52000, np.nan, 58000],           # numeric, 1 missing
}
df = pd.DataFrame(sample_columns)

print("Original Data:\n", df)

# 1. Identify Missing Values
# Count the NaN entries in each column.
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# 2. Dropping Rows with Missing Values
# Keep only the rows in which every column is populated.
df_drop_rows = df.dropna(axis=0, how='any')
print("\nAfter Dropping Rows:\n", df_drop_rows)

# 3. Dropping Columns with Missing Values
# Keep only the columns that contain no NaN at all. Every column of the
# sample frame has at least one gap, so this leaves an empty frame.
df_drop_cols = df.dropna(axis='columns', how='any')
print("\nAfter Dropping Columns:\n", df_drop_cols)

# 4. Mean Imputation (Numerical)
# Fill the gaps in Age with the average of the observed ages.
age_mean = df['Age'].mean()
df_mean = df.assign(Age=df['Age'].fillna(age_mean))
print("\nAfter Mean Imputation:\n", df_mean)

# 5. Mode Imputation (Categorical)
# mode() returns every value tied for most frequent; take the first one.
gender_mode = df['Gender'].mode()[0]
df_mode = df.assign(Gender=df['Gender'].fillna(gender_mode))
print("\nAfter Mode Imputation:\n", df_mode)

# 6. Median Imputation (Skewed Numerical)
# Fill the gaps in Income with the middle observed value.
income_median = df['Income'].median()
df_median = df.assign(Income=df['Income'].fillna(income_median))
print("\nAfter Median Imputation:\n", df_median)

# 7. KNN Imputation
df_knn = df.copy()
# Encode 'Gender' for KNN (the imputer only accepts numeric input).
df_knn['Gender'] = df_knn['Gender'].map({'Male': 1, 'Female': 0})

# KNNImputer picks neighbours by (NaN-aware) Euclidean distance, so a column
# measured in tens of thousands (Income) would swamp Age and Gender.
# Min-max scale every column to [0, 1] first; min()/max() skip NaN.
knn_min = df_knn.min()
knn_range = (df_knn.max() - knn_min).replace(0, 1)  # constant column: avoid 0/0
df_knn_scaled = (df_knn - knn_min) / knn_range

imputer = KNNImputer(n_neighbors=2)
knn_filled_scaled = pd.DataFrame(
    imputer.fit_transform(df_knn_scaled),
    columns=df_knn.columns,
    index=df_knn.index,  # keep the original row labels
)

# Map the imputed values back to the original units, then use them only for
# the cells that were missing so observed values are kept exactly as given.
df_knn_imputed = df_knn.fillna(knn_filled_scaled * knn_range + knn_min)

# An imputed Gender is the average of the neighbours' 0/1 codes and can be
# fractional (e.g. 0.5 when the two neighbours disagree). Snap it to a valid
# code; an exact tie resolves to 1 ('Male').
df_knn_imputed['Gender'] = (df_knn_imputed['Gender'] >= 0.5).astype(float)
print("\nAfter KNN Imputation:\n", df_knn_imputed)

# 8. Handle Categorical Missing with 2nd Most Frequent
df_categorical = df.copy()
# value_counts() lists categories by descending frequency (NaN excluded),
# so position 1 is the runner-up; fall back to the top category when the
# column has only one distinct value.
mode_counts = df_categorical['Gender'].value_counts()
second_mode = mode_counts.index[1] if len(mode_counts) > 1 else mode_counts.index[0]
df_categorical = df_categorical.assign(
    Gender=df_categorical['Gender'].fillna(second_mode)
)
print("\nAfter Second Most Frequent Category Imputation:\n", df_categorical)

# 9. Predictive Modeling for Imputation (Income)
# Fit a linear regression of Income on Age and Gender using the rows where all
# three are known, then predict Income for rows where only Income is missing.
df_pred = df.copy()
df_pred['Gender'] = df_pred['Gender'].map({'Male': 1, 'Female': 0})

feature_cols = ['Age', 'Gender']
# The model cannot take NaN predictors, so only rows with both features present
# can be used for training or prediction.
has_features = df_pred[feature_cols].notnull().all(axis=1)
has_income = df_pred['Income'].notnull()

train = df_pred[has_income & has_features]
test = df_pred[~has_income & has_features]

model = LinearRegression()
preds = np.empty(0)
if train.empty or test.empty:
    # fit()/predict() raise ValueError on zero rows; leave Income untouched.
    print("\nSkipping predictive imputation: no complete rows to train on or no rows to predict.")
else:
    # NOTE: in the sample data only one row has Age, Gender and Income all
    # present, so this fit is purely illustrative.
    model.fit(train[feature_cols], train['Income'])
    preds = model.predict(test[feature_cols])
    df_pred.loc[test.index, 'Income'] = preds
print("\nAfter Predictive Modeling Imputation on Income:\n", df_pred)

# 10. Forward and Backward Fill for Time Series
# Seven consecutive days with every second observation missing.
daily_dates = pd.date_range(start='2023-01-01', periods=7)
observed_values = [1, np.nan, 3, np.nan, 5, np.nan, 7]
ts_df = pd.DataFrame({'Date': daily_dates, 'Value': observed_values})
# Fills are order-dependent, so make sure the rows are in chronological order.
ts_df = ts_df.sort_values('Date')

# Forward fill carries the previous observation onward; backward fill pulls
# the next observation back.
ts_df = ts_df.assign(
    Forward_Fill=ts_df['Value'].ffill(),
    Backward_Fill=ts_df['Value'].bfill(),
)
print("\nTime Series Forward and Backward Fill:\n", ts_df)

Original Data:
     Age  Gender   Income
0  25.0    Male  50000.0
1   NaN  Female  60000.0
2  35.0     NaN  52000.0
3  40.0  Female      NaN
4   NaN    Male  58000.0

Missing Values:
 Age       2
Gender    1
Income    1
dtype: int64

After Dropping Rows:
     Age Gender   Income
0  25.0   Male  50000.0

After Dropping Columns:
 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]

After Mean Imputation:
          Age  Gender   Income
0  25.000000    Male  50000.0
1  33.333333  Female  60000.0
2  35.000000     NaN  52000.0
3  40.000000  Female      NaN
4  33.333333    Male  58000.0

After Mode Imputation:
     Age  Gender   Income
0  25.0    Male  50000.0
1   NaN  Female  60000.0
2  35.0  Female  52000.0
3  40.0  Female      NaN
4   NaN    Male  58000.0

After Median Imputation:
     Age  Gender   Income
0  25.0    Male  50000.0
1   NaN  Female  60000.0
2  35.0     NaN  52000.0
3  40.0  Female  55000.0
4   NaN    Male  58000.0

After KNN Imputation:
     Age  Gender   Income
0  25.0     1