In [1]:
import copy
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator

In [2]:
# Create data
np.random.seed(42)

# Generate synthetic features
N = 16
full_sizes = np.float32(np.random.randint(600, 3000, N))  # House size in sqft
full_bedrooms = np.float32(np.random.randint(1, 6, N))  # Number of bedrooms
full_bathrooms = np.float32(np.random.randint(1, 4, N))  # Number of bathrooms

# Generate fake price
price_per_sqft = np.random.uniform(100, 250, N)
full_price = (full_sizes*price_per_sqft) + (full_bedrooms*10000) + (full_bathrooms*5000)

# Introduce missing values
M = 5

sizes = copy.deepcopy(full_sizes)
bedrooms = copy.deepcopy(full_bedrooms)
bathrooms = copy.deepcopy(full_bathrooms)
price = copy.deepcopy(full_price)

sizes[np.random.choice(N, M, replace=False)] = np.nan
bedrooms[np.random.choice(N, M, replace=False)] = np.nan
bathrooms[np.random.choice(N, M, replace=False)] = np.nan
price[np.random.choice(N, M, replace=False)] = np.nan

missing_df = pd.DataFrame({
    'sizes': sizes,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'price': price
})

In [3]:
# Where are the missing?
indicator = MissingIndicator(missing_values=np.nan)
mask_for_nan = indicator.fit_transform(missing_df)
mask_for_nan

array([[ True, False, False, False],
       [False,  True,  True,  True],
       [ True,  True, False, False],
       [False, False, False,  True],
       [False,  True, False, False],
       [ True,  True,  True, False],
       [False, False, False, False],
       [False, False,  True, False],
       [False, False, False,  True],
       [False,  True, False, False],
       [False, False, False, False],
       [ True, False,  True,  True],
       [False, False, False, False],
       [False, False, False, False],
       [ True, False,  True,  True],
       [False, False, False, False]])

In [None]:
# Simple statistical appraoch - I
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputed_df = pd.DataFrame(mean_imputer.fit_transform(missing_df), columns=missing_df.columns)
mean_imputed_df

In [None]:
# Simple statistical appraoch - II
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
mode_imputed_df = pd.DataFrame(mode_imputer.fit_transform(missing_df), columns=missing_df.columns)
mode_imputed_df

In [None]:
# KNN appraoch
imputer = KNNImputer(missing_values=np.nan, n_neighbors=3, weights='distance')
knn_imputed_df = pd.DataFrame(imputer.fit_transform(missing_df), columns=missing_df.columns)
knn_imputed_df

In [32]:
# Some post-processing
knn_imputed_df['sizes'] = knn_imputed_df['sizes'].round()
knn_imputed_df['bedrooms'] = knn_imputed_df['bedrooms'].round()
knn_imputed_df['bathrooms'] = knn_imputed_df['bathrooms'].round()

knn_imputed_df

Unnamed: 0,sizes,bedrooms,bathrooms,price
0,1166.0,2.0,3.0,202390.192967
1,1894.0,3.0,2.0,345763.534251
2,930.0,1.0,3.0,352219.572125
3,1695.0,1.0,1.0,268816.163312
4,2238.0,1.0,3.0,445030.888724
5,2079.0,3.0,2.0,326183.272245
6,1066.0,5.0,1.0,307000.332292
7,1838.0,4.0,1.0,300145.640927
8,930.0,1.0,3.0,352219.572125
9,2082.0,1.0,2.0,325547.369063


# Homework!

Can you design a simple MLP with **PyTorch** and train with this dataset?
Where size, bedrooms, bathrooms will be the input-features (x) and price will be the predicted output (y).

**Hint 1**: Use previously shared notebooks for help

**Hint 2**: You might want to use data normalization per feature
[<a href="https://www.geeksforgeeks.org/what-is-data-normalization-and-why-is-it-important/" target="_blank">ref</a>]

**Hint 3**: Be careful about choosing the output activation

In [7]:
# TODO