<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 2: Prediction of HDB Resale Flat Prices
Author: Edmond Ang

## Contents
1. [Baseline Model Data Cleaning](#1.-Baseline-Model-Data-Cleaning)

---
## 1. Baseline Model Data Cleaning
---

* Apply the same cleaning steps used on the baseline training data to the test data, so that the baseline model can generate predictions from it

### Imports

In [59]:
import numpy as np
import pandas as pd
import math
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statistics import mean 

### Read test data

In [60]:
# Show every column when a wide frame is printed.
pd.set_option('display.max_columns', None)

# Load the raw test set into the notebook-wide working frame.
hdb_df = pd.read_csv('datasets/test.csv')

# Quick look at the dimensions and the first few rows.
print(hdb_df.shape)
print(hdb_df.head())  # test data has no resale_price column

(16737, 76)
       id Tranc_YearMonth         town flat_type block          street_name  \
0  114982         2012-11       YISHUN    4 ROOM   173         YISHUN AVE 7   
1   95653         2019-08  JURONG WEST    5 ROOM  986C    JURONG WEST ST 93   
2   40303         2013-10   ANG MO KIO    3 ROOM   534    ANG MO KIO AVE 10   
3  109506         2017-10    WOODLANDS    4 ROOM    29         MARSILING DR   
4  100149         2016-08  BUKIT BATOK    4 ROOM   170  BT BATOK WEST AVE 8   

  storey_range  floor_area_sqm         flat_model  lease_commence_date  \
0     07 TO 09            84.0         Simplified                 1987   
1     04 TO 06           112.0  Premium Apartment                 2008   
2     07 TO 09            68.0     New Generation                 1980   
3     01 TO 03            97.0     New Generation                 1979   
4     16 TO 18           103.0            Model A                 1985   

   Tranc_Year  Tranc_Month  mid_storey  lower  upper  mid  \
0      

  hdb_df = pd.read_csv('datasets/test.csv')


### Change all NaN values in the 'X within Y distance' features to 0

In [61]:
# 'X within Y distance' columns: a missing value is replaced by 0.
# A single vectorized fillna over all six columns replaces six copy-pasted
# per-element list comprehensions and keeps each column's numeric dtype.
within_distance_cols = [
    'Mall_Within_500m', 'Mall_Within_1km', 'Mall_Within_2km',
    'Hawker_Within_500m', 'Hawker_Within_1km', 'Hawker_Within_2km',
]
hdb_df[within_distance_cols] = hdb_df[within_distance_cols].fillna(0)

### Change all 'Y' and 'N' values to 1 and 0 respectively

In [62]:
# Encode the Y/N flag columns as integers.
# Only the exact string "N" becomes 0; every other value (including a missing
# one) becomes 1, so this cell must not be re-run on already-encoded columns.
for flag_col in ('residential', 'commercial', 'market_hawker',
                 'multistorey_carpark', 'precinct_pavilion'):
    hdb_df[flag_col] = np.where(hdb_df[flag_col] == "N", 0, 1)

### Drop location-name, coordinate, and other unused features

In [63]:
# Remove the columns that are not used by the baseline model.
# 'block' was listed twice in the original list; each column now appears once.
hdb_df = hdb_df.drop(columns=[
    # flat attributes
    'lease_commence_date', 'flat_type', 'flat_model', 'storey_range',
    'mid', 'lower', 'upper', 'floor_area_sqft',
    # address / name columns
    'block', 'street_name', 'address', 'postal', 'planning_area',
    'mrt_name', 'bus_stop_name',
    # latitude / longitude coordinates
    'pri_sch_latitude', 'pri_sch_longitude',
    'sec_sch_latitude', 'sec_sch_longitude',
    'bus_stop_latitude', 'bus_stop_longitude',
    'mrt_latitude', 'mrt_longitude',
    'Longitude', 'Latitude',
])

### Drop 'Tranc_YearMonth' feature

In [64]:
# Remove the combined year-month column; the head() output shows separate
# Tranc_Year and Tranc_Month columns, which are kept.
hdb_df = hdb_df.drop(labels='Tranc_YearMonth', axis='columns')

### Drop rows with NaN values in 'Mall_Nearest_Distance'

In [65]:
# Keep only the rows that have a value for 'Mall_Nearest_Distance'.
# NOTE(review): this removes rows from the *test* set, so the removed ids will
# have no prediction downstream - confirm that is acceptable.
hdb_df = hdb_df.dropna(subset=['Mall_Nearest_Distance'])

### Convert remaining categorical variables into dummy/indicator variables

In [66]:
# One-hot encode the remaining categorical columns.
# NOTE(review): dummy columns are derived from the categories present in this
# frame only; presumably they need to line up with the training frame's
# columns - verify names, not just the column count.
hdb_df = pd.get_dummies(
    data=hdb_df,
    columns=[
        'town',
        'pri_sch_name',
        'sec_sch_name',
        'full_flat_type',
    ],
)

### Sanity check: making sure that the number of features matches baseline train dataset

In [67]:
# Number of columns after cleaning (second entry of the frame's shape).
hdb_df.shape[1]

422

### Export cleaned data to CSV file

In [68]:
# Write the cleaned test frame to <newpath>/cleaned_baseline_hdb_test.csv.
newpath = 'output'

# exist_ok=True creates the directory only when needed, without the separate
# exists-then-create check (which can fail if the directory appears in between).
os.makedirs(newpath, exist_ok=True)

# Build the destination from `newpath` so the directory name is defined once.
# to_csv is called with its defaults, as in the original cell.
hdb_df.to_csv(os.path.join(newpath, 'cleaned_baseline_hdb_test.csv'))