In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the five raw resale-price CSV files into separate frames.
# Each frame's name mirrors the period in the file name it is read from.
(
    df_1990to1999,
    df_2000tofeb2012,
    df_mar2012todec2014,
    df_jan2015todec2016,
    df_jan2017tonov2023,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './resale_price_data/ResaleFlatPricesBasedonApprovalDate19901999.csv',
        './resale_price_data/ResaleFlatPricesBasedonApprovalDate2000Feb2012.csv',
        './resale_price_data/ResaleFlatPricesBasedonRegistrationDateFromMar2012toDec2014.csv',
        './resale_price_data/ResaleFlatPricesBasedonRegistrationDateFromJan2015toDec2016.csv',
        './resale_price_data/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv',
    )
)

In [3]:
# Stack the five per-period frames into a single frame with a fresh
# 0..n-1 index, then drop the 'flat_model' and 'remaining_lease' columns.
period_frames = [
    df_1990to1999,
    df_2000tofeb2012,
    df_mar2012todec2014,
    df_jan2015todec2016,
    df_jan2017tonov2023,
]
df = pd.concat(period_frames, ignore_index=True)
df = df.drop(['flat_model', 'remaining_lease'], axis=1)
df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,lease_commence_date,resale_price
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,1977,9000.0
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,1977,6000.0
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,1977,8000.0
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,1977,6000.0
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,1976,47200.0
...,...,...,...,...,...,...,...,...,...
911477,2023-11,YISHUN,4 ROOM,851,YISHUN ST 81,04 TO 06,104.0,1988,520000.0
911478,2023-11,YISHUN,4 ROOM,865,YISHUN ST 81,04 TO 06,104.0,1988,520000.0
911479,2023-11,YISHUN,5 ROOM,315C,YISHUN AVE 9,04 TO 06,112.0,2015,645000.0
911480,2023-11,YISHUN,5 ROOM,342C,YISHUN RING RD,13 TO 15,113.0,2016,700000.0


In [4]:
# Collapse the hyphenated spelling onto the un-hyphenated one so that both
# variants end up under a single 'flat_type' label.
flat_type_aliases = {'MULTI-GENERATION': 'MULTI GENERATION'}
df['flat_type'] = df['flat_type'].replace(flat_type_aliases)
df['flat_type'].unique()

array(['1 ROOM', '3 ROOM', '4 ROOM', '5 ROOM', '2 ROOM', 'EXECUTIVE',
       'MULTI GENERATION'], dtype=object)

In [5]:
# Narrow storey range categories into four bands:
# '01 TO 03', '04 TO 06', '07 TO 09' and '10 OR ABOVE'.
#
# The previous rule labelled every value outside the three low bands as
# '10 OR ABOVE'. That catch-all would also swallow any other low-floor band
# (e.g. a five-storey band such as '01 TO 05' or '06 TO 10') and missing
# values. Instead, parse the numeric bounds of each "<low> TO <high>" range
# and assign the band that contains the range's midpoint.
storey_labels = ['01 TO 03', '04 TO 06', '07 TO 09', '10 OR ABOVE']

# Two capture groups -> two columns (low, high); rows that do not match the
# "<low> TO <high>" pattern become NaN in both columns.
storey_bounds = df['storey_range'].str.extract(r'^\s*(\d+)\s+TO\s+(\d+)\s*$').astype(float)
storey_midpoint = storey_bounds.mean(axis=1)

# Right-closed bins: (0, 3], (3, 6], (6, 9], (9, inf). A NaN midpoint stays NaN.
storey_band = pd.cut(
    storey_midpoint,
    bins=[0, 3, 6, 9, float('inf')],
    labels=storey_labels,
).astype(object)

# Keep the cell idempotent: on a re-run the column already holds band labels
# ('10 OR ABOVE' does not match the pattern above), so fall back to the
# existing value when it is already one of the four labels. Anything else
# that cannot be parsed is left as NaN rather than silently counted as a
# high floor.
already_banded = df['storey_range'].where(df['storey_range'].isin(storey_labels))
df['storey_range'] = storey_band.where(storey_band.notna(), already_banded)
df['storey_range'].unique()

array(['10 OR ABOVE', '04 TO 06', '07 TO 09', '01 TO 03'], dtype=object)

In [6]:
# Remaining lease: 99 minus the years elapsed between the lease commencement
# year and the calendar year of the sale month (presumably a 99-year lease
# term -- the constant is taken as-is).
sale_year = pd.to_datetime(df['month']).dt.year
lease_age_years = sale_year - df['lease_commence_date']
df['remaining_lease_years'] = 99 - lease_age_years

# Resale price divided by floor area, rounded to zero decimal places.
price_per_sqm = df['resale_price'] / df['floor_area_sqm']
df['resale_price_per_sqm'] = price_per_sqm.round(0)
df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,lease_commence_date,resale_price,remaining_lease_years,resale_price_per_sqm
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 OR ABOVE,31.0,1977,9000.0,86,290.0
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,1977,6000.0,86,194.0
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 OR ABOVE,31.0,1977,8000.0,86,258.0
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,1977,6000.0,86,194.0
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,1976,47200.0,85,647.0
...,...,...,...,...,...,...,...,...,...,...,...
911477,2023-11,YISHUN,4 ROOM,851,YISHUN ST 81,04 TO 06,104.0,1988,520000.0,64,5000.0
911478,2023-11,YISHUN,4 ROOM,865,YISHUN ST 81,04 TO 06,104.0,1988,520000.0,64,5000.0
911479,2023-11,YISHUN,5 ROOM,315C,YISHUN AVE 9,04 TO 06,112.0,2015,645000.0,91,5759.0
911480,2023-11,YISHUN,5 ROOM,342C,YISHUN RING RD,10 OR ABOVE,113.0,2016,700000.0,92,6195.0


In [7]:
# Exclude every row from 2023-11, because that month's data is incomplete.
is_incomplete_month = df['month'] == '2023-11'
df = df[~is_incomplete_month]

In [8]:
# Write the processed frame to disk; index=False keeps the row index out of the file.
processed_csv_path = './resale_price_data/ResaleFlatPrices1990to2023Processed.csv'
df.to_csv(processed_csv_path, index=False)

In [None]:
# Summary statistics of price per sqm for each month.
# Check if dash graphs are correct
price_per_sqm_by_month = df.groupby(by=['month'])['resale_price_per_sqm']
price_per_sqm_by_month.describe()