In [31]:
#importing all necessary modules

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import math
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

# Apply seaborn's default theme to every matplotlib figure drawn in this notebook.
sns.set_theme()
# Explicitly select seaborn's "deep" colour palette.
sns.set_palette("deep")
# Shared seed constant; not referenced by the cells visible here — presumably used by later modelling cells (TODO confirm).
RANDOM_SEED = 42

In [32]:
#1: selecting relevant features
#Each kept record is [month, flat_type, floor_area_sqm, flat_model, lease_commence_date, resale_price].
with open('19901999.csv', 'r') as f:
  header = next(f) #first line holds the column titles; it is read here so the loop below starts at the data
  master = [] #relevant features from the resale flat price dataset, one list per sale
  for row in f:
    lis = row.strip().split(',')
    year_month = lis[0][:4] + lis[0][5:] #drop the separator at position 4, e.g. '1990-01' -> '199001'
    master.append([
      float(year_month),
      lis[2],
      float(lis[6]),
      lis[7],
      float(lis[8]),
      float(lis[9]),
    ])

#Column positions inside each `master` row, named so later cells can index by meaning.
#The last three positions are reserved for columns appended by later cells.
(month, flat_type, floor_area_sqm, flat_model, lease_commence_date,
 resale_price, unemployment_rate, real_gdp, population_size) = range(9)

#2: encoding categorical values for flat_model and flat_type
enc = OrdinalEncoder()
X = [['1 ROOM'], ['2 ROOM'], ['3 ROOM'], ['4 ROOM'], ['5 ROOM'], ['EXECUTIVE'], ['MULTI GENERATION']]
X_2 = [['SIMPLIFIED'], ['IMPROVED'], ['MODEL A'], ['APARTMENT'], ['NEW GENERATION'], ['STANDARD'], ['MAISONETTE'], ['MODEL A-MAISONETTE'], ['TERRACE'], ['IMPROVED-MAISONETTE'], ['PREMIUM APARTMENT'], ['MULTI GENERATION'], ['2-ROOM']]

#The same encoder object is fitted twice: first on the flat types, then re-fitted on the flat models.
result = enc.fit(X).transform([[row[flat_type]] for row in master]) #flat type encoding
result2 = enc.fit(X_2).transform([[row[flat_model]] for row in master]) #flat model encoding

#Write the numeric codes back over the original text values.
for row, type_code, model_code in zip(master, result, result2):
  row[flat_type] = type_code[0]
  row[flat_model] = model_code[0]



In [33]:
#3: adding unemployment rate as a feature to the master dataset
#NOTE: this cell appends a column to every row of `master`; run it once per fresh `master`
#(re-running it appends the same column again).
import csv #NOTE(review): not used in this cell; kept in case a later cell relies on it — ideally move to the imports cell
#list of sublists separated by quarters
months = [['01', '02', '03'], ['04', '05', '06'], ['07', '08', '09'], ['10', '11', '12']]

#dic maps each quarter label ('19901Q' ... '20244Q') to the float month keys that fall in that quarter,
#eg. {'20241Q': [202401.0, 202402.0, 202403.0]}
dic = {}
for i in range(1990, 2025):
  for j in range(1, 5):
    dic[str(i)+str(j)+'Q'] = [float(str(i)+m) for m in months[j-1]]

with open('UnemploymentRateEndOfPeriodQuarterlySeasonallyAdjusted.csv', 'r') as f:
  masterr = [] #will hold 2 rows: row 0 = quarter labels such as '20241Q', row 1 = the unemployment rate for each of those quarters
  for row in f:
    row = row.strip().split(',')
    masterr.append(row[1:]) #drop the first cell (row title) of every line
  masterr = masterr[:2] #keep only the label row and the first rate row
  masterr[1] = masterr[1][1:] #the rate row's title spans two cells after the comma split; drop the leftover ' (SA)"' cell

#as the file only contains unemployment rates from 1992-2024,
#through research(https://stats.mom.gov.sg/iMAS_Tables/Times%20Series%20Table/mrsd_14_Historical_Unemployment_Rate_28Jan21.xlsx)
#we manually insert an unemployment rate of 1.7 for each quarter in 1990 and 1991.
for i in range(1, 5):
  masterr[0].append('1990'+str(i)+'Q')
  masterr[0].append('1991'+str(i)+'Q')
  masterr[1] += [1.7, 1.7]

#Pair every quarter label with its rate once; if a label were repeated, the first occurrence wins.
rate_by_quarter = {}
for label, rate in zip(masterr[0], masterr[1]):
  rate_by_quarter.setdefault(label, rate)

#finaldic maps a float month key (e.g. 199001.0) to the unemployment rate of the quarter containing it.
finaldic = {}
for label, rate in rate_by_quarter.items():
  if label in dic:
    for month_key in dic[label]:
      finaldic[month_key] = float(rate)

#Append the rate to each sale via a direct dictionary lookup on its month
#(replaces a scan over every key of finaldic for every row).
unmatched = 0
for row in master:
  rate = finaldic.get(row[month])
  if rate is None:
    unmatched += 1 #row keeps its current length, exactly as before
  else:
    row.append(rate)
if unmatched:
  print(f"Warning: {unmatched} rows had no unemployment rate for their month and were left without this column.")


In [34]:
#4: calculating remaining lease as 99 - (year of sale - lease commencement year)
#The lease_commence_date column is overwritten in place, so run this cell once per fresh `master`:
#a second run would recompute from the already-converted values.
for row in master:
  sale_year = int(str(row[month])[:4]) #first four characters of e.g. '199001.0'
  lease_start_year = int(str(row[lease_commence_date])[:4])
  result = 99 - (sale_year - lease_start_year)
  row[lease_commence_date] = result

#From here on the same column position holds the remaining lease, so give it a matching name.
remaining_lease = lease_commence_date

In [35]:
#5 inserting real gdp into the dataset
#NOTE: this cell appends a column to every row of `master`; run it once per fresh `master`.
with open('M015651.csv', 'r') as f:
  finalliz = f.readlines()

#Line 10 of the file holds the quarter labels and line 11 the GDP figures; drop each line's title cell.
year = (finalliz[10].strip().split(','))[1:]
gdp = (finalliz[11].strip().split(','))[1:]

#Keep every column up to and including '1990 1Q '. The GDP row is cut at the same position
#because values are paired with labels by position (this replaces a search for one specific GDP value).
r = year.index('1990 1Q ')
year = year[:r+1]
gdp = gdp[:r+1]

#Turn labels such as '1990 1Q ' into '19901Q' so they match the keys of `dic` (built in the unemployment cell).
year = [label[:4] + label[5:7] for label in year]

#Pair every quarter label with its GDP value once; if a label were repeated, the first occurrence wins.
gdp_by_quarter = {}
for label, value in zip(year, gdp):
  gdp_by_quarter.setdefault(label, value)

#finalfinaldic maps a float month key (e.g. 199001.0) to the real GDP of the quarter containing it.
finalfinaldic = {}
for label, value in gdp_by_quarter.items():
  if label in dic:
    for month_key in dic[label]:
      finalfinaldic[month_key] = float(value)

#Append the GDP value to each sale via a direct dictionary lookup on its month.
unmatched = 0
for row in master:
  value = finalfinaldic.get(row[month])
  if value is None:
    unmatched += 1 #row keeps its current length, exactly as before
  else:
    row.append(value)
if unmatched:
  print(f"Warning: {unmatched} rows had no GDP value for their month and were left without this column.")

data = np.array(master)

print(data)

[[1.99001e+05 0.00000e+00 3.10000e+01 ... 9.00000e+03 1.70000e+00
  1.66695e+04]
 [1.99001e+05 0.00000e+00 3.10000e+01 ... 6.00000e+03 1.70000e+00
  1.66695e+04]
 [1.99001e+05 0.00000e+00 3.10000e+01 ... 8.00000e+03 1.70000e+00
  1.66695e+04]
 ...
 [1.99912e+05 5.00000e+00 1.46000e+02 ... 4.69000e+05 2.40000e+00
  3.86775e+04]
 [1.99912e+05 5.00000e+00 1.46000e+02 ... 4.40000e+05 2.40000e+00
  3.86775e+04]
 [1.99912e+05 5.00000e+00 1.45000e+02 ... 4.84000e+05 2.40000e+00
  3.86775e+04]]


In [10]:
#6: inserting the distance to cbd
#Uses the Nominatim API to obtain latitude and longitude for the street
#Re-read the raw CSV (title line included) as lists of text cells; note that this rebinds `data`.
with open("19901999.csv",'r') as f:
  data = [row.strip('\n').split(',') for row in f]

def get_lat_lon(address, geolocator=None, max_attempts=5, base_delay=1):
    """Look up the latitude and longitude of an address.

    Parameters:
        address: text passed to the geocoder.
        geolocator: optional object with a ``geocode(address, timeout=...)`` method.
            When omitted, a Nominatim geocoder is created, as before.
        max_attempts: how many times to try when the geocoder times out.
        base_delay: seconds to wait after the first timeout; each further
            timeout waits one second longer.

    Returns:
        ``(latitude, longitude)`` when the address is found, otherwise
        ``(None, None)`` — both when the geocoder finds nothing (the address is
        printed so it can be inspected) and when every attempt timed out.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="Jupyter_AI_Project_HomeBros")

    # Every request happens inside the try block, so a timeout is always handled,
    # and the delay lives outside the loop so it really grows between attempts.
    delay = base_delay
    for attempt in range(max_attempts):
        try:
            location = geolocator.geocode(address, timeout=10)
        except GeocoderTimedOut:
            if attempt + 1 >= max_attempts:
                break  # no attempts left, so do not announce or wait for a retry
            print(f"⏱️ Timeout on attempt {attempt+1} for '{address}'... retrying in {delay}s.")
            time.sleep(delay)
            delay += 1
            continue

        if location:
            return location.latitude, location.longitude

        # Not found: print the address so the street-name clean-up rules can be extended.
        print(address)
        return None, None

    # All attempts timed out: still return a pair so callers can unpack the result.
    print(f"Giving up on '{address}' after {max_attempts} timed-out attempts.")
    return None, None


# Great-circle (haversine) distance between two coordinates, in kilometres
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, rounded to 3 decimals.

    Parameters:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        Distance in kilometres, using an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    dlat_rad = math.radians(lat2 - lat1)
    dlon_rad = math.radians(lon2 - lon1)

    # Haversine term for the two points.
    hav = (
        math.sin(dlat_rad/2)**2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon_rad/2)**2
    )
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return round(earth_radius_km * central_angle, 3)




# Selects the unique street names for further processing
# Replaces streets that no longer exist with streets near the location in the past
# Replaces short forms into full words for Nominatim search

# Ordered clean-up rules, applied top to bottom; the order matters.
#   "sub"   : replace every occurrence of the text
#   "end"   : replace the text only when the name ends with it
#   "whole" : replace the name only when it equals the text exactly
_STREET_RULES = [
    ("sub", " NTH ", " NORTH "),
    ("sub", " NTH", " NORTH"),
    ("sub", " STH ", " SOUTH "),
    ("sub", " ST ", " STREET "),
    ("sub", " RD ", " ROAD "),
    ("sub", "BT ", "BUKIT "),  # also covers " BT ", which always contains "BT "
    ("end", " ST", " STREET"),
    ("end", " RD", " ROAD"),
    ("whole", "JLN MEMBINA BARAT", "JALAN MEMBINA"),
    ("sub", "JLN ", "JALAN "),
    ("sub", "LOR ", "LORONG "),
    ("sub", " AVE ", " AVENUE "),
    ("end", " AVE", " AVENUE"),
    ("sub", " DR ", " DRIVE "),
    ("end", " DR", " DRIVE"),
    ("sub", "C'WEALTH", "COMMONWEALTH"),
    ("sub", "TG ", "TANJONG "),
    ("whole", "KG BAHRU HILL", "SPOONER ROAD"),  # Road no longer exists
    ("sub", "KG ", "KAMPONG "),
    ("sub", "UPP ", "UPPER "),
    ("whole", "BUANGKOK SOUTH FARMWAY 1", "BUANGKOK"),  # Road no longer exists
]

# Pause between geocoding requests, in seconds (the public Nominatim service asks for at most 1 request per second).
GEOCODE_PAUSE_S = 1


def _normalize_street_name(raw_name):
    """Expand the dataset's street abbreviations into the full words used for the geocoder search."""
    name = raw_name
    for kind, old, new in _STREET_RULES:
        if kind == "sub":
            name = name.replace(old, new)
        elif kind == "end":
            if name.endswith(old):
                name = name[:-len(old)] + new
        elif name == old:  # "whole"
            name = new
    return name


streets = []            # raw street names, in first-seen order
unique_streets = []     # cleaned-up names, in first-seen order (turned into records below)
normalized_by_raw = {}  # raw street name -> cleaned-up name
seen_normalized = set()
for row in data[1:]:
    raw_strt = row[4]
    if raw_strt in normalized_by_raw:
        continue  # this raw name has already been processed
    new_strt = _normalize_street_name(raw_strt)
    streets.append(raw_strt)
    normalized_by_raw[raw_strt] = new_strt

    # Several raw names can clean up to the same street; geocode each street only once.
    if new_strt not in seen_normalized:
        seen_normalized.add(new_strt)
        unique_streets.append(new_strt)


#Converts each item in unique_streets to a list in the format [street_name, latitude, longitude]
for position, address in enumerate(unique_streets):
    coords = get_lat_lon(address)
    lat, lon = coords if coords else (None, None)  # tolerate a lookup that returns nothing at all
    unique_streets[position] = [address, lat, lon]
    time.sleep(GEOCODE_PAUSE_S)

# Initialized the values for CBD's latitude and longitude
CBD = [1.2812, 103.8503]
not_located = []
for record in unique_streets:
    if record[1] is None or record[2] is None:
        # No coordinates for this street: store NaN instead of calling haversine with None.
        not_located.append(record[0])
        dist = float("nan")
    else:
        dist = haversine(CBD[0], CBD[1], record[1], record[2])
    record.append(dist)
if not_located:
    print(f"Warning: no coordinates for {len(not_located)} street(s); their distance is NaN: {not_located}")

# Adjusts main dataset to reflect dist to CBD instead of street name.
# Rows are matched through their cleaned-up street name rather than by list position, because
# `streets` and `unique_streets` differ in length whenever two raw names clean up to the same street.
record_by_name = {record[0]: record for record in unique_streets}
data[0][4] = "distance_to_CBD"
for row in data[1:]:
    row[4] = record_by_name[normalized_by_raw[row[4]]][3]

# Append the distance as a new column of `master` (run once per fresh `master`).
# NOTE(review): the column lands at whatever position is next free; the name `population_size`
# is assigned position 8 in the feature-selection cell, so confirm the intended cell order.
if len(master) != len(data) - 1:
    raise ValueError(f"master has {len(master)} rows but the CSV has {len(data) - 1} data rows; cannot align distances")
for features, source_row in zip(master, data[1:]):
    features.append(source_row[4])

for each in master[:10]:
    print(each)




KeyboardInterrupt: 

In [36]:
#7 Insert population into dataset
#NOTE: this cell appends a column to every row of `master`; run it once per fresh `master`.
with open('M810811.csv', 'r') as f:
  finalliz = f.readlines()

#Line 10 of the file holds the year labels and line 11 the population figures; drop each line's title cell.
year = (finalliz[10].strip().split(','))[1:]
population = (finalliz[11].strip().split(','))[1:]

#The figures are yearly, so every year is expanded into its 12 months and its figure repeated 12 times.
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
finalyear = [label.strip() + m for label in year for m in months]
finalpop = [pop for pop in population for _ in range(12)]

#Map each float month key (e.g. 199001.0) to its population text; if a month were repeated, the first occurrence wins.
population_by_month = {}
for label, pop in zip(finalyear, finalpop):
  population_by_month.setdefault(float(label), pop)

#Append the population to each sale via a direct dictionary lookup on its month
#(replaces a scan over every month label for every row). The figure is converted to float only when used.
unmatched = 0
for row in master:
  pop = population_by_month.get(row[month])
  if pop is None:
    unmatched += 1 #row keeps its current length, exactly as before
  else:
    row.append(float(pop))
if unmatched:
  print(f"Warning: {unmatched} rows had no population figure for their month and were left without this column.")


data = np.array(master)

print(data)


[[1.990010e+05 0.000000e+00 3.100000e+01 ... 1.700000e+00 1.666950e+04
  3.047132e+06]
 [1.990010e+05 0.000000e+00 3.100000e+01 ... 1.700000e+00 1.666950e+04
  3.047132e+06]
 [1.990010e+05 0.000000e+00 3.100000e+01 ... 1.700000e+00 1.666950e+04
  3.047132e+06]
 ...
 [1.999120e+05 5.000000e+00 1.460000e+02 ... 2.400000e+00 3.867750e+04
  3.958723e+06]
 [1.999120e+05 5.000000e+00 1.460000e+02 ... 2.400000e+00 3.867750e+04
  3.958723e+06]
 [1.999120e+05 5.000000e+00 1.450000e+02 ... 2.400000e+00 3.867750e+04
  3.958723e+06]]
