In [None]:
import os
import json
import requests
import time 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime

from functions import load_api_key, plotting

# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Automatically reload imported modules before each cell runs, so edits to the
# local `functions` module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Read the three input tables, in order: HDB property information, rental
# records with coordinates, and travelling distances.
hdb_df, rental_data_df, time_distance_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'inputs/HDBPropertyInformation.csv',
        'inputs/rental_with_coordinates.csv',
        'inputs/travelling_distance.csv',
    )
)

# Merging HDBPropertyInformation and rental_with_coordinates

In [None]:
# Build the join key "<blk_no> <street>" used to match HDB blocks to rentals.
hdb_df['address'] = hdb_df['blk_no'].add(' ').add(hdb_df['street'])

In [None]:
# Attach building-level attributes from the HDB property table to each rental
# record, matching on the combined block/street `address` key.
hdb_attribute_cols = ['max_floor_lvl', 'year_completed', 'residential', 'commercial',
                      'market_hawker', 'miscellaneous', 'multistorey_carpark',
                      'precinct_pavilion', 'address']

# Keep a single HDB row per address: a repeated address on the right-hand side
# would make the left join emit one output row per duplicate and silently
# inflate the number of rental records.
hdb_attributes = hdb_df[hdb_attribute_cols].drop_duplicates(subset='address')

# how='left' keeps every rental row even when no HDB block matches (those rows
# get NaN attributes); validate documents the many-rentals-to-one-block shape.
rental_data_df = pd.merge(rental_data_df,
                          hdb_attributes,
                          how = 'left',
                          on = 'address',
                          validate = 'many_to_one')

In [None]:
# (rows, columns) of the rental table at this point in the notebook.
rental_data_df.shape

In [None]:
# Number of missing values in each column of the rental table.
rental_data_df.isnull().sum()

In [None]:
# Rental rows with no `max_floor_lvl` value.
rental_data_df.loc[rental_data_df['max_floor_lvl'].isnull()]

In [None]:
# Building age in years, measured from the current calendar year (so the value
# depends on when the notebook is run).
current_year = datetime.now().year
rental_data_df['age'] = rental_data_df['year_completed'].rsub(current_year)
rental_data_df['age'].describe()

In [None]:
# Bucket building age into fixed-width, left-closed intervals: [0, 5), [5, 10), ...
age_bin_width = 5
rental_data_df['age'] = pd.to_numeric(rental_data_df['age'], errors='coerce')

# The intervals are left-closed (right=False), so the final edge must be
# strictly greater than the maximum age. Rounding up past the maximum keeps a
# maximum that is an exact multiple of the width (e.g. 60) inside the last bin
# instead of leaving it unbinned (NaN).
max_age = int(rental_data_df['age'].max())
last_edge = (max_age // age_bin_width + 1) * age_bin_width
bins = range(0, last_edge + age_bin_width, age_bin_width)
rental_data_df['age_group'] = pd.cut(rental_data_df['age'], bins=bins, right=False)

## EDA: rent by building age and amenities

In [None]:
# Box plot relating `monthly_rent` to `age_group` over the whole rental table.
# NOTE(review): `plotting` comes from the local `functions` module; the argument
# order (title, chart kind, data, x, y[, hue]) is inferred from the call sites —
# confirm against its definition.
plotting('Town rents across age groups', 'boxplot', rental_data_df, 'age_group', 'monthly_rent')

In [None]:
# Box plot of `monthly_rent` per `town`, with `age_group` passed as the extra
# (presumably hue/grouping) argument — confirm against `functions.plotting`.
plotting('Town rents across age groups in each town', 'boxplot', rental_data_df, 'town', 'monthly_rent', 'age_group')

In [None]:
# Amenity columns brought in from the HDB property table; one box plot of
# monthly rent per town is drawn for each of them.
amenties = [
    'commercial',
    'market_hawker',
    'miscellaneous',
    'multistorey_carpark',
    'precinct_pavilion',
]
for amenity in amenties:
    chart_title = f'Town rents across {amenity} in each town'
    plotting(chart_title, 'boxplot', rental_data_df, 'town', 'monthly_rent', amenity)

# Merging again with travelling_distance

In [None]:
# Preview the first rows of the travelling-distance table.
time_distance_df.head()

In [None]:
# Count of distinct postal codes present in the travelling-distance table.
time_distance_df['postal_code'].nunique()

In [None]:
# For every location (postal code + coordinates) keep only the row with the
# smallest `walking_time_s`.
location_keys = ['postal_code', 'latitude', 'longitude']
fastest_row_labels = time_distance_df.groupby(location_keys)['walking_time_s'].idxmin()
fastest_time_df = time_distance_df.loc[fastest_row_labels]

In [None]:
# Cast the rental postal codes to integers before they are used as a merge key.
rental_data_df = rental_data_df.astype({'postal_code': int})

In [None]:
# Inner join of rentals with the fastest travel row for the same location;
# rental rows without a matching (postal_code, latitude, longitude) are dropped.
main_rental_data_df = pd.merge(
    rental_data_df,
    fastest_time_df,
    how='inner',
    on=['postal_code', 'latitude', 'longitude'],
)

In [None]:
# List the columns of the merged table (useful for spotting suffixed duplicates
# such as *_x / *_y produced by the merge).
main_rental_data_df.columns

In [None]:
# Bucket walking time into fixed-width, left-closed intervals: [0, 300), [300, 600), ...
main_rental_data_df['walking_time_s'] = pd.to_numeric(main_rental_data_df['walking_time_s'], errors='coerce')
interval = 300  # bucket width, in the units of `walking_time_s` (seconds, going by the column name)

# The intervals are left-closed (right=False), so the final edge must be
# strictly greater than the maximum walking time. Rounding up past the maximum
# keeps a maximum that is an exact multiple of the interval (e.g. 600) inside
# the last bin instead of leaving it unbinned (NaN).
max_walking_time = int(main_rental_data_df['walking_time_s'].max())
last_edge = (max_walking_time // interval + 1) * interval
bins = range(0, last_edge + interval, interval)
main_rental_data_df['walking_time_interval'] = pd.cut(main_rental_data_df['walking_time_s'], bins=bins, right=False)

## EDA: rent by walking time

In [None]:
# Display the merged table (pandas truncates the rendered rows for large frames;
# all columns are shown because display.max_columns is set to None).
main_rental_data_df

In [None]:
# Box plot of `monthly_rent` per `walking_time_interval`, with `year` passed as
# the extra (presumably hue/grouping) argument — confirm against `functions.plotting`.
plotting('Town rents across walking time', 'boxplot', main_rental_data_df, 'walking_time_interval', 'monthly_rent', 'year')

In [None]:
# One walking-time box plot per town, in order of first appearance.
for town_name in main_rental_data_df['town'].unique():
    town_rows = main_rental_data_df.loc[main_rental_data_df['town'] == town_name]
    # '/' is swapped for '_' in the title — presumably because `plotting` uses
    # the title as a file name; confirm against its definition.
    title_town = town_name.replace('/', '_')
    plotting(f'Town rents across walking time in {title_town}', 'boxplot', town_rows, 'walking_time_interval', 'monthly_rent', 'year')