# Introduction
<a id="Introduction"></a>

In [None]:
import time
import json
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm

from scipy.stats import kruskal
from scipy.stats import anderson
from datetime import datetime


from functions import load_api_key, plotting

pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

# Primary Data
<a id="Primary-Data"></a>

In [None]:
# Load the raw rental records from the CSV in the inputs folder.
rental_data_df = pd.read_csv('inputs/RentingOutofFlats2024CSV.csv')

In [None]:
# (rows, columns) of the loaded frame.
rental_data_df.shape

In [None]:
# Preview the first rows to check column names and value formats.
rental_data_df.head()

In [None]:
# True only when no cell in the whole frame is missing.
rental_data_df.isna().sum().sum() == 0

In [None]:
# Derive calendar columns from the approval date and build a "block street" address key.
# Unparseable dates become NaT because of errors='coerce'.
approval_dates = pd.to_datetime(rental_data_df['rent_approval_date'], errors='coerce')
rental_data_df['date'] = approval_dates
rental_data_df['year'] = approval_dates.dt.year.astype('str')
rental_data_df['month'] = approval_dates.dt.month.astype('str')
# Label of the form "<year>_<month>" (month is not zero-padded).
rental_data_df['year_month'] = rental_data_df['year'].str.cat(rental_data_df['month'], sep='_')
rental_data_df['address'] = rental_data_df['block'] + ' ' + rental_data_df['street_name']

In [None]:
# Earliest and latest approval month in the data.
# The 'year_month' labels are unpadded strings, so comparing them as text is wrong
# (e.g. "2021_10" sorts before "2021_2"). Take the range from the parsed 'date'
# column instead and format it the same "<year>_<month>" way.
first_date, last_date = rental_data_df['date'].min(), rental_data_df['date'].max()
f'{first_date.year}_{first_date.month}', f'{last_date.year}_{last_date.month}'

In [None]:
# Number of rental records in each year.
rental_data_df.groupby('year').size()

In [None]:
# Count records for every (year, town) pair and dump them to a scratch CSV.
year_town_sizes = rental_data_df.groupby(['year', 'town']).size()
df = year_town_sizes.reset_index()
df.to_csv('temp.csv')

In [None]:
# Number of distinct towns in the data.
rental_data_df['town'].nunique()

In [None]:
# The five towns with the fewest records (ascending sort, first five).
rental_data_df.groupby('town').size().sort_values().head()

In [None]:
# Mean monthly rent per (year, town), then the row with the highest mean in each year.
mean_rent_by_year_town = rental_data_df.groupby(['year', 'town'])['monthly_rent'].mean()
df = mean_rent_by_year_town.reset_index()
highest_mean_rows = df.groupby('year')['monthly_rent'].idxmax()
df.loc[highest_mean_rows]

In [None]:
# The five towns with the most records (descending sort, first five).
rental_data_df.groupby('town').size().sort_values(ascending = False).head()

In [None]:
# Number of records for each flat type.
rental_data_df.groupby('flat_type').size()

# Exploratory Data Analysis
<a id="EDA"></a>

In [None]:
# Box plot of monthly rent by town, drawn with the project's `plotting` helper.
# NOTE(review): positional arguments are presumed to be (title, plot kind, data, x, y) — confirm in functions.py.
plotting('Town rents', 'boxplot', rental_data_df, 'town', 'monthly_rent')

In [None]:
# Same town/rent box plot with 'year' passed as an extra positional argument.
# NOTE(review): the sixth argument is presumably the grouping/hue column — confirm in functions.py.
plotting('Town rents across years', 'boxplot', rental_data_df, 'town', 'monthly_rent', 'year')

In [None]:
# Box plot of monthly rent by flat type and year, with flat types passed in sorted order.
flat_type_order = rental_data_df['flat_type'].sort_values().unique().tolist()
plotting(
    'Town rents across years and different room-size',
    'boxplot',
    rental_data_df,
    'flat_type',
    'monthly_rent',
    'year',
    desired_order=flat_type_order,
)

In [None]:
# Record count for every (town, year) pair, stored in a 'count' column and shown as a bar plot.
town_year_sizes = rental_data_df.groupby(['town', 'year']).size()
df = town_year_sizes.reset_index(name='count')
plotting('Data count across towns and years', 'barplot', df, 'town', 'count', 'year')

# Statistical Testing
<a id="Stats"></a>

In [None]:
# Histogram of monthly rent with 'year' passed as the category.
plotting(
    'histogram',
    'histplot',
    rental_data_df,
    'monthly_rent',
    'monthly_rent',
    category='year',
)

In [None]:
# One rent histogram per year. The x-limits come from the full dataset
# (10% padding on each side) so every year's plot shares the same range.
rent_axis_min = rental_data_df['monthly_rent'].min() * 0.9
rent_axis_max = rental_data_df['monthly_rent'].max() * 1.1

for year in rental_data_df['year'].unique():
    df = rental_data_df[rental_data_df['year'] == year]
    plotting(
        f'histogram for {year}',
        'histplot',
        df,
        'monthly_rent',
        'monthly_rent',
        x_min=rent_axis_min,
        x_max=rent_axis_max,
        y_max=5000,
    )

In [None]:
import matplotlib.pyplot as plt

# Normal Q-Q plot of monthly rent for each year, one figure per year.
for year in rental_data_df['year'].unique():
    in_year = rental_data_df['year'] == year
    data = rental_data_df[in_year]
    yearly_rent = data['monthly_rent']
    stats.probplot(yearly_rent, dist="norm", plot=plt)
    plt.title(f"Q-Q Plot of Rental Prices {year}")
    plt.show()

In [None]:


# Anderson-Darling normality test on monthly rent, run separately for each year.
# Every output line is prefixed with the year so the results can be told apart.
for year in rental_data_df['year'].unique():
    data = rental_data_df[rental_data_df['year'] == year]
    result = anderson(data['monthly_rent'])
    print(f"[{year}] Anderson-Darling Test Statistic: {result.statistic}, Critical Values: {result.critical_values}")

    # For the normal distribution scipy reports critical values at the
    # 15%, 10%, 5%, 2.5% and 1% levels, so index 2 is the 5% critical value.
    # If the test statistic is greater than it, reject the null hypothesis.
    if result.statistic > result.critical_values[2]:
        print(f"[{year}] The data is not normally distributed.")
    else:
        print(f"[{year}] The data is normally distributed.")


In [None]:
# Kruskal-Wallis test: does monthly rent differ between years?
factor = 'year'
# One array of rents per group of `factor`.
years_groups = [
    rents.values
    for _, rents in rental_data_df.groupby(factor)['monthly_rent']
]

h_stat, p_value = kruskal(*years_groups)
print(f"Kruskal-Wallis H-statistic: {h_stat}, P-value: {p_value}")

# Reject the null hypothesis of no difference between groups when p < 0.05.
print(
    "There are significant differences in rental prices across the years."
    if p_value < 0.05
    else "There are no significant differences in rental prices across the years."
)

In [None]:
# Kruskal-Wallis test: does monthly rent differ between towns?
factor = 'town'
# One array of rents per group of `factor`. The variable keeps the name
# `years_groups` from the per-year cell even though it holds per-town groups here.
years_groups = [
    rents.values
    for _, rents in rental_data_df.groupby(factor)['monthly_rent']
]

h_stat, p_value = kruskal(*years_groups)
print(f"Kruskal-Wallis H-statistic: {h_stat}, P-value: {p_value}")

# Reject the null hypothesis of no difference between groups when p < 0.05.
print(
    "There are significant differences in rental prices across the towns."
    if p_value < 0.05
    else "There are no significant differences in rental prices across the towns."
)

In [None]:
# Simple linear regression of monthly rent on year.
# Both columns are coerced to numeric; values that cannot be parsed become NaN.
rental_data_df['year'] = pd.to_numeric(rental_data_df['year'], errors='coerce')
rental_data_df['monthly_rent'] = pd.to_numeric(rental_data_df['monthly_rent'], errors='coerce')

# Predictor: numeric 'year'; response: 'monthly_rent'.
X = rental_data_df['year']
y = rental_data_df['monthly_rent']

# Add constant term (intercept)
X = sm.add_constant(X)

# Fit the model. Rows holding NaN (possible after the coercion above) are
# dropped so they cannot turn the estimates into NaN.
model = sm.OLS(y, X, missing='drop').fit()

# Get summary of regression results
print(model.summary())


# HDB to coordinates

In [None]:
# Fetch the API key through the project's `load_api_key` helper;
# it is sent as the Authorization header in the OneMap requests.
api_key = load_api_key()

In [None]:
# Geocode every unique address through the OneMap search API and copy the first
# hit's postal code and coordinates onto all rows sharing that address.
# API reference: https://www.onemap.gov.sg/apidocs/search

start = time.time()
rental_data_df['postal_code'] = None
rental_data_df['latitude'] = None
rental_data_df['longitude'] = None

headers = {"Authorization": api_key}
# Addresses for which the API returned no result; their rows keep None.
unmatched_addresses = []

for row, address in enumerate(rental_data_df['address'].unique()):
    url = f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
    response = requests.get(url, headers=headers)
    # Keep retrying after a pause until the API answers with HTTP 200.
    # NOTE(review): the 169 s wait is presumably tuned to the API's rate limit — confirm.
    while response.status_code != 200:
        time.sleep(169)
        response = requests.get(url, headers=headers)

    parsed_data = response.json()
    results = parsed_data.get('results') or []
    if results:
        top_hit = results[0]
        # Build the row mask once and reuse it for all three columns.
        address_rows = rental_data_df['address'] == address
        rental_data_df.loc[address_rows, 'postal_code'] = top_hit['POSTAL']
        rental_data_df.loc[address_rows, 'latitude'] = top_hit['LATITUDE']
        rental_data_df.loc[address_rows, 'longitude'] = top_hit['LONGITUDE']
    else:
        # An empty result list used to raise IndexError and abort the whole run.
        unmatched_addresses.append(address)

    # Progress report: addresses processed so far and elapsed seconds.
    if row % 100 == 0:
        print(row, time.time() - start)

if unmatched_addresses:
    print(f"No OneMap result for {len(unmatched_addresses)} address(es): {unmatched_addresses}")

# Total elapsed seconds (end minus start, so the value is positive).
time.time() - start

# Handling wrong postal code

In [None]:
# Reload the geocoded dataset. Postal codes are read as strings so leading zeros
# survive and the string comparisons and assignments on 'postal_code' behave as intended.
rental_data_df = pd.read_csv('inputs/rental_with_coordinates.csv', dtype={'postal_code': str})

In [None]:
# Count how many distinct addresses share each postal code.
address_postal_pairs = rental_data_df[['address', 'postal_code']]
unique_address_postal_df = address_postal_pairs.drop_duplicates(keep='first')
unique_address_postal_df['postal_code'].value_counts()

In [None]:
# Show every row whose postal code equals '530211'.
rental_data_df.loc[rental_data_df['postal_code'] == '530211']

In [None]:
# Re-query OneMap for a single address and overwrite its postal code and coordinates.
address = '21 HOUGANG AVE 3'
url = f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
headers = {"Authorization": api_key}
response = requests.get(url, headers=headers)
# Keep retrying after a pause until the API answers with HTTP 200.
while response.status_code != 200:
    time.sleep(169)
    response = requests.get(url, headers=headers)

parsed_data = response.json()
results = parsed_data.get('results') or []
if results:
    top_hit = results[0]
    # Build the row mask once and reuse it for all three columns.
    address_rows = rental_data_df['address'] == address
    rental_data_df.loc[address_rows, 'postal_code'] = top_hit['POSTAL']
    rental_data_df.loc[address_rows, 'latitude'] = top_hit['LATITUDE']
    rental_data_df.loc[address_rows, 'longitude'] = top_hit['LONGITUDE']
else:
    # An empty result list used to raise IndexError; the existing values are kept instead.
    print(f"No OneMap result for {address!r}; existing values left unchanged.")

In [None]:
# Show the rows for the address currently held in `address`.
rental_data_df.loc[rental_data_df['address'] == address]

In [None]:
# Hard-code the postal code and coordinates for the address held in `address`.
target_rows = rental_data_df['address'] == address
corrected_values = {
    'postal_code': '530021',
    'latitude': 1.36424,
    'longitude': 103.8914777,
}
for column, value in corrected_values.items():
    rental_data_df.loc[target_rows, column] = value

In [None]:
# Show the same address rows again to verify the manual override.
rental_data_df.loc[rental_data_df['address'] == address]

In [None]:
# Display the full frame for a visual check.
rental_data_df

In [None]:
# Write the frame back to the geocoded CSV, without the index column.
rental_data_df.to_csv('inputs/rental_with_coordinates.csv', index = False)