# **Import modules & fetch data from Open Data Melbourne**

In [1]:
import getpass
import os
import warnings

import pandas as pd
import requests


# define function to fetch data from website using API

def fetch_data(base_url, dataset, api_key, num_records = 99, offset = 0):
    """Download the records of one dataset, page by page, into a DataFrame.

    Requests ``{base_url}{dataset}/records`` repeatedly with ``limit`` /
    ``offset`` / ``api_key`` query parameters, advancing the offset by
    ``num_records`` each time. Paging stops when a page has no ``results``
    key, when a page is shorter than ``num_records``, or when the next offset
    would exceed the hard cap of 9900.

    Parameters
    ----------
    base_url : str
        Catalogue URL; the dataset id is appended directly to it.
    dataset : str
        Dataset identifier.
    api_key : str
        Value sent as the ``api_key`` query parameter.
    num_records : int, default 99
        Page size (``limit``). Must be positive.
    offset : int, default 0
        Offset of the first page to request.

    Returns
    -------
    pandas.DataFrame
        One row per downloaded record (empty if nothing was downloaded).

    Raises
    ------
    ValueError
        If ``num_records`` is not positive.
    Exception
        If a request fails, returns an HTTP error status, or the body is not
        valid JSON. The original error is chained as ``__cause__``.
    """
    # A non-positive page size would never advance the offset and never yield a
    # "short" page, so the loop below would spin forever.
    if num_records <= 0:
        raise ValueError(f"num_records must be a positive integer, got {num_records!r}")

    max_offset = 9900  # largest offset that will be requested
    endpoint = f'{base_url}{dataset}/records'
    all_records = []

    while offset <= max_offset:
        # Let requests build the query string so every value is URL-encoded.
        params = {'limit': num_records, 'offset': offset, 'api_key': api_key}

        try:
            result = requests.get(endpoint, params=params, timeout=10)
            result.raise_for_status()
            records = result.json().get('results')
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON error is not a RequestException.
            raise Exception(f"API request failed: {e}") from e

        if records is None:
            break
        all_records.extend(records)

        # A short page means the dataset is exhausted.
        if len(records) < num_records:
            break

        # next cycle offset
        offset += num_records
    else:
        # Reached only when the offset cap ended the loop: every page so far
        # was full, so more records may exist than were downloaded.
        warnings.warn(
            f"fetch_data stopped at the offset cap ({max_offset}) for dataset "
            f"'{dataset}'; the returned {len(all_records)} records may be incomplete.",
            stacklevel=2,
        )

    # DataFrame all data
    return pd.DataFrame(all_records)

# Take the key from the MELBOURNE_API_KEY environment variable; prompt only when
# it is missing or empty. Passing `input(...)` as the default of `os.environ.get`
# would evaluate it eagerly (prompting on every run) and echo the key into the
# saved cell output, so `getpass` is used to prompt without echo.
API_KEY = os.environ.get('MELBOURNE_API_KEY') or getpass.getpass("Please enter your API key: ")
BASE_URL = 'https://data.melbourne.vic.gov.au/api/explore/v2.1/catalog/datasets/'

Please enter your API key: [REDACTED]


# **Dataset:** Development activity monitor

## Load dataset

In [2]:
# download the 'Development activity monitor' dataset through fetch_data
DEV_ACTIVITY = 'development-activity-monitor'

df_development = fetch_data(BASE_URL, DEV_ACTIVITY, API_KEY)

df_development.head()

Unnamed: 0,data_format,development_key,status,year_completed,clue_small_area,clue_block,street_address,property_id,property_id_2,property_id_3,...,hospital_flr,recreation_flr,publicdispaly_flr,community_flr,car_spaces,bike_spaces,town_planning_application,longitude,latitude,geopoint
0,Pre May 16,X000479,COMPLETED,2006,North Melbourne,342,191-201 Abbotsford Street NORTH MELBOURNE VIC ...,100023,,,...,0,0,0,0,0,0,0,144.94503,-37.802822,"{'lon': 144.9450298, 'lat': -37.80282184}"
1,Pre May 16,X000459,COMPLETED,2005,North Melbourne,333,218-224 Abbotsford Street NORTH MELBOURNE VIC ...,100119,,,...,0,0,0,0,0,0,0,144.945947,-37.802049,"{'lon': 144.9459475, 'lat': -37.80204879}"
2,Pre May 16,X000573,COMPLETED,2013,West Melbourne (Residential),414,56-62 Abbotsford Street WEST MELBOURNE VIC 3003,100144,,,...,0,0,0,0,0,0,0,144.944719,-37.806791,"{'lon': 144.9447186, 'lat': -37.80679128}"
3,Pre May 16,X000563,COMPLETED,2014,West Melbourne (Residential),409,1-9 Stawell Street WEST MELBOURNE VIC 3003,100441,,,...,0,0,0,0,0,28,0,144.942096,-37.806072,"{'lon': 144.9420962, 'lat': -37.80607242}"
4,Pre May 16,X000997,COMPLETED,2007,North Melbourne,1012,229-235 Arden Street NORTH MELBOURNE VIC 3051,100556,,,...,0,0,0,0,0,0,0,144.939286,-37.800374,"{'lon': 144.9392856, 'lat': -37.80037382}"


In [3]:
df_development.isna().sum()

data_format                       0
development_key                   0
status                            0
year_completed                  380
clue_small_area                   0
clue_block                        0
street_address                    0
property_id                       0
property_id_2                  1239
property_id_3                  1367
property_id_4                  1393
property_id_5                  1403
floors_above                      0
resi_dwellings                    0
studio_dwe                        0
one_bdrm_dwe                      0
two_bdrm_dwe                      0
three_bdrm_dwe                    0
student_apartments                0
student_beds                      0
student_accommodation_units       0
institutional_accom_beds          0
hotel_rooms                       0
serviced_apartments               0
hotels_serviced_apartments        0
hostel_beds                       0
childcare_places                  0
office_flr                  

## Data cleaning

### Fill 'property_id_5' and remove unwanted columns

In [4]:
# work on a copy so that 'df_development' stays untouched
df_dev_filled = df_development.copy()

# coalesce the property ids into 'property_id_5': wherever it is missing, fall back to
# 'property_id_4', then 'property_id_3', then 'property_id_2' and finally 'property_id'
fallback_cols = ['property_id_4', 'property_id_3', 'property_id_2', 'property_id']
coalesced_id = df_dev_filled['property_id_5']
for fallback in fallback_cols:
    coalesced_id = coalesced_id.fillna(df_dev_filled[fallback])
df_dev_filled['property_id_5'] = coalesced_id

# the fallback columns are now redundant; remove them together with the other unwanted columns
unwanted_cols = fallback_cols + ['data_format', 'development_key', 'town_planning_application']
df_dev_filled = df_dev_filled.drop(columns = unwanted_cols)

len(df_dev_filled)

1406

In [5]:
df_dev_filled.head()

Unnamed: 0,status,year_completed,clue_small_area,clue_block,street_address,property_id_5,floors_above,resi_dwellings,studio_dwe,one_bdrm_dwe,...,education_flr,hospital_flr,recreation_flr,publicdispaly_flr,community_flr,car_spaces,bike_spaces,longitude,latitude,geopoint
0,COMPLETED,2006,North Melbourne,342,191-201 Abbotsford Street NORTH MELBOURNE VIC ...,100023,2,17,0,0,...,0,0,0,0,0,0,0,144.94503,-37.802822,"{'lon': 144.9450298, 'lat': -37.80282184}"
1,COMPLETED,2005,North Melbourne,333,218-224 Abbotsford Street NORTH MELBOURNE VIC ...,100119,3,12,0,0,...,0,0,0,0,0,0,0,144.945947,-37.802049,"{'lon': 144.9459475, 'lat': -37.80204879}"
2,COMPLETED,2013,West Melbourne (Residential),414,56-62 Abbotsford Street WEST MELBOURNE VIC 3003,100144,4,3,0,0,...,0,0,0,0,0,0,0,144.944719,-37.806791,"{'lon': 144.9447186, 'lat': -37.80679128}"
3,COMPLETED,2014,West Melbourne (Residential),409,1-9 Stawell Street WEST MELBOURNE VIC 3003,100441,3,28,0,17,...,0,0,0,0,0,0,28,144.942096,-37.806072,"{'lon': 144.9420962, 'lat': -37.80607242}"
4,COMPLETED,2007,North Melbourne,1012,229-235 Arden Street NORTH MELBOURNE VIC 3051,100556,2,0,0,0,...,0,0,0,0,0,0,0,144.939286,-37.800374,"{'lon': 144.9392856, 'lat': -37.80037382}"


In [6]:
df_dev_filled.isna().sum()

status                           0
year_completed                 380
clue_small_area                  0
clue_block                       0
street_address                   0
property_id_5                    0
floors_above                     0
resi_dwellings                   0
studio_dwe                       0
one_bdrm_dwe                     0
two_bdrm_dwe                     0
three_bdrm_dwe                   0
student_apartments               0
student_beds                     0
student_accommodation_units      0
institutional_accom_beds         0
hotel_rooms                      0
serviced_apartments              0
hotels_serviced_apartments       0
hostel_beds                      0
childcare_places                 0
office_flr                       0
retail_flr                       0
industrial_flr                   0
storage_flr                      0
education_flr                    0
hospital_flr                     0
recreation_flr                   0
publicdispaly_flr   

In [7]:
# 'property_id_5' now holds the single coalesced id, so give it the plain name 'property_id'
df_dev_filled = df_dev_filled.rename(columns = {'property_id_5': 'property_id'})

# check the dataframe
df_dev_filled.head()

Unnamed: 0,status,year_completed,clue_small_area,clue_block,street_address,property_id,floors_above,resi_dwellings,studio_dwe,one_bdrm_dwe,...,education_flr,hospital_flr,recreation_flr,publicdispaly_flr,community_flr,car_spaces,bike_spaces,longitude,latitude,geopoint
0,COMPLETED,2006,North Melbourne,342,191-201 Abbotsford Street NORTH MELBOURNE VIC ...,100023,2,17,0,0,...,0,0,0,0,0,0,0,144.94503,-37.802822,"{'lon': 144.9450298, 'lat': -37.80282184}"
1,COMPLETED,2005,North Melbourne,333,218-224 Abbotsford Street NORTH MELBOURNE VIC ...,100119,3,12,0,0,...,0,0,0,0,0,0,0,144.945947,-37.802049,"{'lon': 144.9459475, 'lat': -37.80204879}"
2,COMPLETED,2013,West Melbourne (Residential),414,56-62 Abbotsford Street WEST MELBOURNE VIC 3003,100144,4,3,0,0,...,0,0,0,0,0,0,0,144.944719,-37.806791,"{'lon': 144.9447186, 'lat': -37.80679128}"
3,COMPLETED,2014,West Melbourne (Residential),409,1-9 Stawell Street WEST MELBOURNE VIC 3003,100441,3,28,0,17,...,0,0,0,0,0,0,28,144.942096,-37.806072,"{'lon': 144.9420962, 'lat': -37.80607242}"
4,COMPLETED,2007,North Melbourne,1012,229-235 Arden Street NORTH MELBOURNE VIC 3051,100556,2,0,0,0,...,0,0,0,0,0,0,0,144.939286,-37.800374,"{'lon': 144.9392856, 'lat': -37.80037382}"


### Fix duplicated values in 'property_id', if any

In [8]:
# flag every row whose 'property_id' occurs more than once (all occurrences are kept)
is_repeated_id = df_dev_filled.duplicated(subset = 'property_id', keep = False)
dup = df_dev_filled[is_repeated_id]

# order the flagged rows by 'property_id' (ascending) and, within each id,
# by 'year_completed' (descending)
dup_sort_keys = ['property_id', 'year_completed']
dup_sorted = dup.sort_values(by = dup_sort_keys, ascending = [True, False])

# display rows where 'property_id' has duplicates
print(dup_sorted)

len(dup_sorted)

                  status year_completed               clue_small_area  \
460   UNDER CONSTRUCTION           None  West Melbourne (Residential)   
1383  UNDER CONSTRUCTION           None  West Melbourne (Residential)   
307            COMPLETED           2019  West Melbourne (Residential)   
776            COMPLETED           2019  West Melbourne (Residential)   
435             APPROVED           None  West Melbourne (Residential)   
...                  ...            ...                           ...   
1177           COMPLETED           2017  West Melbourne (Residential)   
343            COMPLETED           2023               Melbourne (CBD)   
1281           COMPLETED           2023               Melbourne (CBD)   
1274           COMPLETED           2023               Melbourne (CBD)   
1275           COMPLETED           2023               Melbourne (CBD)   

      clue_block                                   street_address property_id  \
460          412  17-37 Abbotsford Street 

373

In [9]:
# drop the repeated 'property_id' rows from 'df_dev_filled', keeping only the first row
# of each id as ordered in 'dup_sorted' (sorted by 'year_completed' descending within each id)
# NOTE(review): rows are not filtered by 'status' here; 'UNDER CONSTRUCTION' rows
# are still present in the result printed below

df_dev_clean = df_dev_filled.drop(dup_sorted[dup_sorted.duplicated(subset = 'property_id', keep = 'first')].index)

print(df_dev_clean)

                  status year_completed               clue_small_area  \
0              COMPLETED           2006               North Melbourne   
1              COMPLETED           2005               North Melbourne   
2              COMPLETED           2013  West Melbourne (Residential)   
3              COMPLETED           2014  West Melbourne (Residential)   
4              COMPLETED           2007               North Melbourne   
...                  ...            ...                           ...   
1401  UNDER CONSTRUCTION           None                       Carlton   
1402  UNDER CONSTRUCTION           None                East Melbourne   
1403  UNDER CONSTRUCTION           None                     Southbank   
1404  UNDER CONSTRUCTION           None                East Melbourne   
1405  UNDER CONSTRUCTION           None               North Melbourne   

      clue_block                                     street_address  \
0            342  191-201 Abbotsford Street NORTH ME

In [10]:
# check whether any duplicated values remain in 'property_id' after the drop

dup_2 = df_dev_clean[df_dev_clean.duplicated(subset = 'property_id', keep = False)]

In [11]:
print(dup_2)

len(dup_2)

Empty DataFrame
Columns: [status, year_completed, clue_small_area, clue_block, street_address, property_id, floors_above, resi_dwellings, studio_dwe, one_bdrm_dwe, two_bdrm_dwe, three_bdrm_dwe, student_apartments, student_beds, student_accommodation_units, institutional_accom_beds, hotel_rooms, serviced_apartments, hotels_serviced_apartments, hostel_beds, childcare_places, office_flr, retail_flr, industrial_flr, storage_flr, education_flr, hospital_flr, recreation_flr, publicdispaly_flr, community_flr, car_spaces, bike_spaces, longitude, latitude, geopoint]
Index: []

[0 rows x 35 columns]


0

In [12]:
# TODO: remove 'office_flr', 'retail_flr', 'industrial_flr', 'storage_flr', 'education_flr', 'hospital_flr', 'recreation_flr', 'publicdispaly_flr', 'community_flr' (not implemented yet; these columns are still present in the frames below)


---

In [13]:
# Colab-only step: mount Google Drive at /content/gdrive so files can be written to it
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [14]:
# export the filled development frame to Google Drive as CSV (without the index)
# NOTE(review): this writes 'df_dev_filled', which still contains the repeated 'property_id'
# rows; the de-duplicated frame is 'df_dev_clean'. Confirm this is the intended export
df_dev_filled.to_csv('/content/gdrive/My Drive/Colab Notebooks/df_dev_filled.csv', index = False)

# **Dataset:** Residential dwellings

## Load dataset

In [15]:
# download the 'Residential dwellings' dataset through fetch_data
# NOTE(review): fetch_data stops paging once the offset exceeds 9900, and the length
# reported below is exactly 9999, so this is probably a partial download. TODO confirm
RES_DWELLINGS = 'residential-dwellings'

df_residential = fetch_data(BASE_URL, RES_DWELLINGS, API_KEY)

df_residential.head()

Unnamed: 0,census_year,block_id,property_id,base_property_id,building_address,clue_small_area,dwelling_type,dwelling_number,longitude,latitude,location
0,2017,355,107908,107908,463 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.952007,-37.803558,"{'lon': 144.95200710155, 'lat': -37.8035582919..."
1,2017,355,107910,107910,467 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.951877,-37.803541,"{'lon': 144.95187690720002, 'lat': -37.8035406..."
2,2017,355,107913,107913,477 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.951609,-37.803527,"{'lon': 144.95160858014998, 'lat': -37.8035273..."
3,2017,355,107914,107914,479 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.95155,-37.803521,"{'lon': 144.95154960515, 'lat': -37.8035207254..."
4,2017,355,107915,107915,481 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.951488,-37.803514,"{'lon': 144.95148784765, 'lat': -37.8035138005..."


In [16]:
df_residential.isna().sum()   # check if there are nulls

census_year          0
block_id             0
property_id          0
base_property_id     0
building_address     0
clue_small_area      0
dwelling_type        0
dwelling_number      0
longitude           43
latitude            43
location            43
dtype: int64

In [17]:
len(df_residential)

9999

## Data cleaning

In [18]:
# remove the 'base_property_id' column; the result is a new frame
df_res_dropped = df_residential.drop(columns = 'base_property_id')

df_res_dropped.head()

Unnamed: 0,census_year,block_id,property_id,building_address,clue_small_area,dwelling_type,dwelling_number,longitude,latitude,location
0,2017,355,107908,463 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.952007,-37.803558,"{'lon': 144.95200710155, 'lat': -37.8035582919..."
1,2017,355,107910,467 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.951877,-37.803541,"{'lon': 144.95187690720002, 'lat': -37.8035406..."
2,2017,355,107913,477 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.951609,-37.803527,"{'lon': 144.95160858014998, 'lat': -37.8035273..."
3,2017,355,107914,479 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.95155,-37.803521,"{'lon': 144.95154960515, 'lat': -37.8035207254..."
4,2017,355,107915,481 Queensberry Street NORTH MELBOURNE 3051,North Melbourne,House/Townhouse,1,144.951488,-37.803514,"{'lon': 144.95148784765, 'lat': -37.8035138005..."


In [19]:
# de-duplicate on 'property_id': when an id occurs more than once,
# keep only the row with the latest 'census_year'

# order by 'property_id' (ascending) with the latest 'census_year' first inside each id ...
res_sort_keys = ['property_id', 'census_year']
df_res_sorted = df_res_dropped.sort_values(by = res_sort_keys, ascending = [True, False])

# ... so every row after the first one of an id is an older record to discard
is_older_res_record = df_res_sorted.duplicated(subset = 'property_id', keep = 'first')
df_res_cleaned = df_res_sorted[~is_older_res_record]

print(df_res_cleaned)

     census_year  block_id property_id  \
5995        2006       528           1   
3574        2019       411      100001   
5245        2006       342      100011   
4660        2019       342      100014   
4661        2019       342      100015   
...          ...       ...         ...   
3221        2019       346      681458   
1766        2018      2531      681562   
3949        2019       527      681850   
3170        2019      1108      688500   
4671        2019       342      690277   

                                    building_address  \
5995          23-25 Wakefield Street KENSINGTON 3031   
3574      1-13 Abbotsford Street WEST MELBOURNE 3003   
5245  151-153 Abbotsford Street NORTH MELBOURNE 3051   
4660  163-165 Abbotsford Street NORTH MELBOURNE 3051   
4661      167 Abbotsford Street NORTH MELBOURNE 3051   
...                                              ...   
3221          17 Purcell Street NORTH MELBOURNE 3051   
1766            17 Nottingham Street KENSINGTON

In [20]:
df_res_cleaned.isna().sum()

census_year          0
block_id             0
property_id          0
building_address     0
clue_small_area      0
dwelling_type        0
dwelling_number      0
longitude           24
latitude            24
location            24
dtype: int64

# **Dataset:** Building information

## Load dataset

In [21]:
# download the 'Building information' dataset through fetch_data
# NOTE(review): fetch_data stops paging once the offset exceeds 9900, and the length
# reported below is exactly 9999, so this is probably a partial download. TODO confirm
BUILDING_INFO = 'buildings-with-name-age-size-accessibility-and-bicycle-facilities'

df_bldg_info = fetch_data(BASE_URL, BUILDING_INFO, API_KEY)

df_bldg_info.head()

Unnamed: 0,census_year,block_id,property_id,base_property_id,building_name,street_address,clue_small_area,construction_year,refurbished_year,number_of_floors_above_ground,predominant_space_use,accessibility_type,accessibility_type_description,accessibility_rating,bicycle_spaces,has_showers,longitude,latitude,location
0,2003,608,104285,104285,,84 George Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987521,-37.815299,"{'lon': 144.9875211528448, 'lat': -37.81529934..."
1,2003,608,104287,104287,,80 George Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987677,-37.815316,"{'lon': 144.98767742034823, 'lat': -37.8153163..."
2,2003,608,104947,104947,,81-83 Hotham Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987589,-37.814577,"{'lon': 144.98758896457355, 'lat': -37.8145774..."
3,2003,608,104948,104948,,87 Hotham Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987486,-37.814572,"{'lon': 144.9874861960862, 'lat': -37.81457155..."
4,2003,608,104949,104949,,89 Hotham Street EAST MELBOURNE 3002,East Melbourne,,,2.0,Residential Apartment,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987408,-37.814593,"{'lon': 144.98740820504202, 'lat': -37.8145933..."


In [22]:
df_bldg_info.isna().sum()   # check if there are nulls

census_year                          0
block_id                             0
property_id                          0
base_property_id                     0
building_name                     7808
street_address                       0
clue_small_area                      0
construction_year                 4399
refurbished_year                  8392
number_of_floors_above_ground        6
predominant_space_use                0
accessibility_type                1020
accessibility_type_description    1020
accessibility_rating              1020
bicycle_spaces                    6035
has_showers                       9999
longitude                          434
latitude                           434
location                           434
dtype: int64

In [23]:
len(df_bldg_info)

9999

## Data cleaning

In [24]:
# drop 'base_property_id' and 'building_name' from the 'df_bldg_info' dataset
df_bldg_dropped = df_bldg_info.drop(['base_property_id', 'building_name'], axis = 1)

df_bldg_dropped.head()

Unnamed: 0,census_year,block_id,property_id,street_address,clue_small_area,construction_year,refurbished_year,number_of_floors_above_ground,predominant_space_use,accessibility_type,accessibility_type_description,accessibility_rating,bicycle_spaces,has_showers,longitude,latitude,location
0,2003,608,104285,84 George Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987521,-37.815299,"{'lon': 144.9875211528448, 'lat': -37.81529934..."
1,2003,608,104287,80 George Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987677,-37.815316,"{'lon': 144.98767742034823, 'lat': -37.8153163..."
2,2003,608,104947,81-83 Hotham Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987589,-37.814577,"{'lon': 144.98758896457355, 'lat': -37.8145774..."
3,2003,608,104948,87 Hotham Street EAST MELBOURNE 3002,East Melbourne,,,2.0,House/Townhouse,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987486,-37.814572,"{'lon': 144.9874861960862, 'lat': -37.81457155..."
4,2003,608,104949,89 Hotham Street EAST MELBOURNE 3002,East Melbourne,,,2.0,Residential Apartment,Not determined or not applicable,Building is not considered to be publicly acce...,0,,,144.987408,-37.814593,"{'lon': 144.98740820504202, 'lat': -37.8145933..."


In [25]:
# de-duplicate on 'property_id': when an id occurs more than once,
# keep only the row with the latest 'census_year'

# order by 'property_id' (ascending) with the latest 'census_year' first inside each id
bldg_sort_keys = ['property_id', 'census_year']
df_bldg_sorted = df_bldg_dropped.sort_values(by = bldg_sort_keys, ascending = [True, False])

# the first row of each id is now its latest record; select exactly those rows
is_latest_bldg_record = ~df_bldg_sorted.duplicated(subset = 'property_id', keep = 'first')
df_bldg_cleaned = df_bldg_sorted[is_latest_bldg_record]

print(df_bldg_cleaned)

     census_year  block_id property_id  \
6362        2017       411      100001   
2863        2004       412      100002   
6363        2017       413      100003   
6364        2017       413      100004   
5838        2017       342      100009   
...          ...       ...         ...   
7140        2021       736      706730   
4173        2021        87      707517   
9132        2020        37      709874   
6802        2021       568      712803   
4193        2021        94      713467   

                                        street_address  \
6362        1-13 Abbotsford Street WEST MELBOURNE 3003   
2863       17-37 Abbotsford Street WEST MELBOURNE 3003   
6363       39-49 Abbotsford Street WEST MELBOURNE 3003   
6364       51-57 Abbotsford Street WEST MELBOURNE 3003   
5838    135-141 Abbotsford Street NORTH MELBOURNE 3051   
...                                                ...   
7140           37-47 Balston Street SOUTHBANK VIC 3006   
4173      303-307 Exhibition St

# **Merging**

### Merge 'Development activity monitor' and 'Residential dwellings'

In [26]:
list(df_dev_clean)

['status',
 'year_completed',
 'clue_small_area',
 'clue_block',
 'street_address',
 'property_id',
 'floors_above',
 'resi_dwellings',
 'studio_dwe',
 'one_bdrm_dwe',
 'two_bdrm_dwe',
 'three_bdrm_dwe',
 'student_apartments',
 'student_beds',
 'student_accommodation_units',
 'institutional_accom_beds',
 'hotel_rooms',
 'serviced_apartments',
 'hotels_serviced_apartments',
 'hostel_beds',
 'childcare_places',
 'office_flr',
 'retail_flr',
 'industrial_flr',
 'storage_flr',
 'education_flr',
 'hospital_flr',
 'recreation_flr',
 'publicdispaly_flr',
 'community_flr',
 'car_spaces',
 'bike_spaces',
 'longitude',
 'latitude',
 'geopoint']

In [27]:
list(df_res_cleaned)

['census_year',
 'block_id',
 'property_id',
 'building_address',
 'clue_small_area',
 'dwelling_type',
 'dwelling_number',
 'longitude',
 'latitude',
 'location']

In [28]:
# avoid duplicated columns in the merge result:
# - from the development frame drop 'street_address', 'longitude', 'latitude' and 'geopoint'
# - from the residential frame leave out 'block_id' and 'clue_small_area'

dev_exclude = ['street_address', 'longitude', 'latitude', 'geopoint']
df_dev_selected = df_dev_clean.drop(columns = dev_exclude)

res_keep = ['census_year', 'property_id', 'building_address', 'dwelling_type',
            'dwelling_number', 'longitude', 'latitude', 'location']
df_res_selected = df_res_cleaned[res_keep]

In [29]:
# inner join on 'property_id': keeps only ids present in both frames

inner_dev_res = df_dev_selected.merge(df_res_selected, how = 'inner', on = 'property_id')

len(inner_dev_res)

250

In [30]:
# outer join on 'property_id': keeps the ids of both frames

outer_dev_res = df_dev_selected.merge(df_res_selected, how = 'outer', on = 'property_id')

len(outer_dev_res)

7658

In [31]:
# right join on 'property_id': keeps every row of the residential frame

right_dev_res = df_dev_selected.merge(df_res_selected, how = 'right', on = 'property_id')

len(right_dev_res)

6734

In [32]:
# left join on 'property_id': keeps every row of the development frame

left_dev_res = df_dev_selected.merge(df_res_selected, how = 'left', on = 'property_id')

len(left_dev_res)

1174

In [33]:
list(left_dev_res.columns)

['status',
 'year_completed',
 'clue_small_area',
 'clue_block',
 'property_id',
 'floors_above',
 'resi_dwellings',
 'studio_dwe',
 'one_bdrm_dwe',
 'two_bdrm_dwe',
 'three_bdrm_dwe',
 'student_apartments',
 'student_beds',
 'student_accommodation_units',
 'institutional_accom_beds',
 'hotel_rooms',
 'serviced_apartments',
 'hotels_serviced_apartments',
 'hostel_beds',
 'childcare_places',
 'office_flr',
 'retail_flr',
 'industrial_flr',
 'storage_flr',
 'education_flr',
 'hospital_flr',
 'recreation_flr',
 'publicdispaly_flr',
 'community_flr',
 'car_spaces',
 'bike_spaces',
 'census_year',
 'building_address',
 'dwelling_type',
 'dwelling_number',
 'longitude',
 'latitude',
 'location']

In [34]:
print(left_dev_res)

                  status year_completed               clue_small_area  \
0              COMPLETED           2006               North Melbourne   
1              COMPLETED           2005               North Melbourne   
2              COMPLETED           2013  West Melbourne (Residential)   
3              COMPLETED           2014  West Melbourne (Residential)   
4              COMPLETED           2007               North Melbourne   
...                  ...            ...                           ...   
1169  UNDER CONSTRUCTION           None                       Carlton   
1170  UNDER CONSTRUCTION           None                East Melbourne   
1171  UNDER CONSTRUCTION           None                     Southbank   
1172  UNDER CONSTRUCTION           None                East Melbourne   
1173  UNDER CONSTRUCTION           None               North Melbourne   

      clue_block property_id  floors_above  resi_dwellings  studio_dwe  \
0            342      100023             2       

### Merge 'Development & Residential' with 'Building information'

In [35]:
list(df_bldg_cleaned)

['census_year',
 'block_id',
 'property_id',
 'street_address',
 'clue_small_area',
 'construction_year',
 'refurbished_year',
 'number_of_floors_above_ground',
 'predominant_space_use',
 'accessibility_type',
 'accessibility_type_description',
 'accessibility_rating',
 'bicycle_spaces',
 'has_showers',
 'longitude',
 'latitude',
 'location']

In [36]:
# skip column 'floors_above' from dataframe 'left_dev_res'
# (presumably superseded by 'number_of_floors_above_ground' from the building data; TODO confirm)
#
# errors = 'ignore' keeps this cell re-runnable: the result is assigned back to
# 'left_dev_res', so without it a second run would raise KeyError because
# 'floors_above' has already been removed

dev_res_exclude = ['floors_above']
left_dev_res = left_dev_res.drop(columns = dev_res_exclude, errors = 'ignore')

# skip columns from 'df_bldg_cleaned':
# 'census_year', 'block_id', 'street_address', 'clue_small_area', 'longitude', 'latitude' and 'location'

bldg_exclude = ['census_year', 'block_id', 'street_address', 'clue_small_area', 'longitude', 'latitude', 'location']
df_bldg_selected = df_bldg_cleaned.drop(columns = bldg_exclude)

In [40]:
# left join on 'property_id': attach the selected building columns to every row of 'left_dev_res'

left_dev_res_bldg = left_dev_res.merge(df_bldg_selected, how = 'left', on = 'property_id')

len(left_dev_res_bldg)

1174

In [41]:
list(left_dev_res_bldg.columns)

['status',
 'year_completed',
 'clue_small_area',
 'clue_block',
 'property_id',
 'resi_dwellings',
 'studio_dwe',
 'one_bdrm_dwe',
 'two_bdrm_dwe',
 'three_bdrm_dwe',
 'student_apartments',
 'student_beds',
 'student_accommodation_units',
 'institutional_accom_beds',
 'hotel_rooms',
 'serviced_apartments',
 'hotels_serviced_apartments',
 'hostel_beds',
 'childcare_places',
 'office_flr',
 'retail_flr',
 'industrial_flr',
 'storage_flr',
 'education_flr',
 'hospital_flr',
 'recreation_flr',
 'publicdispaly_flr',
 'community_flr',
 'car_spaces',
 'bike_spaces',
 'census_year',
 'building_address',
 'dwelling_type',
 'dwelling_number',
 'longitude',
 'latitude',
 'location',
 'construction_year',
 'refurbished_year',
 'number_of_floors_above_ground',
 'predominant_space_use',
 'accessibility_type',
 'accessibility_type_description',
 'accessibility_rating',
 'bicycle_spaces',
 'has_showers']

In [42]:
print(left_dev_res_bldg)

                  status year_completed               clue_small_area  \
0              COMPLETED           2006               North Melbourne   
1              COMPLETED           2005               North Melbourne   
2              COMPLETED           2013  West Melbourne (Residential)   
3              COMPLETED           2014  West Melbourne (Residential)   
4              COMPLETED           2007               North Melbourne   
...                  ...            ...                           ...   
1169  UNDER CONSTRUCTION           None                       Carlton   
1170  UNDER CONSTRUCTION           None                East Melbourne   
1171  UNDER CONSTRUCTION           None                     Southbank   
1172  UNDER CONSTRUCTION           None                East Melbourne   
1173  UNDER CONSTRUCTION           None               North Melbourne   

      clue_block property_id  resi_dwellings  studio_dwe  one_bdrm_dwe  \
0            342      100023              17     

# **Data analysis**