In [1]:
# Dependencies
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import scipy.stats as st
from scipy.stats import linregress
from pandas.core.reshape.pivot import pivot


In [2]:
# Read CSV
df1 = pd.read_csv("raw_data/2017-18-vacancy.csv")
df2 = pd.read_csv("raw_data/2018-19-vacancy.csv")
df3 = pd.read_csv("raw_data/2019-20-vacancy.csv")
df4 = pd.read_csv("raw_data/2020-21-vacancy.csv")

In [3]:
df1.columns

Index(['Postcode', 'Bedrooms', 'RentalUnitDesc', 'VUDate', 'VTDate', 'TENDate',
       'RUUseType', 'HousingServiceCentre', 'VUDays', 'VTDays', 'TotalVAC',
       'LGA', 'StateElectorate'],
      dtype='object')

In [4]:
df2.columns

Index(['Postcode', 'Bedrooms', 'RentalUnitDesc', 'VUDate', 'VTDate', 'TENDate',
       'RUUseType', 'HousingServiceCentre', 'VUDays', 'VTDays', 'TotalVAC',
       'LGA', 'StateElectorate'],
      dtype='object')

In [5]:
#Fixing column names and reordering as needed
df3 = df3.rename(columns= {'PropertyType':'RentalUnitDesc', 'TotalVacDays':'TotalVac', 'LocalGovtAuthority':'LGA'})
neworder = ['Postcode', 'Bedrooms', 'RentalUnitDesc', 'VUDate', 'VTDate', 'TENDate',
       'RUUseType', 'HousingServiceCentre', 'VUDays', 'VTDays', 'TotalVAC',
       'LGA', 'StateElectorate']
df3=df3.reindex(columns=neworder)
df3.columns

Index(['Postcode', 'Bedrooms', 'RentalUnitDesc', 'VUDate', 'VTDate', 'TENDate',
       'RUUseType', 'HousingServiceCentre', 'VUDays', 'VTDays', 'TotalVAC',
       'LGA', 'StateElectorate'],
      dtype='object')

In [6]:
df4.tail()

Unnamed: 0,Postcode,Bedrooms,PropertyType,VUDate,VTDate,TENDate,RUUseType,VUDays,VTDays,TotalVAC,HousingServiceCentre,LocalGovtAuthority,StateElectorate
6262,4101,1,Apartment,1/08/2020,1/09/2020,1/09/2020,Community Housing,23,0,23,FORTITUDE VALLEY,Brisbane,South Brisbane
6263,4034,3,Apartment,1/05/2021,1/06/2021,1/06/2021,Community Housing,28,0,28,CHERMSIDE,Brisbane,Nudgee
6264,4101,1,Apartment,1/07/2020,1/08/2020,1/08/2020,Community Housing,33,0,33,FORTITUDE VALLEY,Brisbane,South Brisbane
6265,4020,1,Apartment,1/12/2020,1/01/2021,1/02/2021,Community Housing,35,18,53,MORETON BAY,Moreton Bay,Redcliffe
6266,4020,1,Apartment,1/03/2021,1/04/2021,1/04/2021,Community Housing,46,0,46,MORETON BAY,Moreton Bay,Redcliffe


In [7]:
#Concatenating all dataframes
df = pd.concat([df1, df2, df3, df4])
df = df.dropna(subset=['TotalVAC'],inplace = False)
#Dropping rows with 0 days vacant
df = df[df.TotalVAC != 0]
df.shape

(28103, 15)

In [8]:
#Create two new columns for year and month
#Convert date string to year and month
Years = []
Months = []
date_splited = []
dates = pd.Series(df['TENDate'])
for date in dates:
    if '/' in date:
        date_splited = date.split('/')
        Years.append(date_splited[2].strip())
        Months.append(f'{date_splited[2]}-{date_splited[1]}')
    elif '-' in date:
        date_splited = date.split('-')
        Years.append(f'20{date_splited[1].strip()}')
        Months.append(date_splited[0].strip())
    else:
        Years.append(date)

print(len(dates))
print(len(Years))
df['Year'] = Years
df['Month'] = Months

28103
28103


In [9]:
#Write to CSV
df.to_csv("all_vacancy_data.csv")
df.head()

Unnamed: 0,Postcode,Bedrooms,RentalUnitDesc,VUDate,VTDate,TENDate,RUUseType,HousingServiceCentre,VUDays,VTDays,TotalVAC,LGA,StateElectorate,PropertyType,LocalGovtAuthority,Year,Month
0,4032,3,DETACHED HOUSE,9/10/2017,2/11/2017,3/11/2017,Public Housing,105,24,1,25,BRISBANE,STAFFORD,,,2017,2017-11
1,4108,2,APARTMENT,3/04/2018,18/04/2018,2/05/2018,Public Housing,107,15,14,29,BRISBANE,TOOHEY,,,2018,2018-05
2,4108,2,APARTMENT,1/08/2017,11/08/2017,21/08/2017,Public Housing,107,10,10,20,BRISBANE,TOOHEY,,,2017,2017-08
3,4108,2,APARTMENT,1/03/2018,23/03/2018,28/03/2018,Public Housing,107,22,5,27,BRISBANE,TOOHEY,,,2018,2018-03
4,4108,2,APARTMENT,15/11/2017,1/12/2017,2/01/2018,Public Housing,107,16,32,48,BRISBANE,TOOHEY,,,2018,2018-01
