In [1]:
import os
import json
import pandas as pd
import numpy as np
import requests
from constants import data_folder, backend_url

from suburb_remoteness import suburb_remoteness_df

In [2]:
directory = os.path.join(data_folder, "suburb-profile") # directory where suburb profile data is stored
url = f'{backend_url}/suburb/' # url to post data to db
states = os.listdir(directory)

suburb_data_df = pd.DataFrame()

for state_code in states:
    d2 = os.path.join(directory, state_code)
    suburbs = os.listdir(d2)

    for suburb in suburbs:
        d3 = os.path.join(d2, suburb)
        json_files = os.listdir(d3)
        suburb, postcode = suburb[:-5], suburb[-4:]

        # Add missing import for json module
        for json_f in json_files:
            with open(os.path.join(d3, json_f)) as f:
                suburb_profile = json.load(f)
            
            try:
                mkt_insights = suburb_profile['props']['pageProps']['details']['marketInsights']
            except:
                continue
            
            for value in mkt_insights:
                temp = {}
                temp['state'] = state_code
                temp['suburb'] = suburb
                temp['postcode'] = postcode
                temp['beds'] = value['beds']
                temp['propertyType'] = value['propertyType']
                temp['medianPrice'] = value['medianPrice'] if value['medianPrice'] > 0 else np.nan
                temp['medianRent'] = value['medianRentPrice'] if value['medianRentPrice'] > 0 else np.nan
                temp['avgDaysOnMarket'] = value['avgDaysOnMarket'] if value['avgDaysOnMarket'] > 0 else np.nan
                temp['soldThisYear'] = value['nrSoldThisYear']
                temp['entryLevelPrice'] = value['entryLevelPrice'] if value['entryLevelPrice'] > 0 else np.nan
                temp['luxuryLevelPrice'] = value['luxuryLevelPrice'] if value['luxuryLevelPrice'] > 0 else np.nan
                temp['annualGrowth'] = np.nan
                for i in value['salesGrowthList']:
                    if i['year'] == '2023':
                        temp['annualGrowth'] = i['annualGrowth'] if i['annualGrowth'] > 0 else np.nan
                
                suburb_data_df = pd.concat([suburb_data_df, pd.DataFrame([temp])], ignore_index=True)
        
        print('Finished processing suburb:', suburb, 'postcode:', postcode, 'state:', state_code)

suburb_data_df['rentalYield'] = (suburb_data_df['medianRent'] * 52) / suburb_data_df['medianPrice']
suburb_data_df['totalYield'] = suburb_data_df['rentalYield'] + suburb_data_df['annualGrowth']
suburb_data_df = pd.merge(suburb_data_df, suburb_remoteness_df, on=['postcode', 'state'], how='left')

#suburb_data_df.to_csv('suburb_data.csv', index=False)
suburb_data_json = json.loads(suburb_data_df.to_json(orient='records'))

Finished processing suburb: ACTON postcode: 2601 state: ACT
Finished processing suburb: AINSLIE postcode: 2602 state: ACT
Finished processing suburb: AMAROO postcode: 2914 state: ACT
Finished processing suburb: ARANDA postcode: 2614 state: ACT
Finished processing suburb: BANKS postcode: 2906 state: ACT
Finished processing suburb: BARTON postcode: 2600 state: ACT
Finished processing suburb: BELCONNEN postcode: 2617 state: ACT
Finished processing suburb: BONNER postcode: 2914 state: ACT
Finished processing suburb: BONYTHON postcode: 2905 state: ACT
Finished processing suburb: BRADDON postcode: 2612 state: ACT
Finished processing suburb: BRUCE postcode: 2617 state: ACT
Finished processing suburb: CALWELL postcode: 2905 state: ACT
Finished processing suburb: CAMPBELL postcode: 2612 state: ACT
Finished processing suburb: CANBERRA postcode: 2600 state: ACT
Finished processing suburb: CASEY postcode: 2913 state: ACT
Finished processing suburb: CHAPMAN postcode: 2611 state: ACT
Finished proces

In [8]:
#count total rows in suburb_data_df
suburb_data_df.shape[0]

25544

In [9]:
batch_size = 1000
num_batches = len(suburb_data_df) // batch_size + 1

for batch in range(num_batches):
    start_index = batch * batch_size
    end_index = min((batch + 1) * batch_size, len(suburb_data_df))
    batch_data = suburb_data_json[start_index:end_index]

    # Send batch_data using requests.post()
    r = requests.post(url, json=suburb_data_json)

    if r.status_code == 200:
        print(f'Suburb data batch {batch+1}/{num_batches} migrated successfully')
    else:
        print(f'Error in migrating suburb data batch {batch+1}/{num_batches}: {r.status_code} {r.reason}')
    
    break


Error in migrating suburb data batch 1/26: 500 Internal Server Error


In [11]:
batch_data_json[:1]

[{'state': 'ACT',
  'suburb': 'ACTON',
  'postcode': '2601',
  'beds': 2,
  'propertyType': 'Unit',
  'medianPrice': None,
  'medianRent': None,
  'avgDaysOnMarket': None,
  'soldThisYear': 1,
  'entryLevelPrice': None,
  'luxuryLevelPrice': None,
  'annualGrowth': None,
  'rentalYield': None,
  'totalYield': None,
  'remoteness': 'Major Cities of Australia',
  'remoteness_code': '0'}]