In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
%matplotlib inline
import seaborn
import re

## Load the data using pandas

In [3]:
# Column -> dtype mapping handed to read_csv: every text field (including the
# raw Id and Date, which are cleaned in a later cell) is read with the 'string'
# dtype, and the two salary bounds are read as integers.
CSV_DTYPES = {
    'Id': 'string',
    'Title': 'string',
    'Company': 'string',
    'Date': 'string',
    'Location': 'string',
    'Area': 'string',
    'Classification': 'string',
    'SubClassification': 'string',
    'Requirement': 'string',
    'FullDescription': 'string',
    'LowestSalary': 'int',
    'HighestSalary': 'int',
    'JobType': 'string',
}
df = pd.read_csv('data.csv', parse_dates=True, dtype=CSV_DTYPES)

## Clean the data

In [4]:
# Keep only the leading run of digits of each Id and store it as an integer.
# Casting to 'string' first keeps this cell re-runnable: once Id has become an
# integer column the .str accessor would otherwise raise on a second run.
# expand=False makes extract() return a Series instead of a one-column frame.
df['Id'] = df['Id'].astype('string').str.extract(r'(^[0-9]+)', expand=False).astype(int)
# Parse the Date strings into pandas datetimes (a no-op if already parsed).
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
# Show the dtype of every column after cleaning; the leading '\n' argument
# puts a blank line before the listing.
dtypes_after_cleaning = df.dtypes
print('\n', dtypes_after_cleaning)


 Id                                 int32
Title                             string
Company                           string
Date                 datetime64[ns, UTC]
Location                          string
Area                              string
Classification                    string
SubClassification                 string
Requirement                       string
FullDescription                   string
LowestSalary                       int32
HighestSalary                      int32
JobType                           string
dtype: object


## Hypothesis 1: Tech will be dominant in capital cities

In [7]:
class Sector:
    """Tally of the sub-sectors recorded for one named sector."""

    def __init__(self, name):
        """Create an empty tally for the sector called ``name``."""
        # Maps sub-sector label -> number of times AddSubSector received it.
        self.subSectorCounts = dict()
        self.name = name

    def AddSubSector(self, subSector):
        """Record one more occurrence of ``subSector``."""
        self.subSectorCounts[subSector] = self.subSectorCounts.get(subSector, 0) + 1

    def MostDominant(self):
        """Return the sub-sector with the highest count.

        Returns "" when nothing has been recorded. On a tie the sub-sector
        that was recorded first wins, because only a strictly larger count
        replaces the current best.
        """
        subSector = ""
        amt = 0
        for key, cur in self.subSectorCounts.items():
            if cur > amt:
                amt = cur
                subSector = key
        # Previously returned the undefined local `sector`, which either raised
        # NameError or leaked an unrelated module-level variable.
        return subSector

class City:
    """A named location with per-sector tallies and a capital-city flag."""

    def __init__(self, name):
        """Create a city with no sectors recorded yet."""
        self.name = name
        # True when the name appears in this fixed list of capital labels.
        self.capital = name in ["Canberra", "Sydney", "Darwin", "Brisbane", "Adelaide", "Hobart", "Melbourne", "Perth", "ACT"]
        # sector label -> Sector object holding that sector's sub-sector tally
        self.sectors = dict()
        # sector label -> number of AddSector calls made for it
        self.sectorCounts = dict()

    def IsCapital(self):
        """Return the capital flag computed in the constructor."""
        return self.capital

    def __str__(self):
        """Format as "<name> (<most dominant sector>)"."""
        dominant = self.MostDominant()
        return self.name + " (" + dominant + ")"

    def AddSector(self, sector, subSector):
        """Count one occurrence of ``sector`` and forward ``subSector`` to it."""
        if sector in self.sectors:
            self.sectorCounts[sector] += 1
        else:
            # First sighting: start the count and create the Sector tally.
            self.sectorCounts[sector] = 1
            self.sectors[sector] = Sector(sector)
        self.sectors[sector].AddSubSector(subSector)

    def MostDominant(self):
        """Return the sector with the highest count, or "" if none beats zero.

        Ties keep the sector encountered first, since only a strictly larger
        count replaces the current best.
        """
        best_sector, best_count = "", 0
        for candidate, count in self.sectorCounts.items():
            if count > best_count:
                best_sector, best_count = candidate, count
        return best_sector

# Build one City per distinct Location and tally each row's Classification /
# SubClassification against it.
cityDict = dict()
# Walk the three needed columns in lockstep instead of DataFrame.iterrows(),
# which constructs a full Series for every row just to read three fields.
for city, sector, subSector in zip(df["Location"], df["Classification"], df["SubClassification"]):
    # Rows missing a location or a classification cannot be attributed; skip.
    if pd.isnull(city) or pd.isnull(sector):
        continue
    if city not in cityDict:
        cityDict[city] = City(city)
    cityDict[city].AddSector(sector, subSector)

# Split the "<name> (<dominant sector>)" labels by City.IsCapital(),
# preserving the order in which cities were first seen.
capitals = list()
nonCapitals = list()
for cityEntry in cityDict.values():
    label = str(cityEntry)
    if cityEntry.IsCapital():
        capitals.append(label)
    else:
        nonCapitals.append(label)

print("Capitals")
for capital in capitals:
    print(capital)

print("\nNonCapitals")
for nonCapital in nonCapitals:
    print(nonCapital)

Capitals
Sydney (Information & Communication Technology)
Brisbane (Information & Communication Technology)
Adelaide (Trades & Services)
Melbourne (Information & Communication Technology)
Perth (Trades & Services)
Darwin (Healthcare & Medical)
ACT (Information & Communication Technology)
Hobart (Healthcare & Medical)

NonCapitals
Richmond & Hawkesbury (Manufacturing, Transport & Logistics)
Gosford & Central Coast (Healthcare & Medical)
Mackay & Coalfields (Mining, Resources & Energy)
Sunshine Coast (Hospitality & Tourism)
Gold Coast (Hospitality & Tourism)
West Gippsland & Latrobe Valley (Healthcare & Medical)
Hervey Bay & Fraser Coast (Healthcare & Medical)
South West Coast VIC (Hospitality & Tourism)
Mornington Peninsula & Bass Coast (Hospitality & Tourism)
Port Hedland, Karratha & Pilbara (Mining, Resources & Energy)
Ballarat & Central Highlands (Healthcare & Medical)
Bendigo, Goldfields & Macedon Ranges (Healthcare & Medical)
Yarra Valley & High Country (Hospitality & Tourism)
Coffs

## Hypothesis 2: We do not expect the average salary of an industry subcategory to vary between Australian cities.