In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
%matplotlib inline
import seaborn
import re

## Load the data using pandas

In [3]:
# Read the job listings. Every text column is loaded with the pandas
# "string" dtype and the two salary columns as integers; 'Date' is read as
# text here as well.
df = pd.read_csv(
    'data.csv',
    parse_dates=True,
    dtype={
        'Id': 'string',
        'Title': 'string',
        'Company': 'string',
        'Date': 'string',
        'Location': 'string',
        'Area': 'string',
        'Classification': 'string',
        'SubClassification': 'string',
        'Requirement': 'string',
        'FullDescription': 'string',
        'LowestSalary': 'int',
        'HighestSalary': 'int',
        'JobType': 'string',
    },
)

## Clean the data

In [4]:
# Keep only the run of digits at the start of each Id and store it as an int.
df['Id'] = (
    df['Id']
    .str.extract('(^[0-9]+)')
    .astype(int)
)
# Convert the Date strings into pandas datetimes.
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
# Show the dtype of every column after cleaning (output starts with a blank line).
print('\n', df.dtypes)


 Id                                 int32
Title                             string
Company                           string
Date                 datetime64[ns, UTC]
Location                          string
Area                              string
Classification                    string
SubClassification                 string
Requirement                       string
FullDescription                   string
LowestSalary                       int32
HighestSalary                      int32
JobType                           string
dtype: object


## Hypothesis 1: Tech will be the dominant sector in capital cities

In [21]:
class SubSector:
    """Salary bands recorded for one sub-classification.

    Each band is kept as a ``(lowest, highest)`` tuple in ``self.salaries``,
    in the order it was added.
    """

    def __init__(self, name):
        self.salaries = []
        self.name = name

    def __str__(self):
        return self.name

    def AddSalary(self, lowest, highest):
        """Append one salary band as a ``(lowest, highest)`` tuple."""
        band = (lowest, highest)
        self.salaries.append(band)

    def NumSalaries(self):
        """Return the number of salary bands recorded so far."""
        return len(self.salaries)

    def AvgSalary(self):
        """Return the mean of the *highest* value over all recorded bands.

        The lowest value of each band is not used. Raises
        ``ZeroDivisionError`` when no band has been recorded.
        """
        upperTotal = sum(band[1] for band in self.salaries)
        return upperTotal / self.NumSalaries()

class Sector:
    """A job classification holding its sub-sectors, keyed by sub-sector name."""

    def __init__(self, name):
        self.subSectors = dict()
        self.name = name

    def __str__(self):
        return self.name

    def AddSubSector(self, subSector, lowestSalary, highestSalary):
        """Record one salary band under ``subSector``.

        The sub-sector entry is created the first time its name is seen.
        """
        if subSector not in self.subSectors:
            self.subSectors[subSector] = SubSector(subSector)
        self.subSectors[subSector].AddSalary(lowestSalary, highestSalary)

    def MostDominantSubSector(self):
        """Return the sub-sector with the most recorded salary bands.

        Ties go to the sub-sector that was added first. When the sector has
        no sub-sectors a bare ``object()`` placeholder is returned.
        """
        subSector = object()
        amt = 0
        for cur in self.subSectors.values():
            curAmt = cur.NumSalaries()
            # Compare and remember the *count*; comparing the SubSector
            # object itself against an int raises TypeError.
            if curAmt > amt:
                amt = curAmt
                subSector = cur
        return subSector

    def NumSubSectors(self):
        """Return the number of distinct sub-sectors recorded."""
        return len(self.subSectors)

    def AvgSalary(self):
        """Return the unweighted mean of the sub-sectors' average salaries.

        Every sub-sector counts equally regardless of how many bands it
        holds. Raises ``ZeroDivisionError`` when there are no sub-sectors.
        """
        total = sum(sub.AvgSalary() for sub in self.subSectors.values())
        return total / self.NumSubSectors()

class City:
    """A job location with its sectors and a per-sector listing count.

    ``sectors`` maps a sector name to its ``Sector`` object and
    ``sectorCounts`` maps the same name to how many listings were added
    under it. ``capital`` is True when the name appears in a fixed list of
    capital names.
    """

    def __init__(self, name):
        self.sectors = {}
        self.sectorCounts = {}
        self.name = name
        capitalNames = (
            "Canberra", "Sydney", "Darwin", "Brisbane", "Adelaide",
            "Hobart", "Melbourne", "Perth", "ACT",
        )
        self.capital = name in capitalNames

    def IsCapital(self):
        """Return whether this location was flagged as a capital."""
        return self.capital

    def __str__(self):
        return self.name

    def AddSector(self, sector, subSector, lowestSalary, highestSalary):
        """Record one listing under ``sector`` / ``subSector``.

        Creates the sector on first sight, otherwise bumps its listing
        count, then forwards the salary band to the sector.
        """
        if sector in self.sectors:
            self.sectorCounts[sector] += 1
        else:
            self.sectorCounts[sector] = 1
            self.sectors[sector] = Sector(sector)
        self.sectors[sector].AddSubSector(subSector, lowestSalary, highestSalary)

    def MostDominantSector(self):
        """Return the sector with the highest listing count.

        Ties go to the sector seen first. A bare ``object()`` placeholder is
        returned when no sector has a positive count.
        """
        best = object()
        bestCount = 0
        for sectorName, count in self.sectorCounts.items():
            if count > bestCount:
                bestCount = count
                best = self.sectors[sectorName]
        return best

    def NumSectors(self):
        """Return the number of distinct sectors recorded."""
        return len(self.sectors)

    def AvgSalary(self):
        """Return the unweighted mean of the sectors' average salaries.

        Raises ``ZeroDivisionError`` when no sector has been recorded.
        """
        combined = sum(entry.AvgSalary() for entry in self.sectors.values())
        return combined / self.NumSectors()

# Build a City -> Sector -> SubSector structure from every row that has
# all five fields present.
cityDict = {}
for index, row in df.iterrows():
    city = row["Location"]
    sector = row["Classification"]
    subSector = row["SubClassification"]
    lowestSalary = row["LowestSalary"]
    highestSalary = row["HighestSalary"]

    # Rows with any missing field are left out of the tally.
    if any(pd.isnull(field) for field in (city, sector, subSector, lowestSalary, highestSalary)):
        continue
    if city not in cityDict:
        cityDict[city] = City(city)
    cityDict[city].AddSector(sector, subSector, lowestSalary, highestSalary)

# Split the cities into capitals and everything else.
capitals = []
nonCapitals = []
for key, cur in cityDict.items():
    bucket = capitals if cur.IsCapital() else nonCapitals
    bucket.append(cur)

# For each group, print every city with its dominant sector and that
# sector's average salary.
for heading, group in (("Capitals", capitals), ("\nNonCapitals", nonCapitals)):
    print(heading)
    for place in group:
        dominant = place.MostDominantSector()
        print(f"{place} ({dominant}) - {dominant.AvgSalary()}")

Capitals
Sydney (Information & Communication Technology) - 236.7014297873612
Brisbane (Information & Communication Technology) - 234.5485975770055
Adelaide (Trades & Services) - 53.73280049814219
Melbourne (Information & Communication Technology) - 225.908935290161
Perth (Trades & Services) - 65.00354375425074
Darwin (Healthcare & Medical) - 168.23990683229812
ACT (Information & Communication Technology) - 343.602212980112
Hobart (Healthcare & Medical) - 144.7873395761327

NonCapitals
Richmond & Hawkesbury (Manufacturing, Transport & Logistics) - 109.21861471861472
Gosford & Central Coast (Healthcare & Medical) - 116.88692745616767
Mackay & Coalfields (Mining, Resources & Energy) - 154.63717324866337
Sunshine Coast (Hospitality & Tourism) - 46.09313087383522
Gold Coast (Hospitality & Tourism) - 48.21824862434942
West Gippsland & Latrobe Valley (Healthcare & Medical) - 118.23896103896104
Hervey Bay & Fraser Coast (Healthcare & Medical) - 155.15836385836388
South West Coast VIC (Hospital

## Hypothesis 2: Average salaries for industry sub-categories will not vary between Australian cities.