In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
%matplotlib inline
import json
import seaborn
import re


## Load the data using pandas

In [2]:
# Load the job listings. Every text-like column (including Date, which is
# converted in the cleaning step) is read as pandas' nullable 'string' dtype;
# the two salary bounds are read as integers.
df = pd.read_csv(
    'data.csv',
    parse_dates=True,
    dtype={
        'Id': 'string',
        'Title': 'string',
        'Company': 'string',
        'Date': 'string',
        'Location': 'string',
        'Area': 'string',
        'Classification': 'string',
        'SubClassification': 'string',
        'Requirement': 'string',
        'FullDescription': 'string',
        'LowestSalary': 'int',
        'HighestSalary': 'int',
        'JobType': 'string',
    },
)

## Clean the data

In [3]:
# Keep only the leading run of digits of each Id and store it as an integer.
leading_digits = df['Id'].str.extract('(^[0-9]+)')
df['Id'] = leading_digits.astype(int)

# Turn the Date strings into pandas datetimes.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [4]:
# Show every column's dtype after cleaning. The '\n' argument starts the
# listing on a fresh line; print's default separator then adds the single
# space that appears before the first column name in the output.
print('\n', df.dtypes)


 Id                                 int32
Title                             string
Company                           string
Date                 datetime64[ns, UTC]
Location                          string
Area                              string
Classification                    string
SubClassification                 string
Requirement                       string
FullDescription                   string
LowestSalary                       int32
HighestSalary                      int32
JobType                           string
dtype: object


## Hypothesis 1: Tech will be dominant in capital cities

In [12]:
class SubSector:
    """Salary ranges collected for a single sub-classification.

    ``salaries`` holds one ``(lowest, highest)`` tuple per ``AddSalary`` call;
    ``name`` is the label returned by ``str()``.
    """

    def __init__(self, name):
        self.name = name
        self.salaries = []

    def __str__(self):
        return self.name

    def AddSalary(self, lowest, highest):
        """Append one ``(lowest, highest)`` salary pair."""
        pair = (lowest, highest)
        self.salaries.append(pair)

    def NumSalaries(self):
        """Return how many salary pairs have been recorded."""
        return len(self.salaries)

    def AvgSalary(self):
        """Return the mean of the recorded *highest* salaries.

        Only the second element of each pair contributes; the lowest salary
        is stored but not used here. Raises ``ZeroDivisionError`` when no
        salary has been recorded.
        """
        upper_total = sum(pair[1] for pair in self.salaries)
        return upper_total / self.NumSalaries()

class Sector:
    """A job classification, broken down into its sub-sectors.

    ``subSectors`` maps a sub-sector name to its ``SubSector``; ``name`` is
    the label returned by ``str()``.
    """

    def __init__(self, name):
        self.subSectors = dict()
        self.name = name

    def __str__(self):
        return self.name

    def AddSubSector(self, subSector, lowestSalary, highestSalary):
        """Record one salary range under ``subSector``, creating it on first use."""
        if subSector not in self.subSectors:
            self.subSectors[subSector] = SubSector(subSector)
        self.subSectors[subSector].AddSalary(lowestSalary, highestSalary)

    def MostDominantSubSector(self):
        """Return the sub-sector with the most recorded salaries.

        Ties go to the sub-sector that was added first. When nothing has been
        added yet, a bare ``object()`` placeholder is returned, the same
        convention ``City.MostDominantSector`` uses.
        """
        dominant = object()
        highest_count = 0
        for candidate in self.subSectors.values():
            count = candidate.NumSalaries()
            # Compare and remember the *count*; the previous code compared the
            # SubSector object itself with an int, which raises TypeError.
            if count > highest_count:
                highest_count = count
                dominant = candidate
        return dominant

    def NumSubSectors(self):
        """Return the number of distinct sub-sectors seen so far."""
        return len(self.subSectors)

    def AvgSalary(self):
        """Return the unweighted mean of the sub-sectors' ``AvgSalary()`` values.

        Each sub-sector counts once regardless of how many salaries it holds.
        Raises ``ZeroDivisionError`` when no sub-sector has been added.
        """
        total = 0
        for subSector in self.subSectors.values():
            total += subSector.AvgSalary()
        return total / self.NumSubSectors()

class City:
    """Job-market tallies for one location, grouped by sector.

    ``sectors`` maps a sector name to its ``Sector``; ``sectorCounts`` maps a
    sector name to the number of ``AddSector`` calls seen for it; ``capital``
    is True when ``name`` is one of the hard-coded capital names below.
    """

    def __init__(self, name):
        capital_names = (
            "Canberra", "Sydney", "Darwin", "Brisbane", "Adelaide",
            "Hobart", "Melbourne", "Perth", "ACT",
        )
        self.name = name
        self.capital = name in capital_names
        self.sectorCounts = {}
        self.sectors = {}

    def IsCapital(self):
        """Return True when this city's name is in the capital list."""
        return self.capital

    def __str__(self):
        return self.name

    def AddSector(self, sector, subSector, lowestSalary, highestSalary):
        """Count one listing for ``sector`` and record its salary range."""
        if sector in self.sectors:
            self.sectorCounts[sector] += 1
        else:
            # First listing for this sector: start its tally and its Sector.
            self.sectorCounts[sector] = 1
            self.sectors[sector] = Sector(sector)
        self.sectors[sector].AddSubSector(subSector, lowestSalary, highestSalary)

    def MostDominantSector(self):
        """Return the ``Sector`` with the highest listing count.

        Ties go to the sector seen first. A bare ``object()`` placeholder is
        returned when no sector has been added.
        """
        leader = object()
        leader_count = 0
        for sector_name, count in self.sectorCounts.items():
            if count > leader_count:
                leader_count, leader = count, self.sectors[sector_name]
        return leader

    def NumSectors(self):
        """Return the number of distinct sectors seen so far."""
        return len(self.sectors)

    def AvgSalary(self):
        """Return the unweighted mean of the sectors' ``AvgSalary()`` values.

        Raises ``ZeroDivisionError`` when no sector has been added.
        """
        sector_means = (entry.AvgSalary() for entry in self.sectors.values())
        return sum(sector_means) / self.NumSectors()

# Build one City per distinct Location, feeding it every complete listing.
cityDict = {}
for index, row in df.iterrows():
    city, sector, subSector = row["Location"], row["Classification"], row["SubClassification"]
    lowestSalary, highestSalary = row["LowestSalary"], row["HighestSalary"]

    # Skip rows where any of the five fields is missing.
    if any(pd.isnull(field) for field in (city, sector, subSector, lowestSalary, highestSalary)):
        continue

    if city not in cityDict:
        cityDict[city] = City(city)
    cityDict[city].AddSector(sector, subSector, lowestSalary, highestSalary)

# Split the cities into capitals and everything else, keeping first-seen order.
capitals = [place for place in cityDict.values() if place.IsCapital()]
nonCapitals = [place for place in cityDict.values() if not place.IsCapital()]


def _city_summary(place):
    """Return [city name, dominant sector name, that sector's average salary]."""
    dominant = place.MostDominantSector()
    return [str(place), str(dominant), dominant.AvgSalary()]


# Capitals come first in the export, followed by the non-capitals.
allCities = [_city_summary(place) for place in capitals + nonCapitals]

with open('baseData.json', 'w') as outfile:
    json.dump(allCities, outfile)

### Text analysis

In [44]:
# Collect the whitespace-trimmed Requirement text of every listing whose
# Classification is ICT. Values are passed through str() first, so a missing
# Requirement ends up as its string representation rather than being dropped.
requirements = [
    str(row["Requirement"]).strip()
    for index, row in df.iterrows()
    if str(row["Classification"]) == "Information & Communication Technology"
]

In [55]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Number of best-matching requirement texts to list for the query below.
# It was previously never defined in the notebook, so this cell raised
# NameError on a fresh kernel; 5 matches the recorded "Top-5 documents" output.
topk = 5

# TF-IDF over single words with English stop words removed, keeping at most
# 500 vocabulary terms.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df = 1, stop_words = 'english', max_features=500)

# fit() returns the fitted vectorizer itself; the document-term matrix comes
# from transform().
features = tfidf.fit(requirements)
corpus_tf_idf = tfidf.transform(requirements)

# Total TF-IDF weight of each term across all documents (a weight sum, not a
# raw occurrence count).
sum_words = corpus_tf_idf.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]

print(sorted(words_freq, key = lambda x: x[1], reverse=True)[:5])

query = "vmware java"

new_features = tfidf.transform([query])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
cosine_similarities = linear_kernel(new_features, corpus_tf_idf).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]

# Never index past the end of the corpus when it holds fewer than topk texts.
num_shown = min(topk, len(related_docs_indices))
print('Top-{0} documents'.format(num_shown))
for i in range(num_shown):
    print(i, ": ", requirements[related_docs_indices[i]])

[('contract', 849.9973961419004), ('opportunity', 832.9341336343739), ('join', 829.6022350961781), ('team', 793.2975806517229), ('business', 720.8327124871363)]
Top-5 documents
0 :  VMware Engineer needed for a VmWare Migration project. We have a project required to migrate an ESXi Server remotely.
1 :  Maintain, Support and provide Virtualisation (VMware) solutions to our enterprise clients | CBD Location | VMware/ vRealise/ Vblock
2 :  This is a great role for a Java Developer with VMWare and AWS to work for a large CBD based Financial organisation
3 :  Architect Virtualisation VMWare Cloud TOGAF Permanent
4 :  Architect Virtualisation VMWare Cloud TOGAF Permanent
