## 3803ICT_Assignment

### Part 1 - Data Preparation and Preprocessing

In [15]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(
    filepath_or_buffer='../data/data.csv',
    low_memory=False,
)
df.head(n=5)

Unnamed: 0,Id,Title,Company,Date,Location,Area,Classification,SubClassification,Requirement,FullDescription,LowestSalary,HighestSalary,JobType
0,37404348,Casual Stock Replenisher,Aldi Stores,2018-10-07T00:00:00.000Z,Sydney,North West & Hills District,Retail & Consumer Products,Retail Assistants,Our Casual Stock Replenishers pride themselves...,,0,30,
1,37404337,Casual Stock Replenisher,Aldi Stores,2018-10-07T00:00:00.000Z,Richmond & Hawkesbury,,Retail & Consumer Products,Retail Assistants,Our Casual Stock Replenishers pride themselves...,,0,30,
2,37404356,RETAIL SALES SUPERSTARS and STYLISTS Wanted - ...,LB Creative Pty Ltd,2018-10-07T00:00:00.000Z,Brisbane,CBD & Inner Suburbs,Retail & Consumer Products,Retail Assistants,BRAND NEW FLAGSHIP STORE OPENING - SUNSHINE PLAZA,,0,30,
3,37404330,Team member - Belrose,Anaconda Group Pty Ltd,2018-10-07T00:00:00.000Z,Gosford & Central Coast,,Retail & Consumer Products,Retail Assistants,Bring it on - do you love the great outdoors a...,,0,30,
4,37404308,"Business Banking Contact Centre Specialist, Ni...",Commonwealth Bank - Business & Private Banking,2018-10-07T00:00:00.000Z,Sydney,Ryde & Macquarie Park,Call Centre & Customer Service,Sales - Inbound,"We are seeking highly articulate, enthusiastic...",,0,30,


In [2]:
# Check dtypes: list the dtype pandas assigned to every column of df
df.dtypes

Id                   object
Title                object
Company              object
Date                 object
Location             object
Area                 object
Classification       object
SubClassification    object
Requirement          object
FullDescription      object
LowestSalary          int64
HighestSalary         int64
JobType              object
dtype: object

In [16]:
# Parse the 'Date' column into pandas datetimes
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [18]:
# Remove all '&searchrequesttoken...' strings from Id column and store it as int.
# Cast to str first so the cell can safely be re-run: once Id is an integer
# column the .str accessor raises AttributeError.
df['Id'] = df['Id'].astype(str).str.split('&').str[0].astype(int)
df.dtypes

Id                                 int32
Title                             object
Company                           object
Date                 datetime64[ns, UTC]
Location                          object
Area                              object
Classification                    object
SubClassification                 object
Requirement                       object
FullDescription                   object
LowestSalary                       int64
HighestSalary                      int64
JobType                           object
dtype: object

In [19]:
# Count the rows that are exact repeats of an earlier row
int(df.duplicated().sum())

108

In [20]:
# Keep only the first occurrence of each fully identical row,
# then confirm that no repeated rows are left
df = df.drop_duplicates(keep='first')
int(df.duplicated().sum())

0

In [21]:
# Count the missing values in each column
df.isna().sum()

Id                        0
Title                     0
Company               11997
Date                      0
Location             121248
Area                 195787
Classification       121248
SubClassification    121248
Requirement               7
FullDescription       16135
LowestSalary              0
HighestSalary             0
JobType               16058
dtype: int64

In [22]:
# Discard every row that has a missing value in any column,
# then re-count the nulls per column to confirm none remain
df = df.dropna(axis=0, how='any')
df.isna().sum()

Id                   0
Title                0
Company              0
Date                 0
Location             0
Area                 0
Classification       0
SubClassification    0
Requirement          0
FullDescription      0
LowestSalary         0
HighestSalary        0
JobType              0
dtype: int64

In [24]:
# Show the 'FullDescription' text stored under row label 121
df.loc[121, 'FullDescription']

'<p>&nbsp;</p>\n        <p><strong><em>*&nbsp; Secure long term role with genuine career path to supervisor</em></strong></p>\n        <p><strong><em>*&nbsp; Competitive hourly rate with regular opportunity for overtime</em></strong></p>\n        <p><strong><em>*&nbsp; Full on-the-job training</em></strong></p>\n        <p><strong>About the&nbsp;role</strong></p>\n        <p>Having recently won a significant new national contract we are looking for another trade qualified person with welding and fabrication skills to help manage increased demands on our production and installation departments.&nbsp; This role will\n          see you involved in both manufacturing and on-site installation and there is a genuine career path to supervisor if that is your goal.&nbsp; Initially your role will require you to:-</p>\n        <ul>\n          <li>read and interpret drawings&nbsp;</li>\n          <li>fabricate and assemble orders as required</li>\n          <li>provide input to enhance factory pr

In [26]:
# Clean 'FullDescription' of html tags, symbols and extra spaces.
#
# The regex patterns are raw strings passed with an explicit regex=True:
# since pandas 2.0, Series.str.replace defaults to regex=False, which would
# treat the patterns as literal text and silently leave the HTML in place.
df['FullDescription'] = (
    df['FullDescription']
    # HTML entities such as '&nbsp;', '&amp;' or '&#39;'. The pattern only
    # accepts a real entity name / numeric code between '&' and ';', so a
    # plain ampersand followed later by a semicolon ("R&D team; ...") is
    # not swallowed.
    .str.replace(r'&(?:#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);', ' ', regex=True)
    # HTML tags such as '<p>' or '</strong>'
    .str.replace(r'<[^<]+?>', ' ', regex=True)
    # Literal asterisks used as bullet markers ('*' is not a valid regex on
    # its own, so it is replaced as plain text)
    .str.replace('*', ' ', regex=False)
    # Collapse every run of whitespace (including newlines) into one space
    .str.replace(r'\s+', ' ', regex=True)
)

In [27]:
# Re-display the 'FullDescription' text at row label 121 to check the clean-up
df.loc[121, 'FullDescription']

' Secure long term role with genuine career path to supervisor Competitive hourly rate with regular opportunity for overtime Full on-the-job training About the role Having recently won a significant new national contract we are looking for another trade qualified person with welding and fabrication skills to help manage increased demands on our production and installation departments. This role will see you involved in both manufacturing and on-site installation and there is a genuine career path to supervisor if that is your goal. Initially your role will require you to:- read and interpret drawings fabricate and assemble orders as required provide input to enhance factory processes pack and dispatch orders perform on-site installations (full training will be given) About you This role is ideal for a trade qualified person (welder, boilermaker, fabricator etc) with good hands-on skills who will enjoy dividing their time between factory/manufacturing and on-site installations. Because 

#### TODO - decide whether the `Requirement` column also needs cleaning

### Data Preparation all in one

In [31]:
# Full data-preparation pipeline in a single cell.

# Change Date dtype to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Remove all '&searchrequesttoken...' strings from Id column and store it as int.
# Cast to str first: if Id was already converted to int by an earlier cell,
# the .str accessor would otherwise raise AttributeError.
df['Id'] = df['Id'].astype(str).str.split('&').str[0].astype(int)

# Drop duplicates
df = df.drop_duplicates()

# Drop null entries (any row with a missing value in any column)
df = df.dropna()

# Clean 'FullDescription' of html tags, symbols and extra spaces.
# The regex patterns are raw strings passed with an explicit regex=True:
# since pandas 2.0, Series.str.replace defaults to regex=False, which would
# treat the patterns as literal text and silently leave the HTML in place.
df['FullDescription'] = (
    df['FullDescription']
    # HTML entities such as '&nbsp;', '&amp;' or '&#39;'; only a real entity
    # name / numeric code is accepted between '&' and ';' so ordinary text
    # like "R&D team; ..." is not swallowed.
    .str.replace(r'&(?:#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);', ' ', regex=True)
    # HTML tags such as '<p>' or '</strong>'
    .str.replace(r'<[^<]+?>', ' ', regex=True)
    # Literal asterisks ('*' alone is not a valid regex, so match as text)
    .str.replace('*', ' ', regex=False)
    # Collapse every run of whitespace (including newlines) into one space
    .str.replace(r'\s+', ' ', regex=True)
)

df.head()

Unnamed: 0,Id,Title,Company,Date,Location,Area,Classification,SubClassification,Requirement,FullDescription,LowestSalary,HighestSalary,JobType
121,37404238,Fabricator/Installer,WORKPLACE ACCESS & SAFETY,2018-10-07 00:00:00+00:00,Melbourne,Bayside & South Eastern Suburbs,Trades & Services,Welders & Boilermakers,Trade qualified person with skills in welding ...,Secure long term role with genuine career pat...,0,30,Full Time
122,37404195,Boilermaker,RPM Contracting QLD P/l,2018-10-07 00:00:00+00:00,Brisbane,Southern Suburbs & Logan,Trades & Services,Welders & Boilermakers,Perm rate $30. Structural steel fab & weld out...,One of Australia's best engineering workshops...,0,30,Full Time
125,37404288,Casual Childcare Positions | Bondi Junction,anzuk Education,2018-10-07 00:00:00+00:00,Sydney,"CBD, Inner West & Eastern Suburbs",Education & Training,Teaching - Early Childhood,"anzuk education are searching for reliable, en...",What is anzuk? anzuk Early Childhood is a rec...,0,30,Contract/Temp
126,37404267,Technician,Zoom Recruitment & Training,2018-10-07 00:00:00+00:00,Sydney,South West & M5 Corridor,Engineering,Mechanical Engineering,"This Australian Icon, connects the people of t...","This Australian Icon, connects the people of ...",0,30,Full Time
127,37404230,Systems Engineer,Humanised Group,2018-10-07 00:00:00+00:00,Brisbane,CBD & Inner Suburbs,Information & Communication Technology,Networks & Systems Administration,Systems Engineer to work on BAU/Projects for a...,The Company This organisation is well-establi...,0,30,Full Time


### Part 2 - Data Analysis and Interpretation

In [29]:
# Distinct values found in the 'Classification' column
df['Classification'].unique()

array(['Trades & Services', 'Education & Training', 'Engineering',
       'Information & Communication Technology',
       'Marketing & Communications', 'CEO & General Management',
       'Human Resources & Recruitment', 'Accounting', 'Legal',
       'Real Estate & Property', 'Manufacturing, Transport & Logistics',
       'Healthcare & Medical', 'Banking & Financial Services',
       'Construction', 'Sport & Recreation', 'Government & Defence',
       'Hospitality & Tourism', 'Sales', 'Retail & Consumer Products',
       'Call Centre & Customer Service',
       'Administration & Office Support', 'Design & Architecture',
       'Mining, Resources & Energy', 'Science & Technology',
       'Advertising, Arts & Media', 'Insurance & Superannuation',
       'Farming, Animals & Conservation',
       'Community Services & Development', 'Self Employment',
       'Consulting & Strategy'], dtype=object)

In [32]:
# Distinct values found in the 'SubClassification' column
df['SubClassification'].unique()

array(['Welders & Boilermakers', 'Teaching - Early Childhood',
       'Mechanical Engineering', 'Networks & Systems Administration',
       'Product Management & Development',
       'Programme & Project Management', 'General/Business Unit Manager',
       'Help Desk & IT Support', 'Remuneration & Benefits',
       'Business Services & Corporate Advisory', 'Sales - Pre & Post',
       'Maintenance & Handyperson Services', 'Environment & Planning Law',
       'Security', 'Other', 'Financial Accounting & Reporting',
       'Management', 'Pharmacy', 'Consultants',
       'Psychology, Counselling & Social Work', 'Architects',
       'Compliance & Risk', 'Audit - External', 'Analysts',
       'Teaching - Tertiary', 'Audit - Internal',
       'Stockbroking & Trading', 'Corporate Finance & Investment Banking',
       'Digital & Search Marketing', 'Sales', 'Project Management',
       'Contracts Management', 'Financial Managers & Controllers',
       'Clinical/Medical Research', 'Business/Syst

In [33]:
# Distinct values found in the 'Title' column
df['Title'].unique()

array(['Fabricator/Installer', 'Boilermaker',
       'Casual Childcare Positions  | Bondi Junction', ...,
       'Head of Merchandise Planning',
       'IT Project/ Program Manager - Banking',
       'IT Test Analyst OR Senior IT Test Analyst'], dtype=object)