In [3]:
#!pip install pandas
#!pip install seaborn
import pandas as pd
import numpy as np
import random

# plotting
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
from nltk.probability import FreqDist # frequency 
from nltk.tokenize import word_tokenize # tokenize
from nltk.tag import pos_tag # POS tag
from nltk.stem import PorterStemmer # stemming
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # to remove stop words

# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.matcher import Matcher # linguistic pattern
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter # count frequent noun phrases
from sklearn.model_selection import train_test_split  

# onehot encoding
from sklearn.preprocessing import OneHotEncoder
import collections
from argparse import Namespace
import os
import string
import sklearn

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from torch.nn.utils import rnn as rnn_utils

# Data Exploration

In [210]:
dat = pd.read_csv('seek_australia.csv')

In [211]:
dat.head()

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
0,Retail & Consumer Products,Sydney,Frontline Executive Retail Sydney,AU,seek,Have you had 10 years experience in fresh pro...,Store Manager - Fresh Produce,Full Time,2018-04-15T23:13:45Z,$100k Base + Super + Benefits,North Shore & Northern Beaches,https://www.seek.com.au/job/35989382
1,Government & Defence,Brisbane,Powerlink,AU,seek,The Opportunity: The Client Solution Analyst ...,Client Solution Analyst,Full Time,2018-04-15T23:04:40Z,Excellent remuneration packages,Northern Suburbs,https://www.seek.com.au/job/35989272
2,Trades & Services,Sydney,Richard Jay Laundry,AU,seek,An innovative business development role for a...,Service Technician / Installer - NSW,Full Time,2018-04-15T23:04:31Z,,Parramatta & Western Suburbs,https://www.seek.com.au/job/35989270
3,Trades & Services,Melbourne,Adaptalift Hyster,AU,seek,About the role: We are seeking an Automotive W...,Workshop Technician I Material Handling Equipment,Full Time,2018-04-16T03:15:17Z,,Bayside & South Eastern Suburbs,https://www.seek.com.au/job/35993203
4,Trades & Services,Adelaide,Bakers Delight G&M,AU,seek,Â Early starts and weekend shifts. No experie...,APPRENTICESHIP JUNIOR BAKER,Full Time,2018-04-16T01:26:50Z,,,https://www.seek.com.au/job/35991578


In [212]:
dat.isnull().sum()

category               0
city                   0
company_name           0
geo                    0
job_board              0
job_description      345
job_title              0
job_type               0
post_date              0
salary_offered     21048
state              10820
url                    0
dtype: int64

In [213]:
dat = dat[dat['job_description'].notna()]

In [214]:
dat.isnull().sum()

category               0
city                   0
company_name           0
geo                    0
job_board              0
job_description        0
job_title              0
job_type               0
post_date              0
salary_offered     20811
state              10718
url                    0
dtype: int64

In [215]:
dat

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
0,Retail & Consumer Products,Sydney,Frontline Executive Retail Sydney,AU,seek,Have you had 10 years experience in fresh pro...,Store Manager - Fresh Produce,Full Time,2018-04-15T23:13:45Z,$100k Base + Super + Benefits,North Shore & Northern Beaches,https://www.seek.com.au/job/35989382
1,Government & Defence,Brisbane,Powerlink,AU,seek,The Opportunity: The Client Solution Analyst ...,Client Solution Analyst,Full Time,2018-04-15T23:04:40Z,Excellent remuneration packages,Northern Suburbs,https://www.seek.com.au/job/35989272
2,Trades & Services,Sydney,Richard Jay Laundry,AU,seek,An innovative business development role for a...,Service Technician / Installer - NSW,Full Time,2018-04-15T23:04:31Z,,Parramatta & Western Suburbs,https://www.seek.com.au/job/35989270
3,Trades & Services,Melbourne,Adaptalift Hyster,AU,seek,About the role: We are seeking an Automotive W...,Workshop Technician I Material Handling Equipment,Full Time,2018-04-16T03:15:17Z,,Bayside & South Eastern Suburbs,https://www.seek.com.au/job/35993203
4,Trades & Services,Adelaide,Bakers Delight G&M,AU,seek,Â Early starts and weekend shifts. No experie...,APPRENTICESHIP JUNIOR BAKER,Full Time,2018-04-16T01:26:50Z,,,https://www.seek.com.au/job/35991578
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,Hospitality & Tourism,Sydney,Radisson Blu Plaza Hotel Sydney,AU,seek,Hotel snapshot The Radisson Blu Plaza Sydney ...,Bar Supervisor,Full Time,2018-04-11T04:20:40Z,"Annualised salary, uniform + Super","CBD, Inner West & Eastern Suburbs",https://www.seek.com.au/job/35958503
29996,CEO & General Management,ACT,Airservices Australia,AU,seek,The Organisation Airservices is a government ...,Deputy Board Secretary,Full Time,2018-04-11T04:00:49Z,Salary package to be negotiated,,https://www.seek.com.au/job/35958100
29997,Accounting,Melbourne,The Hassett Group,AU,seek,ABOUT THE COMPANY AND ROLE Our client is one o...,Corporate Accountant,Full Time,2018-04-11T02:45:37Z,$110k Package On Offer!,CBD & Inner Suburbs,https://www.seek.com.au/job/35956991
29998,Government & Defence,ACT,SOS Recruitment,AU,seek,Long term contract for 12 months with possibl...,APS 6 & EL1 Account Managers,Contract/Temp,2018-04-11T04:55:16Z,,,https://www.seek.com.au/job/35959184


In [216]:
dat2 = dat.loc[:2019]

In [217]:
dat2.shape

(2000, 12)

In [218]:
data = dat2[['job_description', 'job_type', 'category']]

In [219]:
data.head()

Unnamed: 0,job_description,job_type,category
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence
2,An innovative business development role for a...,Full Time,Trades & Services
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services


In [220]:
data.isnull().sum()

job_description    0
job_type           0
category           0
dtype: int64

In [240]:
# data['job_description'].fillna("", inplace=True)
# we need to drop na otherwise the 
# data = data.dropna()
data['job_description_new'] = data['job_description'].map(lambda x: re.sub(r'\W+', ' ', x))
def remove_nonEglish(data):
    return re.sub("[^\x00-\x7F]+", "", data)

def remove_multiSpace(data):
    return re.sub(' +', ' ', data)

data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_nonEglish(x))
data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_multiSpace(x))

In [235]:
data.to_csv("seek_australia_2000.csv")

In [237]:
unique_category = data['category'].unique().tolist()

cate_desc_dict = {}

for cate in unique_category:
    cate = str(cate)
    desc_documents = data.loc[data['category'] == cate, 'job_description_new']
    # print(desc_documents)
    corpus = []
    for desc in desc_documents:
        # print(desc)
        corpus.append(str(desc))

    cate_desc_dict[cate] = corpus

In [238]:
len(cate_desc_dict)

30

# Pre-trained vectors

In [18]:
import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)

In [19]:
texts = cate_desc_dict["Self Employment"]
job_desc_token = []
for text in texts:
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        # ner_tagged_words = nltk.ne_chunk(nltk.pos_tag(words))
        # job_descip_token = job_descip_token + [ner_tagged_words]
        job_desc_token.extend(words)

In [20]:
len(job_desc_token)

279

In [21]:
len(set(job_desc_token))

185

In [22]:
cate_desc_dict["Self Employment"]

['Join a team you ll love within a company Australia loves At Aussie we pride ourselves on educating new to industry brokers developing the businesses of experienced Mortgage Brokers and helping people transition to a self employed opportunity they love What Aussie offers you Ongoing training development and support Comprehensive panel of lenders Leading technology Opportunity to work towards other Aussie channels such as franchise Free two year mentoring program for all new brokers Uncapped commission to build your own future Supportive vibrant team culture What does being a Mortgage Broker involve A broker s day is wide and varied A typical day could involve meeting with a prospective customer to review their financial situation and borrowing capacity liaising with lenders to track the progress of a loan managing the flow of documentation for multiple loan lodgements meeting with a referral partner to build up business generation networks overseeing the integrity of compliance proces

In [23]:
def read_corpus(category="Self Employment"):
    sents = []
    for text in cate_desc_dict[category]:
        for sentence in nltk.sent_tokenize(text):
            sent = [START_TOKEN] + [w.lower() for w in nltk.word_tokenize(sentence)] + [END_TOKEN]
            sents.append(sent)
    # return [[[START_TOKEN] + [w.lower() for w in nltk.word_tokenize(sentence)] + [END_TOKEN] for sentence in nltk.sent_tokenize(text)] for text in cate_desc_dict[category]]
    return sents

data_corpus = read_corpus()
pprint.pprint(data_corpus[:3], compact=True, width=100)

[['<START>', 'join', 'a', 'team', 'you', 'll', 'love', 'within', 'a', 'company', 'australia',
  'loves', 'at', 'aussie', 'we', 'pride', 'ourselves', 'on', 'educating', 'new', 'to', 'industry',
  'brokers', 'developing', 'the', 'businesses', 'of', 'experienced', 'mortgage', 'brokers', 'and',
  'helping', 'people', 'transition', 'to', 'a', 'self', 'employed', 'opportunity', 'they', 'love',
  'what', 'aussie', 'offers', 'you', 'ongoing', 'training', 'development', 'and', 'support',
  'comprehensive', 'panel', 'of', 'lenders', 'leading', 'technology', 'opportunity', 'to', 'work',
  'towards', 'other', 'aussie', 'channels', 'such', 'as', 'franchise', 'free', 'two', 'year',
  'mentoring', 'program', 'for', 'all', 'new', 'brokers', 'uncapped', 'commission', 'to', 'build',
  'your', 'own', 'future', 'supportive', 'vibrant', 'team', 'culture', 'what', 'does', 'being', 'a',
  'mortgage', 'broker', 'involve', 'a', 'broker', 's', 'day', 'is', 'wide', 'and', 'varied', 'a',
  'typical', 'day', 'coul

In [24]:
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1
    
    # ------------------
    # Write your implementation here.
    corpus_words = sorted(list(set([y for x in corpus for y in x])))
#     corpus_words = [y for x in corpus for y in x] 
#     corpus_words = list(set(corpus_words)) # unique words 
#     corpus_words = sorted(corpus_words) # sorts
    num_corpus_words = len(corpus_words)
    # ------------------

    return corpus_words, num_corpus_words

test_corpus_words, num_corpus_words = distinct_words(data_corpus)

In [25]:
test_corpus_words

['<END>',
 '<START>',
 'a',
 'achieve',
 'activities',
 'aggregator',
 'all',
 'also',
 'am',
 'and',
 'appointments',
 'as',
 'at',
 'aussie',
 'australia',
 'be',
 'being',
 'borrowing',
 'broker',
 'brokers',
 'build',
 'built',
 'business',
 'businesses',
 'busy',
 'capacity',
 'channels',
 'check',
 'child',
 'children',
 'clients',
 'commission',
 'company',
 'compliance',
 'comprehensive',
 'could',
 'culture',
 'customer',
 'daily',
 'day',
 'developing',
 'development',
 'do',
 'documentation',
 'does',
 'educating',
 'employed',
 'experience',
 'experienced',
 'families',
 'financial',
 'first',
 'flexibility',
 'flow',
 'for',
 'franchise',
 'free',
 'from',
 'future',
 'generation',
 'goals',
 'have',
 'help',
 'helping',
 'highly',
 'how',
 'i',
 'industry',
 'info',
 'integrity',
 'involve',
 'is',
 'it',
 'job',
 'join',
 'know',
 'leading',
 'leave',
 'lenders',
 'liaising',
 'lifestyle',
 'll',
 'loan',
 'lodgements',
 'looking',
 'love',
 'loves',
 'managing',
 'mater

In [26]:
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import gensim.downloader as api
from bokeh.plotting import figure, show, output_file
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet
wv = api.load('word2vec-google-news-300')

[--------------------------------------------------] 1.5% 24.5/1662.8MB downloaded

KeyboardInterrupt: 

In [27]:
# pretrained model
wv = api.load('word2vec-google-news-300')

In [38]:
def interactive_tsne(text_labels, tsne_array):
    '''makes an interactive scatter plot with text labels for each point'''

    # Define a dataframe to be used by bokeh context
    bokeh_df = pd.DataFrame(tsne_array, text_labels, columns=['x','y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # interactive controls to include to the plot
    TOOLS="hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"
    output_file("plot.html")

    p = figure(tools=TOOLS, plot_width=700, plot_height=700)

    # define data source for the plot
    source = ColumnDataSource(bokeh_df)

    # scatter plot
    p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

    # text labels
    labels = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    p.add_layout(labels)

    # show plot inline
    output_notebook()
    show(p)

In [118]:
# stopwords = stopwords.words('english')
# vocab = test_corpus_words
# input_vocab =  [word for word in vocab if word in wv.key_to_index.keys() and word not in stopwords]
vocab = test_corpus_words
input_vocab =  [word for word in vocab if word in wv.key_to_index.keys()]
X = wv[input_vocab]
# find tsne coords for 2 dimensions
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

print(input_vocab)

points = len(input_vocab)
interactive_tsne(list(input_vocab)[:points], X_tsne)



['achieve', 'activities', 'aggregator', 'all', 'also', 'am', 'appointments', 'as', 'at', 'aussie', 'australia', 'be', 'being', 'borrowing', 'broker', 'brokers', 'build', 'built', 'business', 'businesses', 'busy', 'capacity', 'channels', 'check', 'child', 'children', 'clients', 'commission', 'company', 'compliance', 'comprehensive', 'could', 'culture', 'customer', 'daily', 'day', 'developing', 'development', 'do', 'documentation', 'does', 'educating', 'employed', 'experience', 'experienced', 'families', 'financial', 'first', 'flexibility', 'flow', 'for', 'franchise', 'free', 'from', 'future', 'generation', 'goals', 'have', 'help', 'helping', 'highly', 'how', 'i', 'industry', 'info', 'integrity', 'involve', 'is', 'it', 'job', 'join', 'know', 'leading', 'leave', 'lenders', 'liaising', 'lifestyle', 'll', 'loan', 'lodgements', 'looking', 'love', 'loves', 'managing', 'maternity', 'meeting', 'melanie', 'mentoring', 'mobile', 'more', 'mortgage', 'mother', 'motivated', 'multiple', 'mum', 'my', 

# Purposely trained vectors

In [25]:
import gensim
import re
from gensim.corpora import Dictionary

In [28]:
def vectorizer(category):
    document = cate_desc_dict[category]
    doc_tokenized = gensim.utils.simple_preprocess(str(document), deacc=True)
    # print(doc_tokenized[:10])
    return doc_tokenized

In [29]:
doc_tokenized = vectorizer("Self Employment")

In [30]:
print(doc_tokenized[:10])

['join', 'team', 'you', 'll', 'love', 'within', 'company', 'australia', 'loves', 'at']


In [31]:
from gensim.models import Word2Vec

cores = 16
model = Word2Vec(min_count=1,
                     window=2,
                     vector_size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [32]:
from time import time

t = time()

model.build_vocab([doc_tokenized], progress_per=10)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.0 mins


In [33]:
t = time()

model.train(test_corpus_words, total_examples=model.corpus_count, epochs=1000, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.12 mins


In [34]:
model.wv.key_to_index.keys()

dict_keys(['to', 'with', 'and', 'of', 'for', 'mortgage', 'my', 'the', 'brokers', 'what', 'broker', 'being', 'aussie', 'other', 'business', 'loan', 'offers', 'ongoing', 'training', 'support', 'perfect', 'their', 'financial', 'meeting', 'day', 'help', 'involve', 'working', 'now', 'no', 'looking', 'build', 'future', 'two', 'lenders', 'love', 'employed', 'self', 'team', 'you', 'people', 'opportunity', 'on', 'we', 'new', 'they', 'supportive', 'at', 'your', 'own', 'culture', 'pride', 'vibrant', 'company', 'does', 'loves', 'australia', 'uncapped', 'within', 'is', 'wide', 'varied', 'typical', 'could', 'll', 'commission', 'mentoring', 'all', 'towards', 'transition', 'development', 'helping', 'experienced', 'comprehensive', 'panel', 'prospective', 'leading', 'technology', 'work', 'businesses', 'ourselves', 'channels', 'developing', 'such', 'as', 'industry', 'franchise', 'free', 'educating', 'year', 'program', 'video', 'borrowing', 'customer', 'job', 'supports', 'also', 'it', 'flexibility', 'life

In [39]:
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# vocab = ['company', 'for', 'client', 'is', 'we', 'object', 'seeking', 'automotive']
# input_vocab =  [word for word in vocab if word in model.wv.key_to_index.keys() and word not in stop_words]
input_vocab = model.wv.index_to_key
X = model.wv[input_vocab]
# find tsne coords for 2 dimensions
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

print(input_vocab)

points = len(input_vocab)
interactive_tsne(list(input_vocab)[:points], X_tsne)



['to', 'with', 'and', 'of', 'for', 'mortgage', 'my', 'the', 'brokers', 'what', 'broker', 'being', 'aussie', 'other', 'business', 'loan', 'offers', 'ongoing', 'training', 'support', 'perfect', 'their', 'financial', 'meeting', 'day', 'help', 'involve', 'working', 'now', 'no', 'looking', 'build', 'future', 'two', 'lenders', 'love', 'employed', 'self', 'team', 'you', 'people', 'opportunity', 'on', 'we', 'new', 'they', 'supportive', 'at', 'your', 'own', 'culture', 'pride', 'vibrant', 'company', 'does', 'loves', 'australia', 'uncapped', 'within', 'is', 'wide', 'varied', 'typical', 'could', 'll', 'commission', 'mentoring', 'all', 'towards', 'transition', 'development', 'helping', 'experienced', 'comprehensive', 'panel', 'prospective', 'leading', 'technology', 'work', 'businesses', 'ourselves', 'channels', 'developing', 'such', 'as', 'industry', 'franchise', 'free', 'educating', 'year', 'program', 'video', 'borrowing', 'customer', 'job', 'supports', 'also', 'it', 'flexibility', 'lifestyle', 's

# Data preprocessing

In [242]:
data

Unnamed: 0,job_description,job_type,category,job_type_new,job_description_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Full Time,Have you had 10 years experience in fresh pro...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,Full Time,The Opportunity The Client Solution Analyst p...
2,An innovative business development role for a...,Full Time,Trades & Services,Full Time,An innovative business development role for a...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,Full Time,About the role We are seeking an Automotive Wo...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Full Time,Early starts and weekend shifts No experience...
...,...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,Full Time,2 positions available Based in Kelmscott and ...
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,Full Time,North Bondi Fish Located a few short steps fro...
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Other,Process Workers South Gippsland Rapidly growi...
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,Full Time,JPS Australia s leading Consultancy for the P...


Reindex the dataset

In [243]:
data = data.reset_index()
data = data.drop(['index'], axis = 1)

In [244]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

### Splitting data for task 1

In [245]:
data

Unnamed: 0,job_description,job_type,category,job_type_new,job_description_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Full Time,Have you had 10 years experience in fresh pro...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,Full Time,The Opportunity The Client Solution Analyst p...
2,An innovative business development role for a...,Full Time,Trades & Services,Full Time,An innovative business development role for a...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,Full Time,About the role We are seeking an Automotive Wo...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Full Time,Early starts and weekend shifts No experience...
...,...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,Full Time,2 positions available Based in Kelmscott and ...
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,Full Time,North Bondi Fish Located a few short steps fro...
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Other,Process Workers South Gippsland Rapidly growi...
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,Full Time,JPS Australia s leading Consultancy for the P...


In [246]:
data["job_type_new"] = np.where(data["job_type"] == "Full Time", "Full Time", "Other")

In [247]:
data

Unnamed: 0,job_description,job_type,category,job_type_new,job_description_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Full Time,Have you had 10 years experience in fresh pro...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,Full Time,The Opportunity The Client Solution Analyst p...
2,An innovative business development role for a...,Full Time,Trades & Services,Full Time,An innovative business development role for a...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,Full Time,About the role We are seeking an Automotive Wo...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Full Time,Early starts and weekend shifts No experience...
...,...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,Full Time,2 positions available Based in Kelmscott and ...
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,Full Time,North Bondi Fish Located a few short steps fro...
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Other,Process Workers South Gippsland Rapidly growi...
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,Full Time,JPS Australia s leading Consultancy for the P...


In [248]:
args = Namespace(
    raw_dataset_csv="seek_australia_2000.csv",
    train_proportion=0.7,
    val_proportion=0.10,
    test_proportion=0.20,
    output_munged_csv="ass02_task01.csv",
    seed=1337
)

In [249]:
# Splitting train by job_type
# Create dict
by_type = collections.defaultdict(list)
for _, row in data.iterrows():
    by_type[row.job_type_new].append(row.to_dict())

In [250]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_type.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion*n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [251]:
split_task1 = pd.DataFrame(final_list)
split_task1.split.value_counts()
# len(split_task1) = 2000

train    1399
test      402
val       199
Name: split, dtype: int64

In [252]:
set(split_task1.split)

{'test', 'train', 'val'}

In [253]:
split_task1 = split_task1[["category", "job_description_new", "job_type_new", "split"]]

In [254]:
split_task1[pd.isnull(split_task1.job_description_new)]

Unnamed: 0,category,job_description_new,job_type_new,split


In [255]:
split_task1

Unnamed: 0,category,job_description_new,job_type_new,split
0,Construction,About Laing O Rourke Laing O Rourke is a 6 bi...,Full Time,train
1,Call Centre & Customer Service,Probate Recovery Specialists Phillips Cohen A...,Full Time,train
2,Sales,Global FMCG giant premium brands and multiple...,Full Time,train
3,Hospitality & Tourism,ENDLESS PROGRESSION IN THE EVENTS WORLD MANAG...,Full Time,train
4,Administration & Office Support,This rapidly growing Company provides excepti...,Full Time,train
...,...,...,...,...
1995,Hospitality & Tourism,A business in Sutherland is seeking a casual ...,Other,test
1996,Information & Communication Technology,We are seeking an experienced Data Migration ...,Other,test
1997,Retail & Consumer Products,Steve s Liquor is a growing liquor retailer w...,Other,test
1998,Healthcare & Medical,Enthusiastic and committed Disabiltiy Support...,Other,test


In [256]:
pd.Series(dict(FreqDist(split_task1.job_type_new)))

Full Time    1383
Other         617
dtype: int64

In [257]:
split_task1.dtypes

category               object
job_description_new    object
job_type_new           object
split                  object
dtype: object

In [258]:
split_task1.to_csv(args.output_munged_csv, index=False)

### Splitting data for task 2

In [259]:
# Splitting train by category
# Create dict
by_cate = collections.defaultdict(list)
for _, row in data.iterrows():
    by_cate[row.category].append(row.to_dict())

In [260]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_cate.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion*n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [261]:
split_task2 = pd.DataFrame(final_list)
split_task2.split.value_counts()

train    1387
test      425
val       188
Name: split, dtype: int64

In [262]:
set(split_task1.split)

{'test', 'train', 'val'}

In [263]:
split_task2

Unnamed: 0,job_description,job_type,category,job_type_new,job_description_new,split
0,The Company We are currently working with a G...,Contract/Temp,Accounting,Other,The Company We are currently working with a G...,train
1,About the Company My clientÂ is a global mark...,Full Time,Accounting,Full Time,About the Company My client is a global marke...,train
2,About the business After two years in operati...,Part Time,Accounting,Other,About the business After two years in operati...,train
3,The Firm This firm has been in businessÂ for ...,Full Time,Accounting,Full Time,The Firm This firm has been in business for o...,train
4,Project Finance Manager Â The Opportunity At...,Full Time,Accounting,Full Time,Project Finance Manager The Opportunity At Pw...,train
...,...,...,...,...,...,...
1995,LaseMedics are going through a period of expa...,Part Time,Trades & Services,Other,LaseMedics are going through a period of expa...,test
1996,Hays Trades & Labour require carpenters for a...,Contract/Temp,Trades & Services,Other,Hays Trades Labour require carpenters for an ...,test
1997,About Us GJK Facility Services is one of the ...,Full Time,Trades & Services,Full Time,About Us GJK Facility Services is one of the ...,test
1998,"Well presented, professional,Â highly motivat...",Full Time,Trades & Services,Full Time,Well presented professional highly motivated ...,test


### Select Top 10 Words For Each Category And Job Description

In [297]:
data2 = data.loc[data['category'] == 'Accounting']
data2 = data2.reset_index(drop=True)
job_description_text = data2.loc[:,'job_description_new']


In [298]:
job_description_text

0       Insolvency Intermediate or Senior Level Job C...
1       Sydney CBD office Great Opportunity for caree...
2       The Company We are currently working with a G...
3       This progressive business is seeking a skille...
4       The Country Fire Authority CFA is one of the ...
                             ...                        
103     The Business Our client is a growing Financia...
104     The Organisation Our client is a high profile...
105     TNR have a fantastic opportunity for an exper...
106    Join well established financial advisory firm ...
107     Join a well respected boutique accounting com...
Name: job_description_new, Length: 108, dtype: object

In [299]:
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) 

In [300]:
from nltk.corpus import stopwords
doc_tokenized = list(sent_to_words(job_description_text))
dictionary = Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
BoW_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 3),
  (7, 6),
  (8, 1),
  (9, 1),
  (10, 2),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 1),
  (19, 3),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 2),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 6),
  (56, 1),
  (57, 2),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 3),
  (64, 1),
  (65, 8),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 5),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 3),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 2),
  (83, 1),
  (84, 2),
  (85, 1),
  (86, 1),
  (87, 2),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

In [301]:
len(BoW_corpus[1])

296

In [302]:
tfidf = TfidfModel(BoW_corpus, smartirs='ntc')

In [303]:
from heapq import nlargest
words = []
for corpus_idx in range(len(tfidf[BoW_corpus])):
    # chose 10 smallest TFIDF values and remove them
    maximun_tfidf = tfidf[BoW_corpus[corpus_idx]]
    maximun_tfidf.sort(key=lambda x: x[1], reverse=True)

In [304]:
top_words = []
for ele in maximun_tfidf:
    cur_row = dictionary[ele[0]].strip()
    top_words.append(cur_row)

In [305]:
def convert(lst):
    return ([i for item in lst for i in item.split()])

In [306]:
for i, job_descr in enumerate(job_description_text):
    arr = []
    for word in top_words:        
        word = word.lower()
        if word in job_descr:
            arr.append(word)
            if len(arr) == 10:
                break
    listToStr = ' '.join([str(elem) for elem in arr])
    data2.loc[[i],'top_words'] = str(listToStr)

In [307]:
data2

Unnamed: 0,job_description,job_type,category,job_type_new,job_description_new,top_words
0,"Insolvency Intermediate or Senior Level Job, ...",Full Time,Accounting,Full Time,Insolvency Intermediate or Senior Level Job C...,ward accounting look along firm control no cre...
1,Sydney CBD office Great Opportunity for caree...,Full Time,Accounting,Full Time,Sydney CBD office Great Opportunity for caree...,break put use health service control no per af...
2,The Company We are currently working with a G...,Contract/Temp,Accounting,Other,The Company We are currently working with a G...,they respected service no credit access per re...
3,This progressive business is seeking a skille...,Full Time,Accounting,Full Time,This progressive business is seeking a skille...,ward pride they accounting look along service ...
4,The Country Fire Authority (CFA) is one of th...,Full Time,Accounting,Full Time,The Country Fire Authority CFA is one of the ...,put hours member service erp per functions tak...
...,...,...,...,...,...,...
103,The Business Our client is a growing Financia...,Full Time,Accounting,Full Time,The Business Our client is a growing Financia...,no per offering grow send taxation return seek...
104,The Organisation Our client is a high profile...,Full Time,Accounting,Full Time,The Organisation Our client is a high profile...,every they member no per grow ideal relevant f...
105,TNR have a fantastic opportunity for an exper...,Full Time,Accounting,Full Time,TNR have a fantastic opportunity for an exper...,ward look interesting firm service no per taxa...
106,Join well established financial advisory firm ...,Full Time,Accounting,Full Time,Join well established financial advisory firm ...,companies they accounting establish use health...


### Dataset for Task 1 Feed Forward Neural Network

In [None]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        return cls(task1_df, OneHotVectorizer.from_dataframe(task1_df))
    def get_vectorizer(self):
        return self._vectorizer
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    def __len__(self):
        return self._target_size
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words)
        job_type_index = \
        self._vectorizer.job_type_vocab.lookup_token(row.job_type_new)
        return {'x_data': top_10_vector,
                'y_target': job_type_index}
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

### Vectorizer for Task 1 Feed Forward Neural Network

In [None]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_vocab, job_type_vocab):
        self.top_10_vocab = top_10_vocab
        self.job_type_vocab = job_type_vocab
    def vectorize(self, nar):
        one_hot = np.zeros(len(self.top_10_vocab), dtype=np.float32)
        for token in nar.split(" "):
            if token not in string.punctuation:
                one_hot[self.top_10_vocab.lookup_token(token)] = 1
        return one_hot
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        top_10_vocab = Vocabulary(add_unk=True)
        job_type_vocab = Vocabulary(add_unk=False)
        # Add job_types
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_vocab.add_token(job_type)
        # Add top words if count > provided count
        word_counts = Counter()
        for nar in task1_df.top_10_words:
            for word in nar.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                top_10_vocab.add_token(word)
        return cls(top_10_vocab, job_type_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_vocab = Vocabulary.from_serializable(contents['top_10_vocab'])
        job_type_vocab = Vocabulary.from_serializable(contents['job_type_vocab'])
        return cls(top_10_vocab=top_10_vocab, job_type_vocab=job_type_vocab)
    def to_serializable(self):
        return {'top_10_vocab': self.top_10_vocab.to_serializable(),
                'job_type_vocab': self.job_type_vocab.to_serializable()}

# Task 1 Binary Document Classification

## One-Hot Encoding Vectorizers and Feed-Forward Neural Network Model

In [296]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        return cls(task1_df, OneHotVectorizer.from_dataframe(task1_df))
    def get_vectorizer(self):
        return self._vectorizer
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    def __len__(self):
        return self._target_size
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words)
        job_type_index = \
        self._vectorizer.job_type_vocab.lookup_token(row.job_type_new)
        return {'x_data': top_10_vector,
                'y_target': job_type_index}
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

In [297]:
# Vocabulary
class Vocabulary(object):
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                                for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = 1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    def __len__(self):
        return len(self._token_to_idx)

In [298]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_vocab, job_type_vocab):
        self.top_10_vocab = top_10_vocab
        self.job_type_vocab = job_type_vocab
    def vectorize(self, nar):
        one_hot = np.zeros(len(self.top_10_vocab), dtype=np.float32)
        for token in nar.split(" "):
            if token not in string.punctuation:
                one_hot[self.top_10_vocab.lookup_token(token)] = 1
        return one_hot
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        top_10_vocab = Vocabulary(add_unk=True)
        job_type_vocab = Vocabulary(add_unk=False)
        # Add job_types
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_vocab.add_token(job_type)
        # Add top words if count > provided count
        word_counts = Counter()
        for nar in task1_df.top_10_words:
            for word in nar.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                top_10_vocab.add_token(word)
        return cls(top_10_vocab, job_type_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_vocab = Vocabulary.from_serializable(contents['top_10_vocab'])
        job_type_vocab = Vocabulary.from_serializable(contents['job_type_vocab'])
        return cls(top_10_vocab=top_10_vocab, job_type_vocab=job_type_vocab)
    def to_serializable(self):
        return {'top_10_vocab': self.top_10_vocab.to_serializable(),
                'job_type_vocab': self.job_type_vocab.to_serializable()}

In [299]:
# Dataloader
from torch.utils.data import DataLoader
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, 
                            shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [300]:
# Perceptron Classifier
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardClassifier(nn.Module):
    def __init__(self, num_features):
        super(FeedForwardClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
    
    def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

In [301]:
# Initial Setup
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=15,
    model_state_file='model.pth',
    task1_csv="ass02_task01.csv",
    save_dir='model_storage/task01/',
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    cuda=True,
    device='cuda',
)

In [302]:
# Training preparation
import torch
import torch.optim as optim
import pandas as pd

def make_train_state(args):
    return {'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': 1,
            'test_acc': 1}

train_state = make_train_state(args)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# dataset and vectorizer
dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv)
vectorizer = dataset.get_vectorizer()
# model
classifier = FeedForwardClassifier(num_features=len(vectorizer.top_10_vocab))
classifier = classifier.to(args.device)
# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [303]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [304]:
# Training
import numpy as np
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    # Iterate over training dataset
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is 5 steps:
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index + 1)
        # step 4. use loss to produce gradients
        loss.backward()
        # step 5. use optimizer to take gradient step
        optimizer.step()
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # step 1. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        # step 2. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

In [305]:
# Evaluation
dataset.set_split('test')
batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [306]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))
# Test loss: 0.611
# Test Accuracy: 69.01

Test loss: 0.609
Test Accuracy: 68.75


In [307]:
# Inference and Classifying new data points
import re
# Preprocess the reviews
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
def predict_job_type(top_10_word, classifier, vectorizer, decision_threshold=0.5):
    top_10_word = preprocess_text(top_10_word)
    vectorized_top_10_word= torch.tensor(vectorizer.vectorize(top_10_word)).to(args.device)
    result = classifier(vectorized_top_10_word.view(1, -1))
    probability_value = torch.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0
    return vectorizer.job_type_vocab.lookup_index(index)

In [308]:
# test_top_10_word can be changed to other string to obtain the prediction.
test_top_10_word = split_task1.loc[5, 'top_10_words']
split_task1.loc[5, 'job_type_new']

'Full Time'

In [309]:
test_top_10_word = "we are looking for someone wants a full time job"

In [310]:
prediction = predict_job_type(test_top_10_word, classifier, vectorizer)
print("{} > {}".format(test_top_10_word, prediction))

we are looking for someone wants a full time job > Full Time


In [311]:
# Inspecting model weights
# Sort weights
fc1_weights = classifier.fc1.weight.to('cpu').detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

In [314]:
# Top 20 full time job type words
print("Influential words in full time job type words:")
print("")
for i in range(10):
    print(vectorizer.top_10_vocab.lookup_index(indices[i]))

Influential words in full time job type words:

warrants
witness
el
surveying
<UNK>
oriented
repayments
bigger
regulations
substantially


In [315]:
# Top 20 other job type words
print("Influential words in other job type:")
print("")
indices.reverse()
for i in range(10):
    print(vectorizer.top_10_vocab.lookup_index(indices[i])) 

Influential words in other job type:

afternoon
white
cutting
accessible
independence
analysts
affordable
suspected
treating
refer
