In [1]:
#!pip install pandas
#!pip install seaborn
#!pip install wordcloud
import pandas as pd
import numpy as np
import random
import json

# plotting
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
from nltk.probability import FreqDist # frequency 
from nltk.tokenize import word_tokenize # tokenize
from nltk.tag import pos_tag # POS tag
from nltk.stem import PorterStemmer # stemming
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # to remove stop words

# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.matcher import Matcher # linguistic pattern
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter # count frequent noun phrases
from sklearn.model_selection import train_test_split  

# onehot encoding
from sklearn.preprocessing import OneHotEncoder
import collections
from argparse import Namespace
import os
import string
import sklearn

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from torch.nn.utils import rnn as rnn_utils

# Data Exploration

In [2]:
dat = pd.read_csv('seek_australia.csv')

In [3]:
dat.head()

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
0,Retail & Consumer Products,Sydney,Frontline Executive Retail Sydney,AU,seek,Have you had 10 years experience in fresh pro...,Store Manager - Fresh Produce,Full Time,2018-04-15T23:13:45Z,$100k Base + Super + Benefits,North Shore & Northern Beaches,https://www.seek.com.au/job/35989382
1,Government & Defence,Brisbane,Powerlink,AU,seek,The Opportunity: The Client Solution Analyst ...,Client Solution Analyst,Full Time,2018-04-15T23:04:40Z,Excellent remuneration packages,Northern Suburbs,https://www.seek.com.au/job/35989272
2,Trades & Services,Sydney,Richard Jay Laundry,AU,seek,An innovative business development role for a...,Service Technician / Installer - NSW,Full Time,2018-04-15T23:04:31Z,,Parramatta & Western Suburbs,https://www.seek.com.au/job/35989270
3,Trades & Services,Melbourne,Adaptalift Hyster,AU,seek,About the role: We are seeking an Automotive W...,Workshop Technician I Material Handling Equipment,Full Time,2018-04-16T03:15:17Z,,Bayside & South Eastern Suburbs,https://www.seek.com.au/job/35993203
4,Trades & Services,Adelaide,Bakers Delight G&M,AU,seek,Â Early starts and weekend shifts. No experie...,APPRENTICESHIP JUNIOR BAKER,Full Time,2018-04-16T01:26:50Z,,,https://www.seek.com.au/job/35991578


In [4]:
dat.isnull().sum()

category               0
city                   0
company_name           0
geo                    0
job_board              0
job_description      345
job_title              0
job_type               0
post_date              0
salary_offered     21048
state              10820
url                    0
dtype: int64

In [5]:
dat = dat[dat['job_description'].notna()]

In [6]:
dat.isnull().sum()

category               0
city                   0
company_name           0
geo                    0
job_board              0
job_description        0
job_title              0
job_type               0
post_date              0
salary_offered     20811
state              10718
url                    0
dtype: int64

In [7]:
dat

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
0,Retail & Consumer Products,Sydney,Frontline Executive Retail Sydney,AU,seek,Have you had 10 years experience in fresh pro...,Store Manager - Fresh Produce,Full Time,2018-04-15T23:13:45Z,$100k Base + Super + Benefits,North Shore & Northern Beaches,https://www.seek.com.au/job/35989382
1,Government & Defence,Brisbane,Powerlink,AU,seek,The Opportunity: The Client Solution Analyst ...,Client Solution Analyst,Full Time,2018-04-15T23:04:40Z,Excellent remuneration packages,Northern Suburbs,https://www.seek.com.au/job/35989272
2,Trades & Services,Sydney,Richard Jay Laundry,AU,seek,An innovative business development role for a...,Service Technician / Installer - NSW,Full Time,2018-04-15T23:04:31Z,,Parramatta & Western Suburbs,https://www.seek.com.au/job/35989270
3,Trades & Services,Melbourne,Adaptalift Hyster,AU,seek,About the role: We are seeking an Automotive W...,Workshop Technician I Material Handling Equipment,Full Time,2018-04-16T03:15:17Z,,Bayside & South Eastern Suburbs,https://www.seek.com.au/job/35993203
4,Trades & Services,Adelaide,Bakers Delight G&M,AU,seek,Â Early starts and weekend shifts. No experie...,APPRENTICESHIP JUNIOR BAKER,Full Time,2018-04-16T01:26:50Z,,,https://www.seek.com.au/job/35991578
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,Hospitality & Tourism,Sydney,Radisson Blu Plaza Hotel Sydney,AU,seek,Hotel snapshot The Radisson Blu Plaza Sydney ...,Bar Supervisor,Full Time,2018-04-11T04:20:40Z,"Annualised salary, uniform + Super","CBD, Inner West & Eastern Suburbs",https://www.seek.com.au/job/35958503
29996,CEO & General Management,ACT,Airservices Australia,AU,seek,The Organisation Airservices is a government ...,Deputy Board Secretary,Full Time,2018-04-11T04:00:49Z,Salary package to be negotiated,,https://www.seek.com.au/job/35958100
29997,Accounting,Melbourne,The Hassett Group,AU,seek,ABOUT THE COMPANY AND ROLE Our client is one o...,Corporate Accountant,Full Time,2018-04-11T02:45:37Z,$110k Package On Offer!,CBD & Inner Suburbs,https://www.seek.com.au/job/35956991
29998,Government & Defence,ACT,SOS Recruitment,AU,seek,Long term contract for 12 months with possibl...,APS 6 & EL1 Account Managers,Contract/Temp,2018-04-11T04:55:16Z,,,https://www.seek.com.au/job/35959184


In [8]:
dat2 = dat.loc[:2019]

In [9]:
dat2.shape

(2000, 12)

In [10]:
data = dat2[['job_description', 'job_type', 'category']]

In [11]:
data.head()

Unnamed: 0,job_description,job_type,category
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence
2,An innovative business development role for a...,Full Time,Trades & Services
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services


In [12]:
data.isnull().sum()

job_description    0
job_type           0
category           0
dtype: int64

In [13]:
# data['job_description'].fillna("", inplace=True)
# we need to drop na otherwise the 
# data = data.dropna()
data['job_description_new'] = data['job_description'].map(lambda x: re.sub(r'\W+', ' ', x))
def remove_nonEglish(data):
    return re.sub("[^\x00-\x7F]+", "", data)

def remove_multiSpace(data):
    return re.sub(' +', ' ', data)

data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_nonEglish(x))
data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_multiSpace(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_description_new'] = data['job_description'].map(lambda x: re.sub(r'\W+', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_nonEglish(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_descript

In [14]:
data.to_csv("seek_australia_2000.csv")

In [15]:
data['job_description_new'][0]

' Have you had 10 years experience in fresh produce that wants to manage their own store for a family owned Australian company that is passionate about food We are looking for Must have 10 years in the fresh food business and have the passion for the role Current 2IC looking to progress with training into Store manager role Excellent customer service and communication skills Be hands on and have a can do attitude Be into the fresh food business and have the passion for the role Hardworking ambitious and competitive people who are passionate about good food Are able to maximise the financial return in their market ensuring it meets sales margin and wages budgets Have exceptional merchandising capabilities and customer service skills helping us to create unique shopping experiences for our customers Have a wealth of knowledge of fresh food retailing and a willingness to share this knowledge Can lead manage and motivate a teams Must be able to work weekend and use to early starts which is

In [16]:
unique_category = data['category'].unique().tolist()

cate_desc_dict = {}

for cate in unique_category:
    cate = str(cate)
    desc_documents = data.loc[data['category'] == cate, 'job_description_new']
    # print(desc_documents)
    corpus = []
    for desc in desc_documents:
        # print(desc)
        corpus.append(str(desc))

    cate_desc_dict[cate] = corpus

In [17]:
len(cate_desc_dict)

30

# Pre-trained vectors

In [18]:
import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)

In [19]:
texts = cate_desc_dict["Self Employment"]
job_desc_token = []
for text in texts:
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        # ner_tagged_words = nltk.ne_chunk(nltk.pos_tag(words))
        # job_descip_token = job_descip_token + [ner_tagged_words]
        job_desc_token.extend(words)

In [20]:
len(job_desc_token)

279

In [21]:
len(set(job_desc_token))

185

In [22]:
cate_desc_dict["Self Employment"]

['Join a team you ll love within a company Australia loves At Aussie we pride ourselves on educating new to industry brokers developing the businesses of experienced Mortgage Brokers and helping people transition to a self employed opportunity they love What Aussie offers you Ongoing training development and support Comprehensive panel of lenders Leading technology Opportunity to work towards other Aussie channels such as franchise Free two year mentoring program for all new brokers Uncapped commission to build your own future Supportive vibrant team culture What does being a Mortgage Broker involve A broker s day is wide and varied A typical day could involve meeting with a prospective customer to review their financial situation and borrowing capacity liaising with lenders to track the progress of a loan managing the flow of documentation for multiple loan lodgements meeting with a referral partner to build up business generation networks overseeing the integrity of compliance proces

In [23]:
def read_corpus(category="Self Employment"):
    sents = []
    for text in cate_desc_dict[category]:
        for sentence in nltk.sent_tokenize(text):
            sent = [START_TOKEN] + [w.lower() for w in nltk.word_tokenize(sentence)] + [END_TOKEN]
            sents.append(sent)
    # return [[[START_TOKEN] + [w.lower() for w in nltk.word_tokenize(sentence)] + [END_TOKEN] for sentence in nltk.sent_tokenize(text)] for text in cate_desc_dict[category]]
    return sents

data_corpus = read_corpus()
pprint.pprint(data_corpus[:3], compact=True, width=100)

[['<START>', 'join', 'a', 'team', 'you', 'll', 'love', 'within', 'a', 'company', 'australia',
  'loves', 'at', 'aussie', 'we', 'pride', 'ourselves', 'on', 'educating', 'new', 'to', 'industry',
  'brokers', 'developing', 'the', 'businesses', 'of', 'experienced', 'mortgage', 'brokers', 'and',
  'helping', 'people', 'transition', 'to', 'a', 'self', 'employed', 'opportunity', 'they', 'love',
  'what', 'aussie', 'offers', 'you', 'ongoing', 'training', 'development', 'and', 'support',
  'comprehensive', 'panel', 'of', 'lenders', 'leading', 'technology', 'opportunity', 'to', 'work',
  'towards', 'other', 'aussie', 'channels', 'such', 'as', 'franchise', 'free', 'two', 'year',
  'mentoring', 'program', 'for', 'all', 'new', 'brokers', 'uncapped', 'commission', 'to', 'build',
  'your', 'own', 'future', 'supportive', 'vibrant', 'team', 'culture', 'what', 'does', 'being', 'a',
  'mortgage', 'broker', 'involve', 'a', 'broker', 's', 'day', 'is', 'wide', 'and', 'varied', 'a',
  'typical', 'day', 'coul

In [24]:
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1
    
    # ------------------
    # Write your implementation here.
    corpus_words = sorted(list(set([y for x in corpus for y in x])))
#     corpus_words = [y for x in corpus for y in x] 
#     corpus_words = list(set(corpus_words)) # unique words 
#     corpus_words = sorted(corpus_words) # sorts
    num_corpus_words = len(corpus_words)
    # ------------------

    return corpus_words, num_corpus_words

test_corpus_words, num_corpus_words = distinct_words(data_corpus)

In [25]:
test_corpus_words

['<END>',
 '<START>',
 'a',
 'achieve',
 'activities',
 'aggregator',
 'all',
 'also',
 'am',
 'and',
 'appointments',
 'as',
 'at',
 'aussie',
 'australia',
 'be',
 'being',
 'borrowing',
 'broker',
 'brokers',
 'build',
 'built',
 'business',
 'businesses',
 'busy',
 'capacity',
 'channels',
 'check',
 'child',
 'children',
 'clients',
 'commission',
 'company',
 'compliance',
 'comprehensive',
 'could',
 'culture',
 'customer',
 'daily',
 'day',
 'developing',
 'development',
 'do',
 'documentation',
 'does',
 'educating',
 'employed',
 'experience',
 'experienced',
 'families',
 'financial',
 'first',
 'flexibility',
 'flow',
 'for',
 'franchise',
 'free',
 'from',
 'future',
 'generation',
 'goals',
 'have',
 'help',
 'helping',
 'highly',
 'how',
 'i',
 'industry',
 'info',
 'integrity',
 'involve',
 'is',
 'it',
 'job',
 'join',
 'know',
 'leading',
 'leave',
 'lenders',
 'liaising',
 'lifestyle',
 'll',
 'loan',
 'lodgements',
 'looking',
 'love',
 'loves',
 'managing',
 'mater

In [26]:
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import gensim.downloader as api
from bokeh.plotting import figure, show, output_file
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet
wv = api.load('word2vec-google-news-300')

In [27]:
# pretrained model
wv = api.load('word2vec-google-news-300')

In [28]:
def interactive_tsne(text_labels, tsne_array):
    '''makes an interactive scatter plot with text labels for each point'''

    # Define a dataframe to be used by bokeh context
    bokeh_df = pd.DataFrame(tsne_array, text_labels, columns=['x','y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # interactive controls to include to the plot
    TOOLS="hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"
    output_file("plot.html")

    p = figure(tools=TOOLS, plot_width=700, plot_height=700)

    # define data source for the plot
    source = ColumnDataSource(bokeh_df)

    # scatter plot
    p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

    # text labels
    labels = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    p.add_layout(labels)

    # show plot inline
    output_notebook()
    show(p)

In [29]:
# stopwords = stopwords.words('english')
# vocab = test_corpus_words
# input_vocab =  [word for word in vocab if word in wv.key_to_index.keys() and word not in stopwords]
vocab = test_corpus_words
input_vocab =  [word for word in vocab if word in wv.key_to_index.keys()]
X = wv[input_vocab]
# find tsne coords for 2 dimensions
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

print(input_vocab)

points = len(input_vocab)
interactive_tsne(list(input_vocab)[:points], X_tsne)



['achieve', 'activities', 'aggregator', 'all', 'also', 'am', 'appointments', 'as', 'at', 'aussie', 'australia', 'be', 'being', 'borrowing', 'broker', 'brokers', 'build', 'built', 'business', 'businesses', 'busy', 'capacity', 'channels', 'check', 'child', 'children', 'clients', 'commission', 'company', 'compliance', 'comprehensive', 'could', 'culture', 'customer', 'daily', 'day', 'developing', 'development', 'do', 'documentation', 'does', 'educating', 'employed', 'experience', 'experienced', 'families', 'financial', 'first', 'flexibility', 'flow', 'for', 'franchise', 'free', 'from', 'future', 'generation', 'goals', 'have', 'help', 'helping', 'highly', 'how', 'i', 'industry', 'info', 'integrity', 'involve', 'is', 'it', 'job', 'join', 'know', 'leading', 'leave', 'lenders', 'liaising', 'lifestyle', 'll', 'loan', 'lodgements', 'looking', 'love', 'loves', 'managing', 'maternity', 'meeting', 'melanie', 'mentoring', 'mobile', 'more', 'mortgage', 'mother', 'motivated', 'multiple', 'mum', 'my', 

# Purposely trained vectors

In [30]:
import gensim
import re
from gensim.corpora import Dictionary

In [31]:
def vectorizer(category):
    document = cate_desc_dict[category]
    doc_tokenized = gensim.utils.simple_preprocess(str(document), deacc=True)
    # print(doc_tokenized[:10])
    return doc_tokenized

In [32]:
doc_tokenized = vectorizer("Self Employment")

In [33]:
print(doc_tokenized[:10])

['join', 'team', 'you', 'll', 'love', 'within', 'company', 'australia', 'loves', 'at']


In [34]:
from gensim.models import Word2Vec

cores = 16
model = Word2Vec(min_count=1,
                     window=2,
                     vector_size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [35]:
from time import time

t = time()

model.build_vocab([doc_tokenized], progress_per=10)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.0 mins


In [36]:
t = time()

model.train(test_corpus_words, total_examples=model.corpus_count, epochs=1000, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.05 mins


In [37]:
model.wv.key_to_index.keys()

dict_keys(['to', 'with', 'and', 'of', 'for', 'mortgage', 'my', 'the', 'brokers', 'what', 'broker', 'being', 'aussie', 'other', 'business', 'loan', 'offers', 'ongoing', 'training', 'support', 'perfect', 'their', 'financial', 'meeting', 'day', 'help', 'involve', 'working', 'now', 'no', 'looking', 'build', 'future', 'two', 'lenders', 'love', 'employed', 'self', 'team', 'you', 'people', 'opportunity', 'on', 'we', 'new', 'they', 'supportive', 'at', 'your', 'own', 'culture', 'pride', 'vibrant', 'company', 'does', 'loves', 'australia', 'uncapped', 'within', 'is', 'wide', 'varied', 'typical', 'could', 'll', 'commission', 'mentoring', 'all', 'towards', 'transition', 'development', 'helping', 'experienced', 'comprehensive', 'panel', 'prospective', 'leading', 'technology', 'work', 'businesses', 'ourselves', 'channels', 'developing', 'such', 'as', 'industry', 'franchise', 'free', 'educating', 'year', 'program', 'video', 'borrowing', 'customer', 'job', 'supports', 'also', 'it', 'flexibility', 'life

In [38]:
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# vocab = ['company', 'for', 'client', 'is', 'we', 'object', 'seeking', 'automotive']
input_vocab =  [word for word in vocab if word in model.wv.key_to_index.keys() and word not in stop_words]
# input_vocab = model.wv.index_to_key
X = model.wv[input_vocab]
# find tsne coords for 2 dimensions
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

print(input_vocab)

points = len(input_vocab)
interactive_tsne(list(input_vocab)[:points], X_tsne)



['achieve', 'activities', 'aggregator', 'also', 'appointments', 'aussie', 'australia', 'borrowing', 'broker', 'brokers', 'build', 'built', 'business', 'businesses', 'busy', 'capacity', 'channels', 'check', 'child', 'children', 'clients', 'commission', 'company', 'compliance', 'comprehensive', 'could', 'culture', 'customer', 'daily', 'day', 'developing', 'development', 'documentation', 'educating', 'employed', 'experience', 'experienced', 'families', 'financial', 'first', 'flexibility', 'flow', 'franchise', 'free', 'future', 'generation', 'goals', 'help', 'helping', 'highly', 'industry', 'info', 'integrity', 'involve', 'job', 'join', 'know', 'leading', 'leave', 'lenders', 'liaising', 'lifestyle', 'loan', 'lodgements', 'looking', 'love', 'loves', 'managing', 'maternity', 'meeting', 'melanie', 'mentoring', 'mobile', 'mortgage', 'mother', 'motivated', 'multiple', 'mum', 'necessary', 'networks', 'new', 'offers', 'ongoing', 'opportunity', 'organising', 'overseeing', 'panel', 'partner', 'pass

# Data preprocessing

In [39]:
data    d

Unnamed: 0,job_description,job_type,category,job_description_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,The Opportunity The Client Solution Analyst p...
2,An innovative business development role for a...,Full Time,Trades & Services,An innovative business development role for a...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,About the role We are seeking an Automotive Wo...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Early starts and weekend shifts No experience...
...,...,...,...,...
2015,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,2 positions available Based in Kelmscott and ...
2016,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,North Bondi Fish Located a few short steps fro...
2017,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Process Workers South Gippsland Rapidly growi...
2018,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,JPS Australia s leading Consultancy for the P...


Reindex the dataset

In [40]:
data = data.reset_index()
data = data.drop(['index'], axis = 1)

### Caculate TFIDF score for each category and pick top 10 words for each job description

In [119]:
from heapq import nlargest
def sent_to_words(sentences):

    for sentence in sentences:

        # deacc=True removes punctuations

        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def convert(lst):

    return ([i for item in lst for i in item.split()])

In [120]:
def top_10_words(data, category):
    top_words = []
    data = data.loc[data['category'] == category]
    data = data.reset_index(drop=True)
    job_description_text = data.loc[:,'job_description_new']
    doc_tokenized = list(sent_to_words(job_description_text))
    dictionary = Dictionary()
    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
    BoW_corpus
    tfidf = TfidfModel(BoW_corpus, smartirs='ntc')

    for corpus_idx in range(len(tfidf[BoW_corpus])):

        # chose 10 smallest TFIDF values and remove them
        maximun_tfidf = tfidf[BoW_corpus[corpus_idx]]
        maximun_tfidf.sort(key=lambda x: x[1], reverse=True)

    for ele in maximun_tfidf:
        cur_row = dictionary[ele[0]].strip()
        top_words.append(cur_row)

    for i, job_descr in enumerate(job_description_text):
        arr = []
        for word in top_words:        
            word = word.lower()
            if word in job_descr:
                arr.append(word)
                if len(arr) == 10:
                    break
        listToStr = ' '.join([str(elem) for elem in arr])
        data.loc[[i],'top_10_words'] = str(listToStr)
    return data

In [121]:
category = 'Accounting'
data2 = top_10_words(data, category)

In [122]:
data2

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words,job_type_new
0,"Insolvency Intermediate or Senior Level Job, ...",Full Time,Accounting,Insolvency Intermediate or Senior Level Job C...,ward accounting look along firm control no cre...,Full Time
1,Sydney CBD office Great Opportunity for caree...,Full Time,Accounting,Sydney CBD office Great Opportunity for caree...,break put use health service control no per af...,Full Time
2,The Company We are currently working with a G...,Contract/Temp,Accounting,The Company We are currently working with a G...,they respected service no credit access per re...,Other
3,This progressive business is seeking a skille...,Full Time,Accounting,This progressive business is seeking a skille...,ward pride they accounting look along service ...,Full Time
4,The Country Fire Authority (CFA) is one of th...,Full Time,Accounting,The Country Fire Authority CFA is one of the ...,put hours member service erp per functions tak...,Full Time
...,...,...,...,...,...,...
103,The Business Our client is a growing Financia...,Full Time,Accounting,The Business Our client is a growing Financia...,no per offering grow send taxation return seek...,Full Time
104,The Organisation Our client is a high profile...,Full Time,Accounting,The Organisation Our client is a high profile...,every they member no per grow ideal relevant f...,Full Time
105,TNR have a fantastic opportunity for an exper...,Full Time,Accounting,TNR have a fantastic opportunity for an exper...,ward look interesting firm service no per taxa...,Full Time
106,Join well established financial advisory firm ...,Full Time,Accounting,Join well established financial advisory firm ...,companies they accounting establish use health...,Full Time


### Select top 10 ranked words for each job description

In [42]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [43]:
data

Unnamed: 0,job_description,job_type,category,job_description_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,The Opportunity The Client Solution Analyst p...
2,An innovative business development role for a...,Full Time,Trades & Services,An innovative business development role for a...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,About the role We are seeking an Automotive Wo...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Early starts and weekend shifts No experience...
...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,2 positions available Based in Kelmscott and ...
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,North Bondi Fish Located a few short steps fro...
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Process Workers South Gippsland Rapidly growi...
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,JPS Australia s leading Consultancy for the P...


In [44]:
job_description_text = data.loc[:,'job_description_new']

In [45]:
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) 

In [46]:
from nltk.corpus import stopwords
doc_tokenized = list(sent_to_words(job_description_text))
dictionary = Dictionary()
stop_words = stopwords.words('english')
# stop_words = [word for word in doc_tokenized if word not in stopwords]
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
BoW_corpus

[[(0, 2),
  (1, 3),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 12),
  (7, 3),
  (8, 3),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 3),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 2),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 5),
  (48, 8),
  (49, 5),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 7),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 4),
  (62, 1),
  (63, 2),
  (64, 4),
  (65, 2),
  (66, 2),
  (67, 1),
  (68, 2),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 1),
  (79, 2),
  (80, 2),
  (81, 3),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 2),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1

In [47]:
len(BoW_corpus[1])

168

In [48]:
tfidf = TfidfModel(BoW_corpus, smartirs='ntc')

In [49]:
# data = data.reset_index()
# data = data.drop(['index'], axis = 1)

In [50]:
from heapq import nlargest
words = []
for corpus_idx in range(len(tfidf[BoW_corpus])):
    # chose 10 smallest TFIDF values and remove them
    maximun_tfidf = nlargest(10,tfidf[BoW_corpus[corpus_idx]])
    maximun_tfidf.sort(key=lambda x: x[1], reverse=True)
    top_10_words = []
    for ele in maximun_tfidf:
        cur_row = dictionary[ele[0]].strip()
        top_10_words.append(cur_row)
    w = ' '.join(top_10_words)
    words.append(w)
# print(words)
    # print(w)
for i in range(len(job_description_text)):
    data.loc[[i],'top_10_words'] = str(words[i])


# for corpus_idx in range(len(noun_data)):
#     cur_job_match_list, cur_experience_match_list = [], []

#     # get the 150 biggest tfidf value for each job category
#     maximun_tfidf = nlargest(150,tfidf[BoW_corpus[corpus_idx]])
#     maximun_tfidf.sort(key=lambda x: x[1], reverse=True)

#     # get top 150 noun phrase by using id
#     for ele in maximun_tfidf:
#         cur_row = dictionary[ele[0]].strip()

In [51]:
range(len(job_description_text))

range(0, 2000)

In [52]:
data

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...,whats willingness years www which what who you...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,The Opportunity The Client Solution Analyst p...,timely update vendors upload user within throu...
2,An innovative business development role for a...,Full Time,Trades & Services,An innovative business development role for a...,van xeros winning warehouses washing usage wat...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,About the role We are seeking an Automotive Wo...,unsolicited workshop stranger troubleshooting ...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Early starts and weekend shifts No experience...,apprenticeship teach run shifts necessary pote...
...,...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,2 positions available Based in Kelmscott and ...,wtp wanneroo suez neerabup mirrabooka mdl lexi...
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,North Bondi Fish Located a few short steps fro...,solotel moran steak solomon sock sheaf personi...
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Process Workers South Gippsland Rapidly growi...,viplus gbs gippsland xy toora sphsgsqj jznykz ...
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,JPS Australia s leading Consultancy for the P...,prepress needing jps workflow suiting stitchin...


### Splitting data for task 1

In [53]:
data

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...,whats willingness years www which what who you...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,The Opportunity The Client Solution Analyst p...,timely update vendors upload user within throu...
2,An innovative business development role for a...,Full Time,Trades & Services,An innovative business development role for a...,van xeros winning warehouses washing usage wat...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,About the role We are seeking an Automotive Wo...,unsolicited workshop stranger troubleshooting ...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Early starts and weekend shifts No experience...,apprenticeship teach run shifts necessary pote...
...,...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,2 positions available Based in Kelmscott and ...,wtp wanneroo suez neerabup mirrabooka mdl lexi...
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,North Bondi Fish Located a few short steps fro...,solotel moran steak solomon sock sheaf personi...
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Process Workers South Gippsland Rapidly growi...,viplus gbs gippsland xy toora sphsgsqj jznykz ...
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,JPS Australia s leading Consultancy for the P...,prepress needing jps workflow suiting stitchin...


In [54]:
data["job_type_new"] = np.where(data["job_type"] == "Full Time", "Full Time", "Other")

In [55]:
data

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words,job_type_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...,whats willingness years www which what who you...,Full Time
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,The Opportunity The Client Solution Analyst p...,timely update vendors upload user within throu...,Full Time
2,An innovative business development role for a...,Full Time,Trades & Services,An innovative business development role for a...,van xeros winning warehouses washing usage wat...,Full Time
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,About the role We are seeking an Automotive Wo...,unsolicited workshop stranger troubleshooting ...,Full Time
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Early starts and weekend shifts No experience...,apprenticeship teach run shifts necessary pote...,Full Time
...,...,...,...,...,...,...
1995,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,2 positions available Based in Kelmscott and ...,wtp wanneroo suez neerabup mirrabooka mdl lexi...,Full Time
1996,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,North Bondi Fish Located a few short steps fro...,solotel moran steak solomon sock sheaf personi...,Full Time
1997,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Process Workers South Gippsland Rapidly growi...,viplus gbs gippsland xy toora sphsgsqj jznykz ...,Other
1998,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,JPS Australia s leading Consultancy for the P...,prepress needing jps workflow suiting stitchin...,Full Time


In [56]:
args = Namespace(
    raw_dataset_csv="seek_australia_2000.csv",
    train_proportion=0.7,
    val_proportion=0.10,
    test_proportion=0.20,
    output_munged_csv="ass02_task01.csv",
    seed=1337
)

In [57]:
# Splitting train by job_type
# Create dict
by_type = collections.defaultdict(list)
for _, row in data.iterrows():
    by_type[row.job_type_new].append(row.to_dict())

In [58]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_type.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion*n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [59]:
split_task1 = pd.DataFrame(final_list)
split_task1.split.value_counts()
# len(split_task1) = 2000

train    1399
test      402
val       199
Name: split, dtype: int64

In [60]:
set(split_task1.split)

{'test', 'train', 'val'}

In [61]:
split_task1 = split_task1[["category", "job_description_new", "top_10_words", "job_type_new", "split"]]

In [62]:
split_task1[pd.isnull(split_task1.job_description_new)]

Unnamed: 0,category,job_description_new,top_10_words,job_type_new,split


In [63]:
split_task1

Unnamed: 0,category,job_description_new,top_10_words,job_type_new,split
0,Construction,About Laing O Rourke Laing O Rourke is a 6 bi...,rourke laing licensing composition carer brw b...,Full Time,train
1,Call Centre & Customer Service,Probate Recovery Specialists Phillips Cohen A...,cohen probate southbank repayment recovering p...,Full Time,train
2,Sales,Global FMCG giant premium brands and multiple...,dds senor appealing centrally amy executed boa...,Full Time,train
3,Hospitality & Tourism,ENDLESS PROGRESSION IN THE EVENTS WORLD MANAG...,pax spared soundings sergeant plush legends fa...,Full Time,train
4,Administration & Office Support,This rapidly growing Company provides excepti...,achievable resolutions technically prices quot...,Full Time,train
...,...,...,...,...,...
1995,Hospitality & Tourism,A business in Sutherland is seeking a casual ...,cleaner sutherland weekends least casual shoul...,Other,test
1996,Information & Communication Technology,We are seeking an experienced Data Migration ...,migration db undertsand retrieve williams scri...,Other,test
1997,Retail & Consumer Products,Steve s Liquor is a growing liquor retailer w...,liquor steve wholesaler rsa tasmania hrs poten...,Other,test
1998,Healthcare & Medical,Enthusiastic and committed Disabiltiy Support...,jodie rans disabiltiy confortable austism week...,Other,test


In [64]:
pd.Series(dict(FreqDist(split_task1.job_type_new)))

Full Time    1383
Other         617
dtype: int64

In [65]:
split_task1.dtypes

category               object
job_description_new    object
top_10_words           object
job_type_new           object
split                  object
dtype: object

In [66]:
split_task1.to_csv(args.output_munged_csv, index=False)

### Splitting data for task 2

In [67]:
# Splitting train by category
# Create dict
by_cate = collections.defaultdict(list)
for _, row in data.iterrows():
    by_cate[row.category].append(row.to_dict())

In [68]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_cate.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion*n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [69]:
split_task2 = pd.DataFrame(final_list)
split_task2.split.value_counts()

train    1387
test      425
val       188
Name: split, dtype: int64

In [70]:
set(split_task1.split)

{'test', 'train', 'val'}

In [71]:
split_task2

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words,job_type_new,split
0,The Company We are currently working with a G...,Contract/Temp,Accounting,The Company We are currently working with a G...,negotiating roberthalf headcount overdue limit...,Other,train
1,About the Company My clientÂ is a global mark...,Full Time,Accounting,About the Company My client is a global marke...,renewal acquisitive incl customised domestical...,Full Time,train
2,About the business After two years in operati...,Part Time,Accounting,About the business After two years in operati...,cadmac riverina rigours prudence positivity li...,Other,train
3,The Firm This firm has been in businessÂ for ...,Full Time,Accounting,The Firm This firm has been in business for o...,similarly promegate mii pays impacting enquire...,Full Time,train
4,Project Finance Manager Â The Opportunity At...,Full Time,Accounting,Project Finance Manager The Opportunity At Pw...,vouch unrewarded surround subcontracting solve...,Full Time,train
...,...,...,...,...,...,...,...
1995,LaseMedics are going through a period of expa...,Part Time,Trades & Services,LaseMedics are going through a period of expa...,laser hair sculpting lasemedics vivian gentlel...,Other,test
1996,Hays Trades & Labour require carpenters for a...,Contract/Temp,Trades & Services,Hays Trades Labour require carpenters for an ...,geelong carpenters hayes surf carpenter oli ol...,Other,test
1997,About Us GJK Facility Services is one of the ...,Full Time,Trades & Services,About Us GJK Facility Services is one of the ...,gjk bids yet visitation comprehension tenderin...,Full Time,test
1998,"Well presented, professional,Â highly motivat...",Full Time,Trades & Services,Well presented professional highly motivated ...,franchise rivers lismore dealership dealership...,Full Time,test


### Dataset for Task 1 Feed Forward Neural Network model

In [72]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        return cls(task1_df, OneHotVectorizer.from_dataframe(task1_df))
    def get_vectorizer(self):
        return self._vectorizer
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    def __len__(self):
        return self._target_size
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words)
        job_type_index = \
        self._vectorizer.job_type_vocab.lookup_token(row.job_type_new)
        return {'x_data': top_10_vector,
                'y_target': job_type_index}
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

### One-Hot Encoding Vectorizer

In [73]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_vocab, job_type_vocab):
        self.top_10_vocab = top_10_vocab
        self.job_type_vocab = job_type_vocab
    def vectorize(self, nar):
        one_hot = np.zeros(len(self.top_10_vocab), dtype=np.float32)
        for token in nar.split(" "):
            if token not in string.punctuation:
                one_hot[self.top_10_vocab.lookup_token(token)] = 1
        return one_hot
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        top_10_vocab = Vocabulary(add_unk=True)
        job_type_vocab = Vocabulary(add_unk=False)
        # Add job_types
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_vocab.add_token(job_type)
        # Add top words if count > provided count
        word_counts = Counter()
        for nar in task1_df.top_10_words:
            for word in nar.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                top_10_vocab.add_token(word)
        return cls(top_10_vocab, job_type_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_vocab = Vocabulary.from_serializable(contents['top_10_vocab'])
        job_type_vocab = Vocabulary.from_serializable(contents['job_type_vocab'])
        return cls(top_10_vocab=top_10_vocab, job_type_vocab=job_type_vocab)
    def to_serializable(self):
        return {'top_10_vocab': self.top_10_vocab.to_serializable(),
                'job_type_vocab': self.job_type_vocab.to_serializable()}

### Dataset for CNN model

In [None]:
# Dataset for CNN by using the top 10 words
class CNNDataset(Dataset):
    def __init__(self, task1cnn_df, vectorizer):
        self.task1cnn_df = task1cnn_df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, task1cnn_df.top_10_words)) + 2
        

        self.train_df = self.task1cnn_df[self.task1cnn_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1cnn_df[self.task1cnn_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1cnn_df[self.task1cnn_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = task1cnn_df.job_type_new.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.job_type_new_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1CNN_csv):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        train_task1cnn_df = task1cnn_df[task1cnn_df.split=='train']
        return cls(task1cnn_df, TopPretrainedVectorizer.from_dataframe(train_task1cnn_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, task1CNN_csv, vectorizer_filepath):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task1CNN_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        top_10_words_vector = \
            self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)

        job_type_new_index = \
            self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)

        return {'x_data': top_10_words_vector,
                'y_target': job_type_new_index}

    def get_num_batches(self, batch_size):

        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

### Pre-trained Vectorizer

In [None]:
# top 10 word pretrained Vectorizer
class TopPretrainedVectorizer(object): 
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        indices = [self.top_10_words_vocab.begin_seq_index]
        indices.extend(self.top_10_words_vocab.lookup_token(token) 
                       for token in top_10_words.split(" "))
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.top_10_words_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, task1cnn_df, cutoff=25):
        job_type_new_vocab = CNNVocabulary()        
        for job_type in sorted(set(task1cnn_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)

        word_counts = Counter()
        for top_10 in task1cnn_df.top_10_words:
            for token in top_10.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        top_10_words_vocab = CNNSequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                top_10_words_vocab.add_token(word)
        
        return cls(top_10_words_vocab, job_type_new_vocab)

    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = \
            CNNSequenceVocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab =  \
            CNNVocabulary.from_serializable(contents['job_type_new_vocab'])

        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)

    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

# Task 1 Binary Document Classification

## One-Hot Encoding Vectorizers and Feed-Forward Neural Network Model

In [74]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        return cls(task1_df, OneHotVectorizer.from_dataframe(task1_df))
    def get_vectorizer(self):
        return self._vectorizer
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    def __len__(self):
        return self._target_size
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words)
        job_type_index = \
        self._vectorizer.job_type_vocab.lookup_token(row.job_type_new)
        return {'x_data': top_10_vector,
                'y_target': job_type_index}
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

In [75]:
# Vocabulary
class Vocabulary(object):
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                                for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = 1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    def __len__(self):
        return len(self._token_to_idx)

In [76]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_vocab, job_type_vocab):
        self.top_10_vocab = top_10_vocab
        self.job_type_vocab = job_type_vocab
    def vectorize(self, nar):
        one_hot = np.zeros(len(self.top_10_vocab), dtype=np.float32)
        for token in nar.split(" "):
            if token not in string.punctuation:
                one_hot[self.top_10_vocab.lookup_token(token)] = 1
        return one_hot
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        top_10_vocab = Vocabulary(add_unk=True)
        job_type_vocab = Vocabulary(add_unk=False)
        # Add job_types
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_vocab.add_token(job_type)
        # Add top words if count > provided count
        word_counts = Counter()
        for nar in task1_df.top_10_words:
            for word in nar.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                top_10_vocab.add_token(word)
        return cls(top_10_vocab, job_type_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_vocab = Vocabulary.from_serializable(contents['top_10_vocab'])
        job_type_vocab = Vocabulary.from_serializable(contents['job_type_vocab'])
        return cls(top_10_vocab=top_10_vocab, job_type_vocab=job_type_vocab)
    def to_serializable(self):
        return {'top_10_vocab': self.top_10_vocab.to_serializable(),
                'job_type_vocab': self.job_type_vocab.to_serializable()}

In [77]:
# Dataloader
from torch.utils.data import DataLoader
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, 
                            shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [78]:
# Perceptron Classifier
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardClassifier(nn.Module):
    def __init__(self, num_features):
        super(FeedForwardClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
    
    def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

In [79]:
# Initial Setup
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=15,
    model_state_file='model.pth',
    task1_csv="ass02_task01.csv",
    save_dir='model_storage/task01/',
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    cuda=True,
    device='cuda',
)

In [80]:
# Training preparation
import torch
import torch.optim as optim
import pandas as pd

def make_train_state(args):
    return {'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': 1,
            'test_acc': 1}

train_state = make_train_state(args)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# dataset and vectorizer
dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv)
vectorizer = dataset.get_vectorizer()
# model
classifier = FeedForwardClassifier(num_features=len(vectorizer.top_10_vocab))
classifier = classifier.to(args.device)
# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [81]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [82]:
# Training
import numpy as np
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    # Iterate over training dataset
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is 5 steps:
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index + 1)
        # step 4. use loss to produce gradients
        loss.backward()
        # step 5. use optimizer to take gradient step
        optimizer.step()
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # step 1. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        # step 2. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

In [83]:
# Evaluation
dataset.set_split('test')
batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [84]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))
# Test loss: 0.611
# Test Accuracy: 69.01

Test loss: 0.610
Test Accuracy: 68.75


In [85]:
# Inference and Classifying new data points
import re
# Preprocess the reviews
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
def predict_job_type(top_10_word, classifier, vectorizer, decision_threshold=0.5):
    top_10_word = preprocess_text(top_10_word)
    vectorized_top_10_word= torch.tensor(vectorizer.vectorize(top_10_word)).to(args.device)
    result = classifier(vectorized_top_10_word.view(1, -1))
    probability_value = torch.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0
    return vectorizer.job_type_vocab.lookup_index(index)

In [86]:
# test_top_10_word can be changed to other string to obtain the prediction.
test_top_10_word = split_task1.loc[5, 'top_10_words']
split_task1.loc[5, 'job_type_new']

'Full Time'

In [87]:
test_top_10_word = "we are looking for someone wants a full time job"

In [88]:
prediction = predict_job_type(test_top_10_word, classifier, vectorizer)
print("{} > {}".format(test_top_10_word, prediction))

we are looking for someone wants a full time job > Full Time


In [89]:
# Inspecting model weights
# Sort weights
fc1_weights = classifier.fc1.weight.to('cpu').detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

In [90]:
# Top 20 full time job type words
print("Influential words in full time job type words:")
print("")
for i in range(10):
    print(vectorizer.top_10_vocab.lookup_index(indices[i]))

Influential words in full time job type words:

afternoon
white
cutting
analysts
affordable
usually
considerable
repayments
regulations
treating


In [91]:
# Top 20 other job type words
print("Influential words in other job type:")
print("")
indices.reverse()
for i in range(10):
    print(vectorizer.top_10_vocab.lookup_index(indices[i])) 

Influential words in other job type:

reading
witness
warrants
bigger
refer
independence
exercising
surveying
voluntary
<UNK>


### Pre-trained Vetorizer and CNN Conv1d Model

In [92]:
# Dataset for CNN by using the top 10 words
class CNNDataset(Dataset):
    def __init__(self, task1cnn_df, vectorizer):
        self.task1cnn_df = task1cnn_df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, task1cnn_df.top_10_words)) + 2
        

        self.train_df = self.task1cnn_df[self.task1cnn_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1cnn_df[self.task1cnn_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1cnn_df[self.task1cnn_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = task1cnn_df.job_type_new.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.job_type_new_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1CNN_csv):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        train_task1cnn_df = task1cnn_df[task1cnn_df.split=='train']
        return cls(task1cnn_df, TopPretrainedVectorizer.from_dataframe(train_task1cnn_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, task1CNN_csv, vectorizer_filepath):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task1CNN_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        top_10_words_vector = \
            self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)

        job_type_new_index = \
            self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)

        return {'x_data': top_10_words_vector,
                'y_target': job_type_new_index}

    def get_num_batches(self, batch_size):

        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [93]:
# Vocabulary for CNN
class CNNVocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the VocabularyCNN" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<CNNVocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [94]:
# SequenceVocabulary for CNN
class CNNSequenceVocabulary(CNNVocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(CNNSequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(CNNSequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [95]:
# top 10 word pretrained Vectorizer
class TopPretrainedVectorizer(object): 
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        indices = [self.top_10_words_vocab.begin_seq_index]
        indices.extend(self.top_10_words_vocab.lookup_token(token) 
                       for token in top_10_words.split(" "))
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.top_10_words_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, task1cnn_df, cutoff=25):
        job_type_new_vocab = CNNVocabulary()        
        for job_type in sorted(set(task1cnn_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)

        word_counts = Counter()
        for top_10 in task1cnn_df.top_10_words:
            for token in top_10.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        top_10_words_vocab = CNNSequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                top_10_words_vocab.add_token(word)
        
        return cls(top_10_words_vocab, job_type_new_vocab)

    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = \
            CNNSequenceVocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab =  \
            CNNVocabulary.from_serializable(contents['job_type_new_vocab'])

        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)

    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

In [103]:
# CNN Classifier
class CNNClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels, 
                 hidden_dim, num_classes, dropout_p, 
                 pretrained_embeddings=None, padding_idx=0):
        super(CNNClassifier, self).__init__()

        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, 
                   out_channels=num_channels, kernel_size=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in, apply_softmax=False):
        
        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)

        features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)
        
        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [104]:
# helper function
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [105]:
# General utilities
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def load_glove_from_file(glove_filepath):

    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings

In [106]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/cnn",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    use_glove=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn/vectorizer.json
	model_storage/cnn/model.pth
Using CUDA: False


In [107]:
# Initializations
args.use_glove = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.top_10_words_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using pre-trained embeddings


In [108]:
# Number of Classes
len(vectorizer.job_type_new_vocab)

2

In [109]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [110]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [111]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.6933475931485494;
Test Accuracy: 64.0625


In [114]:
def predict_job_type_new(top_10_words, classifier, vectorizer, max_length):
    """Predict a News category for a new title
    
    Args:
        title (str): a raw title string
        classifier (NewsClassifier): an instance of the trained classifier
        vectorizer (NewsVectorizer): the corresponding vectorizer
        max_length (int): the max sequence length
            Note: CNNs are sensitive to the input data tensor size. 
                  This ensures to keep it the same size as the training data
    """
    top_10_words = preprocess_text(top_10_words)
    vectorized_top_10_words = \
        torch.tensor(vectorizer.vectorize(top_10_words, vector_length=max_length))
    result = classifier(vectorized_top_10_words.unsqueeze(0), apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    predicted_job_type_new = vectorizer.job_type_new_vocab.lookup_index(indices.item())

    return {'job_type': predicted_job_type_new, 
            'probability': probability_values.item()}

In [116]:
def get_samples():
    samples = {}
    for cat in dataset.val_df.job_type_new.unique():
        samples[cat] = dataset.val_df.top_10_words[dataset.val_df.job_type_new==cat].tolist()[:5]
    return samples

val_samples = get_samples()

In [118]:
#title = input("Enter a news title to classify: ")
classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_job_type_new(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        print("Prediction: {} (p={:0.2f})".format(prediction['job_type'],
                                                  prediction['probability']))
        print("\t + Sample: {}".format(sample))
    print("-"*30 + "\n")

True Category: Full Time
Prediction: Full Time (p=0.51)
	 + Sample: calculation foreign streamlining jane cullen appeal arise pricing taxation oriented
Prediction: Full Time (p=0.50)
	 + Sample: shine wips whip tracker spotlight rush testimonials soar sees resourceful
Prediction: Full Time (p=0.51)
	 + Sample: transcript legalpersonnel lack jgrasso chained admission julie charities bill genuinely
Prediction: Full Time (p=0.51)
	 + Sample: establishments consistant captive canteen camps attentive diplomatic deliverable served majority
Prediction: Full Time (p=0.50)
	 + Sample: tipper economix diligent ot combination afternoon overtime induction white report
------------------------------

True Category: Other
Prediction: Full Time (p=0.50)
	 + Sample: sunglass hut gabbana dolce chanel prada oakley ray push rest
Prediction: Full Time (p=0.51)
	 + Sample: lloyd hall blockers removing sponsors towers tackle etl streams consultation
Prediction: Full Time (p=0.50)
	 + Sample: tidying wynyard