# NLP Project 2

**Author: Jiaru Chen & Daniel Santosa**<br>
**Student Number: 22850907 & 19315466**

### Libraries

In [48]:
#!pip install pandas
#!pip install seaborn
#!pip install wordcloud
import pandas as pd
import numpy as np
import random
import json

# plotting
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
from nltk.probability import FreqDist # frequency 
from nltk.tokenize import word_tokenize # tokenize
from nltk.tag import pos_tag # POS tag
from nltk.stem import PorterStemmer # stemming
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # to remove stop words

# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.matcher import Matcher # linguistic pattern
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter # count frequent noun phrases
from sklearn.model_selection import train_test_split  

# onehot encoding
from sklearn.preprocessing import OneHotEncoder
import collections
from argparse import Namespace
import os
import string
import sklearn

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from torch.nn.utils import rnn as rnn_utils

from heapq import nlargest
import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel 
from gensim.corpora import Dictionary


# 1. Data Exploration

## Keep only relevant fields

As we have two tasks, one is binary classification (full time or other) and the other one is multiclassifiacation (job categories), we only keep ***job_type***, ***category***, ***job_description*** from the original dataset.

In order to process the dataset faster, we subseted a 2000 records/rows from the original dataset in this whole project.

In [49]:
dat = pd.read_csv('seek_australia.csv')

In [50]:
dat.head()

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
0,Retail & Consumer Products,Sydney,Frontline Executive Retail Sydney,AU,seek,Have you had 10 years experience in fresh pro...,Store Manager - Fresh Produce,Full Time,2018-04-15T23:13:45Z,$100k Base + Super + Benefits,North Shore & Northern Beaches,https://www.seek.com.au/job/35989382
1,Government & Defence,Brisbane,Powerlink,AU,seek,The Opportunity: The Client Solution Analyst ...,Client Solution Analyst,Full Time,2018-04-15T23:04:40Z,Excellent remuneration packages,Northern Suburbs,https://www.seek.com.au/job/35989272
2,Trades & Services,Sydney,Richard Jay Laundry,AU,seek,An innovative business development role for a...,Service Technician / Installer - NSW,Full Time,2018-04-15T23:04:31Z,,Parramatta & Western Suburbs,https://www.seek.com.au/job/35989270
3,Trades & Services,Melbourne,Adaptalift Hyster,AU,seek,About the role: We are seeking an Automotive W...,Workshop Technician I Material Handling Equipment,Full Time,2018-04-16T03:15:17Z,,Bayside & South Eastern Suburbs,https://www.seek.com.au/job/35993203
4,Trades & Services,Adelaide,Bakers Delight G&M,AU,seek,Â Early starts and weekend shifts. No experie...,APPRENTICESHIP JUNIOR BAKER,Full Time,2018-04-16T01:26:50Z,,,https://www.seek.com.au/job/35991578


In [51]:
dat.isnull().sum()

category               0
city                   0
company_name           0
geo                    0
job_board              0
job_description      345
job_title              0
job_type               0
post_date              0
salary_offered     21048
state              10820
url                    0
dtype: int64

In [52]:
dat = dat[dat['job_description'].notna()]

In [53]:
dat.isnull().sum()

category               0
city                   0
company_name           0
geo                    0
job_board              0
job_description        0
job_title              0
job_type               0
post_date              0
salary_offered     20811
state              10718
url                    0
dtype: int64

In [54]:
dat

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
0,Retail & Consumer Products,Sydney,Frontline Executive Retail Sydney,AU,seek,Have you had 10 years experience in fresh pro...,Store Manager - Fresh Produce,Full Time,2018-04-15T23:13:45Z,$100k Base + Super + Benefits,North Shore & Northern Beaches,https://www.seek.com.au/job/35989382
1,Government & Defence,Brisbane,Powerlink,AU,seek,The Opportunity: The Client Solution Analyst ...,Client Solution Analyst,Full Time,2018-04-15T23:04:40Z,Excellent remuneration packages,Northern Suburbs,https://www.seek.com.au/job/35989272
2,Trades & Services,Sydney,Richard Jay Laundry,AU,seek,An innovative business development role for a...,Service Technician / Installer - NSW,Full Time,2018-04-15T23:04:31Z,,Parramatta & Western Suburbs,https://www.seek.com.au/job/35989270
3,Trades & Services,Melbourne,Adaptalift Hyster,AU,seek,About the role: We are seeking an Automotive W...,Workshop Technician I Material Handling Equipment,Full Time,2018-04-16T03:15:17Z,,Bayside & South Eastern Suburbs,https://www.seek.com.au/job/35993203
4,Trades & Services,Adelaide,Bakers Delight G&M,AU,seek,Â Early starts and weekend shifts. No experie...,APPRENTICESHIP JUNIOR BAKER,Full Time,2018-04-16T01:26:50Z,,,https://www.seek.com.au/job/35991578
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,Hospitality & Tourism,Sydney,Radisson Blu Plaza Hotel Sydney,AU,seek,Hotel snapshot The Radisson Blu Plaza Sydney ...,Bar Supervisor,Full Time,2018-04-11T04:20:40Z,"Annualised salary, uniform + Super","CBD, Inner West & Eastern Suburbs",https://www.seek.com.au/job/35958503
29996,CEO & General Management,ACT,Airservices Australia,AU,seek,The Organisation Airservices is a government ...,Deputy Board Secretary,Full Time,2018-04-11T04:00:49Z,Salary package to be negotiated,,https://www.seek.com.au/job/35958100
29997,Accounting,Melbourne,The Hassett Group,AU,seek,ABOUT THE COMPANY AND ROLE Our client is one o...,Corporate Accountant,Full Time,2018-04-11T02:45:37Z,$110k Package On Offer!,CBD & Inner Suburbs,https://www.seek.com.au/job/35956991
29998,Government & Defence,ACT,SOS Recruitment,AU,seek,Long term contract for 12 months with possibl...,APS 6 & EL1 Account Managers,Contract/Temp,2018-04-11T04:55:16Z,,,https://www.seek.com.au/job/35959184


In [55]:
dat2 = dat.loc[:2019]

In [56]:
dat2.shape

(2000, 12)

In [57]:
data = dat2[['job_description', 'job_type', 'category']]

In [58]:
data.head()

Unnamed: 0,job_description,job_type,category
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence
2,An innovative business development role for a...,Full Time,Trades & Services
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services


In [59]:
data.isnull().sum()

job_description    0
job_type           0
category           0
dtype: int64

Since the column ***job_description*** contains some meaningless notation, we have performed data cleansing in this part.

In [60]:
# data['job_description'].fillna("", inplace=True)
# we need to drop na otherwise the 
# data = data.dropna()
data['job_description_new'] = data['job_description'].map(lambda x: re.sub(r'\W+', ' ', x))
def remove_nonEglish(data):
    return re.sub("[^\x00-\x7F]+", "", data)

def remove_multiSpace(data):
    return re.sub(' +', ' ', data)

data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_nonEglish(x))
data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_multiSpace(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_description_new'] = data['job_description'].map(lambda x: re.sub(r'\W+', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_description_new'] = data['job_description_new'].apply(lambda x: remove_nonEglish(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['job_descript

In [61]:
data.to_csv("seek_australia_2000.csv")

In [62]:
data['job_description_new'][0]

' Have you had 10 years experience in fresh produce that wants to manage their own store for a family owned Australian company that is passionate about food We are looking for Must have 10 years in the fresh food business and have the passion for the role Current 2IC looking to progress with training into Store manager role Excellent customer service and communication skills Be hands on and have a can do attitude Be into the fresh food business and have the passion for the role Hardworking ambitious and competitive people who are passionate about good food Are able to maximise the financial return in their market ensuring it meets sales margin and wages budgets Have exceptional merchandising capabilities and customer service skills helping us to create unique shopping experiences for our customers Have a wealth of knowledge of fresh food retailing and a willingness to share this knowledge Can lead manage and motivate a teams Must be able to work weekend and use to early starts which is

Below code is to make each category as a corpus and all job descriptions in that category as a document. And we can see there are 30 unique job categories in this dataset.

In [63]:
unique_category = data['category'].unique().tolist()

cate_desc_dict = {}

for cate in unique_category:
    cate = str(cate)
    desc_documents = data.loc[data['category'] == cate, 'job_description_new']
    # print(desc_documents)
    corpus = []
    for desc in desc_documents:
        # print(desc)
        corpus.append(str(desc))

    cate_desc_dict[cate] = corpus

In [64]:
len(cate_desc_dict)

30

## Use gensim to train word embeddings and TSNE to visulise some examples

#### Training Domain Specific Embeddings And Saving Embeddings As Model and TXT File

In [66]:
def vectorizer(category):
    document = cate_desc_dict[category]
    doc_tokenized = gensim.utils.simple_preprocess(str(document), deacc=True)
    # print(doc_tokenized[:10])
    return doc_tokenized

In [67]:
categories = data['category'].unique()

doc_tokenized = []
for category in categories:
    tokens = vectorizer(category)
    doc_tokenized.append(tokens)

# converting tokens to a list form
doc_tokenized2 = []
for x in doc_tokenized:
    doc_tokenized2 += x if type(x) == list else [x]

doc_tokenized = doc_tokenized2


In [68]:
from gensim.models import Word2Vec

cores = 16
model = Word2Vec(min_count=1,
                     window=2,
                     vector_size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [69]:
from time import time

t = time()

model.build_vocab([doc_tokenized], progress_per=10)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.01 mins


In [70]:
# t = time()

# model.train(doc_tokenized, total_examples=model.corpus_count, epochs=1000, report_delay=1)

# print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [71]:
# Save the domain specific trained embeddings in txt format
# model.wv.save_word2vec_format('domain_embeddings.txt')

# Save the domain specific trained embeddings in as model
# model.save('domain_embeddings')


#### Visulise the Pre-Trained Word2Vec Embeddings and Domain Specific Embeddings

In [72]:
import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)

In [73]:
texts = cate_desc_dict["Self Employment"]
job_desc_token = []
for text in texts:
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        # ner_tagged_words = nltk.ne_chunk(nltk.pos_tag(words))
        # job_descip_token = job_descip_token + [ner_tagged_words]
        job_desc_token.extend(words)

In [74]:
len(job_desc_token)

279

In [75]:
len(set(job_desc_token))

185

In [76]:
cate_desc_dict["Self Employment"]

['Join a team you ll love within a company Australia loves At Aussie we pride ourselves on educating new to industry brokers developing the businesses of experienced Mortgage Brokers and helping people transition to a self employed opportunity they love What Aussie offers you Ongoing training development and support Comprehensive panel of lenders Leading technology Opportunity to work towards other Aussie channels such as franchise Free two year mentoring program for all new brokers Uncapped commission to build your own future Supportive vibrant team culture What does being a Mortgage Broker involve A broker s day is wide and varied A typical day could involve meeting with a prospective customer to review their financial situation and borrowing capacity liaising with lenders to track the progress of a loan managing the flow of documentation for multiple loan lodgements meeting with a referral partner to build up business generation networks overseeing the integrity of compliance proces

In [77]:
def read_corpus(category="Self Employment"):
    sents = []
    for text in cate_desc_dict[category]:
        for sentence in nltk.sent_tokenize(text):
            sent = [START_TOKEN] + [w.lower() for w in nltk.word_tokenize(sentence)] + [END_TOKEN]
            sents.append(sent)
    # return [[[START_TOKEN] + [w.lower() for w in nltk.word_tokenize(sentence)] + [END_TOKEN] for sentence in nltk.sent_tokenize(text)] for text in cate_desc_dict[category]]
    return sents

data_corpus = read_corpus()
pprint.pprint(data_corpus[:3], compact=True, width=100)

[['<START>', 'join', 'a', 'team', 'you', 'll', 'love', 'within', 'a', 'company', 'australia',
  'loves', 'at', 'aussie', 'we', 'pride', 'ourselves', 'on', 'educating', 'new', 'to', 'industry',
  'brokers', 'developing', 'the', 'businesses', 'of', 'experienced', 'mortgage', 'brokers', 'and',
  'helping', 'people', 'transition', 'to', 'a', 'self', 'employed', 'opportunity', 'they', 'love',
  'what', 'aussie', 'offers', 'you', 'ongoing', 'training', 'development', 'and', 'support',
  'comprehensive', 'panel', 'of', 'lenders', 'leading', 'technology', 'opportunity', 'to', 'work',
  'towards', 'other', 'aussie', 'channels', 'such', 'as', 'franchise', 'free', 'two', 'year',
  'mentoring', 'program', 'for', 'all', 'new', 'brokers', 'uncapped', 'commission', 'to', 'build',
  'your', 'own', 'future', 'supportive', 'vibrant', 'team', 'culture', 'what', 'does', 'being', 'a',
  'mortgage', 'broker', 'involve', 'a', 'broker', 's', 'day', 'is', 'wide', 'and', 'varied', 'a',
  'typical', 'day', 'coul

In [78]:
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1
    
    # ------------------
    # Write your implementation here.
    corpus_words = sorted(list(set([y for x in corpus for y in x])))
#     corpus_words = [y for x in corpus for y in x] 
#     corpus_words = list(set(corpus_words)) # unique words 
#     corpus_words = sorted(corpus_words) # sorts
    num_corpus_words = len(corpus_words)
    # ------------------

    return corpus_words, num_corpus_words

test_corpus_words, num_corpus_words = distinct_words(data_corpus)

In [79]:
test_corpus_words

['<END>',
 '<START>',
 'a',
 'achieve',
 'activities',
 'aggregator',
 'all',
 'also',
 'am',
 'and',
 'appointments',
 'as',
 'at',
 'aussie',
 'australia',
 'be',
 'being',
 'borrowing',
 'broker',
 'brokers',
 'build',
 'built',
 'business',
 'businesses',
 'busy',
 'capacity',
 'channels',
 'check',
 'child',
 'children',
 'clients',
 'commission',
 'company',
 'compliance',
 'comprehensive',
 'could',
 'culture',
 'customer',
 'daily',
 'day',
 'developing',
 'development',
 'do',
 'documentation',
 'does',
 'educating',
 'employed',
 'experience',
 'experienced',
 'families',
 'financial',
 'first',
 'flexibility',
 'flow',
 'for',
 'franchise',
 'free',
 'from',
 'future',
 'generation',
 'goals',
 'have',
 'help',
 'helping',
 'highly',
 'how',
 'i',
 'industry',
 'info',
 'integrity',
 'involve',
 'is',
 'it',
 'job',
 'join',
 'know',
 'leading',
 'leave',
 'lenders',
 'liaising',
 'lifestyle',
 'll',
 'loan',
 'lodgements',
 'looking',
 'love',
 'loves',
 'managing',
 'mater

In [80]:
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import gensim.downloader as api
from bokeh.plotting import figure, show, output_file
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet


In [81]:
def interactive_tsne(text_labels, tsne_array):
    '''makes an interactive scatter plot with text labels for each point'''

    # Define a dataframe to be used by bokeh context
    bokeh_df = pd.DataFrame(tsne_array, text_labels, columns=['x','y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # interactive controls to include to the plot
    TOOLS="hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"
    output_file("plot.html")

    p = figure(tools=TOOLS, plot_width=700, plot_height=700)

    # define data source for the plot
    source = ColumnDataSource(bokeh_df)

    # scatter plot
    p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

    # text labels
    labels = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    p.add_layout(labels)

    # show plot inline
    output_notebook()
    show(p)

#### Pre-trained Word2Vec Embedding

In [82]:
# pretrained model
wv = api.load('word2vec-google-news-300')

In [83]:
# stopwords = stopwords.words('english')
# vocab = test_corpus_words
# input_vocab =  [word for word in vocab if word in wv.key_to_index.keys() and word not in stopwords]
vocab = test_corpus_words
input_vocab =  [word for word in vocab if word in wv.key_to_index.keys()]
X = wv[input_vocab]
# find tsne coords for 2 dimensions
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

print(input_vocab)

points = len(input_vocab)
interactive_tsne(list(input_vocab)[:points], X_tsne)



['achieve', 'activities', 'aggregator', 'all', 'also', 'am', 'appointments', 'as', 'at', 'aussie', 'australia', 'be', 'being', 'borrowing', 'broker', 'brokers', 'build', 'built', 'business', 'businesses', 'busy', 'capacity', 'channels', 'check', 'child', 'children', 'clients', 'commission', 'company', 'compliance', 'comprehensive', 'could', 'culture', 'customer', 'daily', 'day', 'developing', 'development', 'do', 'documentation', 'does', 'educating', 'employed', 'experience', 'experienced', 'families', 'financial', 'first', 'flexibility', 'flow', 'for', 'franchise', 'free', 'from', 'future', 'generation', 'goals', 'have', 'help', 'helping', 'highly', 'how', 'i', 'industry', 'info', 'integrity', 'involve', 'is', 'it', 'job', 'join', 'know', 'leading', 'leave', 'lenders', 'liaising', 'lifestyle', 'll', 'loan', 'lodgements', 'looking', 'love', 'loves', 'managing', 'maternity', 'meeting', 'melanie', 'mentoring', 'mobile', 'more', 'mortgage', 'mother', 'motivated', 'multiple', 'mum', 'my', 

#### Pre-trained Domain Specific Embedding

In [84]:
from gensim.models.word2vec import Word2Vec

# pretrained domain specific model
model = Word2Vec.load('domain_embeddings')

In [85]:
import nltk
from nltk.corpus import stopwords
from sklearn.manifold import TSNE

vocab = test_corpus_words
input_vocab =  [word for word in vocab if word in model.wv.index_to_key]
X = model.wv[input_vocab]
# find tsne coords for 2 dimensions
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

# print(input_vocab)

points = len(input_vocab)
interactive_tsne(list(input_vocab)[:points], X_tsne)



# 2. Data preprocessing

In [86]:
data

Unnamed: 0,job_description,job_type,category,job_description_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...
1,The Opportunity: The Client Solution Analyst ...,Full Time,Government & Defence,The Opportunity The Client Solution Analyst p...
2,An innovative business development role for a...,Full Time,Trades & Services,An innovative business development role for a...
3,About the role: We are seeking an Automotive W...,Full Time,Trades & Services,About the role We are seeking an Automotive Wo...
4,Â Early starts and weekend shifts. No experie...,Full Time,Trades & Services,Early starts and weekend shifts No experience...
...,...,...,...,...
2015,2 positions available Based in Kelmscott and ...,Full Time,Trades & Services,2 positions available Based in Kelmscott and ...
2016,North Bondi FishÂ Located a few short steps f...,Full Time,Hospitality & Tourism,North Bondi Fish Located a few short steps fro...
2017,Process Workers â€“ South Gippsland Rapidly g...,Casual/Vacation,"Manufacturing, Transport & Logistics",Process Workers South Gippsland Rapidly growi...
2018,"JPS, Australia's leading Consultancy for the ...",Full Time,Trades & Services,JPS Australia s leading Consultancy for the P...


Reindex the dataset

In [87]:
data = data.reset_index()
data = data.drop(['index'], axis = 1)

### Caculate highest TFIDF scores for each word in the whole category and pick the top 10 TFIDF words for each job description from the TFIDF of the category.

In [88]:
from heapq import nlargest
from gensim.models import TfidfModel 
from gensim.corpora import Dictionary

# convert to tokens
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

# convert a string into a list
def convert(lst):
    return ([i for item in lst for i in item.split()])

In [89]:
def top_10_words(data, category):
    top_words = []
    data = data.loc[data['category'] == category]
    data = data.reset_index(drop=True)
    job_description_text = data.loc[:,'job_description_new']
    doc_tokenized = list(sent_to_words(job_description_text))
    dictionary = Dictionary()
    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
    BoW_corpus
    tfidf = TfidfModel(BoW_corpus, smartirs='ntc') 
    maximun_tfidf = []

    for corpus_idx in range(len(tfidf[BoW_corpus])):
        # chose 10 smallest TFIDF values and remove them
        maximun_tfidf = tfidf[BoW_corpus[corpus_idx]]
        maximun_tfidf.sort(key=lambda x: x[1], reverse=True)

    for ele in maximun_tfidf:
        cur_row = dictionary[ele[0]].strip()
        top_words.append(cur_row)

    # find top 10 words in job description from tfidf of words in category and create new column of top 10 words in dataframe
    for i, job_descr in enumerate(job_description_text):
        arr = []
        for word in top_words:        
            word = word.lower()
            if word in job_descr:
                arr.append(word)
                if len(arr) == 10:
                    break
        listToStr = ' '.join([str(elem) for elem in arr])
        data.loc[[i],'top_10_words'] = str(listToStr)
    return data

In [90]:
def top_words_data(data, categories):
    appended_data = []
    for category in categories:
        df = top_10_words(data, category)
        appended_data.append(df)
    appended_data = pd.concat(appended_data)
    return appended_data


In [91]:
data2 = top_words_data(data, categories)


In [92]:
data2.reset_index(drop=True)

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...,looking helping willing what experiences excep...
1,About Us: Retail SafariÂ as part ofÂ CPM Aust...,Casual/Vacation,Retail & Consumer Products,About Us Retail Safari as part of CPM Austral...,ideal understand ll about customers industry w...
2,Work with a brand you love & wear! Generous e...,Full Time,Retail & Consumer Products,Work with a brand you love wear Generous empl...,play celebrate yourself clear ideal exceptiona...
3,About the Company Forever New is one of Austr...,Part Time,Retail & Consumer Products,About the Company Forever New is one of Austr...,reward celebrate ideal understand look ll prog...
4,Sunglass Hut is a global retailer with stores...,Casual/Vacation,Retail & Consumer Products,Sunglass Hut is a global retailer with stores...,reward always incentive ll national about bonu...
...,...,...,...,...,...
1995,Who is South West TAFE?Â Â Â South West TAFE...,Full Time,CEO & General Management,Who is South West TAFE South West TAFE is an e...,educational teaching education swtafe and lear...
1996,Mars has just been awarded #1 in the Best Pla...,Full Time,Science & Technology,Mars has just been awarded 1 in the Best Plac...,com full and across the have key leader manage...
1997,Â About Us: Mayne Pharmaâ€™s roots can be tr...,Full Time,Science & Technology,About Us Mayne Pharma s roots can be traced b...,com pharmaceutical and the apply company of co...
1998,Newly created roles for Strategic Statistical ...,Full Time,Science & Technology,Newly created roles for Strategic Statistical ...,com and account the have apply group responsib...


In [93]:
data = data2

### Splitting data for task 1

As I mentioned, we only predict job type is full time or other by using job description or top 10 words. Therefore, we need to convert other job type into a new category called 'other' for example: convert 'Casual/Vacation' job type into 'other'.

Moreover, since task 1 and task 2 aim to predict different targets, so we need to split data twice for diifferent objectives. The first split for task 1, we splited data based on ***job_type***.

In [95]:
data

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...,looking helping willing what experiences excep...
1,About Us: Retail SafariÂ as part ofÂ CPM Aust...,Casual/Vacation,Retail & Consumer Products,About Us Retail Safari as part of CPM Austral...,ideal understand ll about customers industry w...
2,Work with a brand you love & wear! Generous e...,Full Time,Retail & Consumer Products,Work with a brand you love wear Generous empl...,play celebrate yourself clear ideal exceptiona...
3,About the Company Forever New is one of Austr...,Part Time,Retail & Consumer Products,About the Company Forever New is one of Austr...,reward celebrate ideal understand look ll prog...
4,Sunglass Hut is a global retailer with stores...,Casual/Vacation,Retail & Consumer Products,Sunglass Hut is a global retailer with stores...,reward always incentive ll national about bonu...
...,...,...,...,...,...
5,Who is South West TAFE?Â Â Â South West TAFE...,Full Time,CEO & General Management,Who is South West TAFE South West TAFE is an e...,educational teaching education swtafe and lear...
0,Mars has just been awarded #1 in the Best Pla...,Full Time,Science & Technology,Mars has just been awarded 1 in the Best Plac...,com full and across the have key leader manage...
1,Â About Us: Mayne Pharmaâ€™s roots can be tr...,Full Time,Science & Technology,About Us Mayne Pharma s roots can be traced b...,com pharmaceutical and the apply company of co...
2,Newly created roles for Strategic Statistical ...,Full Time,Science & Technology,Newly created roles for Strategic Statistical ...,com and account the have apply group responsib...


In [96]:
data["job_type_new"] = np.where(data["job_type"] == "Full Time", "Full Time", "Other")

In [97]:
data

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words,job_type_new
0,Have you had 10 years experience in fresh pro...,Full Time,Retail & Consumer Products,Have you had 10 years experience in fresh pro...,looking helping willing what experiences excep...,Full Time
1,About Us: Retail SafariÂ as part ofÂ CPM Aust...,Casual/Vacation,Retail & Consumer Products,About Us Retail Safari as part of CPM Austral...,ideal understand ll about customers industry w...,Other
2,Work with a brand you love & wear! Generous e...,Full Time,Retail & Consumer Products,Work with a brand you love wear Generous empl...,play celebrate yourself clear ideal exceptiona...,Full Time
3,About the Company Forever New is one of Austr...,Part Time,Retail & Consumer Products,About the Company Forever New is one of Austr...,reward celebrate ideal understand look ll prog...,Other
4,Sunglass Hut is a global retailer with stores...,Casual/Vacation,Retail & Consumer Products,Sunglass Hut is a global retailer with stores...,reward always incentive ll national about bonu...,Other
...,...,...,...,...,...,...
5,Who is South West TAFE?Â Â Â South West TAFE...,Full Time,CEO & General Management,Who is South West TAFE South West TAFE is an e...,educational teaching education swtafe and lear...,Full Time
0,Mars has just been awarded #1 in the Best Pla...,Full Time,Science & Technology,Mars has just been awarded 1 in the Best Plac...,com full and across the have key leader manage...,Full Time
1,Â About Us: Mayne Pharmaâ€™s roots can be tr...,Full Time,Science & Technology,About Us Mayne Pharma s roots can be traced b...,com pharmaceutical and the apply company of co...,Full Time
2,Newly created roles for Strategic Statistical ...,Full Time,Science & Technology,Newly created roles for Strategic Statistical ...,com and account the have apply group responsib...,Full Time


Training, testing and validation datasets are splitted based on 0.7/0.15/0.15. Validation set will be used in later early stop section.

In [98]:
args = Namespace(
    raw_dataset_csv="seek_australia_2000.csv",
    train_proportion=0.7,
    val_proportion=0.10,
    test_proportion=0.20,
    output_munged_csv="ass02_task01.csv",
    seed=1337
)

In [99]:
# Splitting train by job_type
# Create dict
by_type = collections.defaultdict(list)
for _, row in data.iterrows():
    by_type[row.job_type_new].append(row.to_dict())

In [100]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_type.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion*n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [101]:
split_task1 = pd.DataFrame(final_list)
split_task1.split.value_counts()
# len(split_task1) = 2000

train    1399
test      402
val       199
Name: split, dtype: int64

In [102]:
set(split_task1.split)

{'test', 'train', 'val'}

In [103]:
split_task1 = split_task1[["category", "job_description_new", "top_10_words", "job_type_new", "split"]]

In [104]:
split_task1[pd.isnull(split_task1.job_description_new)]

Unnamed: 0,category,job_description_new,top_10_words,job_type_new,split


In [105]:
split_task1

Unnamed: 0,category,job_description_new,top_10_words,job_type_new,split
0,Trades & Services,We are seeking suitably qualified and motivat...,roles go staff it an role team join opportunit...,Full Time,train
1,Healthcare & Medical,Our business IPAR is the leading provider of ...,health range excel developing problem also res...,Full Time,train
2,Administration & Office Support,Cricket Australia CA is one of the nation s p...,corporate level its providing fit addressing d...,Full Time,train
3,Human Resources & Recruitment,We are seeking highly motivated and enthusias...,appointments great comp get not ir skill stron...,Full Time,train
4,Marketing & Communications,This organization is one of Canberra s iconic...,contract then marketing format delivery exposu...,Full Time,train
...,...,...,...,...,...
1995,Healthcare & Medical,We are seeking a Part time qualified dynamic ...,solving problem under clinical motivated abili...,Other,test
1996,Trades & Services,About the Company WorkPac is Australia s lead...,print large leading core go can off but areas ...,Other,test
1997,Trades & Services,We are looking for security guards to work in...,looking can hour it an considered for as with are,Other,test
1998,Information & Communication Technology,As one of the Australia s largest providers o...,identity connect format word provider function...,Other,test


In [106]:
pd.Series(dict(FreqDist(split_task1.job_type_new)))

Full Time    1383
Other         617
dtype: int64

In [107]:
split_task1.dtypes

category               object
job_description_new    object
top_10_words           object
job_type_new           object
split                  object
dtype: object

In [108]:
split_task1.to_csv(args.output_munged_csv, index=False)

### Splitting data for task 2

This time we splited the dataset based on ***category*** for task 2 multiclassification problem. Training, testing and validation datasets are splitted based on 0.7/0.15/0.15. Validation set will be used in later early stop section.

In [292]:
args = Namespace(
    raw_dataset_csv="seek_australia_2000.csv",
    train_proportion=0.7,
    val_proportion=0.10,
    test_proportion=0.20,
    output_munged_csv="ass02_task02.csv",
    seed=1337
)

In [206]:
# Splitting train by category
# Create dict
by_cate = collections.defaultdict(list)
for _, row in data.iterrows():
    by_cate[row.category].append(row.to_dict())

In [207]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_cate.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion*n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [208]:
split_task2 = pd.DataFrame(final_list)
split_task2.split.value_counts()

train    1387
test      425
val       188
Name: split, dtype: int64

In [209]:
set(split_task1.split)

{'test', 'train', 'val'}

In [210]:
split_task2

Unnamed: 0,job_description,job_type,category,job_description_new,top_10_words,job_type_new,split
0,The Company We are currently working with a G...,Contract/Temp,Accounting,The Company We are currently working with a G...,they respected service no credit access per re...,Other,train
1,About the Company My clientÂ is a global mark...,Full Time,Accounting,About the Company My client is a global marke...,every they accounting use service control no u...,Full Time,train
2,About the business After two years in operati...,Part Time,Accounting,About the business After two years in operati...,ward put pride accounting hours establish look...,Other,train
3,The Firm This firm has been in businessÂ for ...,Full Time,Accounting,The Firm This firm has been in business for o...,accounting look firm service no credit per are...,Full Time,train
4,Project Finance Manager Â The Opportunity At...,Full Time,Accounting,Project Finance Manager The Opportunity At Pw...,them ward primarily every they accounting hour...,Full Time,train
...,...,...,...,...,...,...,...
1995,LaseMedics are going through a period of expa...,Part Time,Trades & Services,LaseMedics are going through a period of expa...,looking person go can machines but driven oppo...,Other,test
1996,Hays Trades & Labour require carpenters for a...,Contract/Temp,Trades & Services,Hays Trades Labour require carpenters for an ...,many leading too looking call go opportunities...,Other,test
1997,About Us GJK Facility Services is one of the ...,Full Time,Trades & Services,About Us GJK Facility Services is one of the ...,large person go can but areas organisation so ...,Full Time,test
1998,"Well presented, professional,Â highly motivat...",Full Time,Trades & Services,Well presented professional highly motivated ...,go can off but driven areas it over an more,Full Time,test


In [211]:
split_task2 = split_task2[["category", "job_description_new", "top_10_words", "split"]]

In [212]:
split_task2[pd.isnull(split_task2.job_description_new)]

Unnamed: 0,category,job_description_new,top_10_words,split


In [285]:
split_task2.category.value_counts()

Trades & Services                         249
Information & Communication Technology    190
Government & Defence                      156
Manufacturing, Transport & Logistics      153
Healthcare & Medical                      153
Administration & Office Support           124
Accounting                                108
Sales                                      87
Hospitality & Tourism                      82
Construction                               80
Retail & Consumer Products                 66
Education & Training                       61
Community Services & Development           61
Marketing & Communications                 55
Call Centre & Customer Service             51
Human Resources & Recruitment              49
Banking & Financial Services               45
Engineering                                45
Real Estate & Property                     42
Mining, Resources & Energy                 36
Legal                                      25
Design & Architecture             

In [284]:
print('There are {} unique job categories in the whole dataset'.format(len(set(split_task2.category))))
print('There are {} job categories in the training set'.format(len(set(split_task2[split_task2['split'] == 'train'].category))))
print('There are {} job categories in the test set'.format(len(set(split_task2[split_task2['split'] == 'test'].category))))
print('There are {} job categories in the validation set'.format(len(set(split_task2[split_task2['split'] == 'val'].category))))

There are 30 unique job categories in the whole dataset
There are 29 job categories in the training set
There are 30 job categories in the test set
There are 25 job categories in the validation set


From the results above, there are 30 classes in job category, however, there are only 29 categories in the training set, 30 categories in the test set and 25 categories in the validation set. If we train the RNN or GRU model with using training set to perform the model, it can only predict the 29 categories that in the training set instead of all 30 categories.

We found that 'Self Employment' category only occurs one time in the first 2000 rows, therefore, there is no 'Self Employment' category in training set. In order to make sure there are same unique categories in both training and test set, we removed this category from the test set.

In [287]:
split_task2[split_task2['category'] == 'Self Employment']

Unnamed: 0,category,job_description_new,top_10_words,split
1738,Self Employment,Join a team you ll love within a company Austr...,to and with of for mortgage my the broker brokers,test


In [293]:
split_task2.drop(split_task2[split_task2['category'] == 'Self Employment'].index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split_task2.drop(split_task2[split_task2['category'] == 'Self Employment'].index, inplace = True)


In [294]:
len(split_task2)

1999

In [295]:
split_task2

Unnamed: 0,category,job_description_new,top_10_words,split
0,Accounting,The Company We are currently working with a G...,they respected service no credit access per re...,train
1,Accounting,About the Company My client is a global marke...,every they accounting use service control no u...,train
2,Accounting,About the business After two years in operati...,ward put pride accounting hours establish look...,train
3,Accounting,The Firm This firm has been in business for o...,accounting look firm service no credit per are...,train
4,Accounting,Project Finance Manager The Opportunity At Pw...,them ward primarily every they accounting hour...,train
...,...,...,...,...
1995,Trades & Services,LaseMedics are going through a period of expa...,looking person go can machines but driven oppo...,test
1996,Trades & Services,Hays Trades Labour require carpenters for an ...,many leading too looking call go opportunities...,test
1997,Trades & Services,About Us GJK Facility Services is one of the ...,large person go can but areas organisation so ...,test
1998,Trades & Services,Well presented professional highly motivated ...,go can off but driven areas it over an more,test


In [296]:
split_task2.to_csv(args.output_munged_csv, index=False)

### Dataset Feed Forward Neural Network model

In [114]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        return cls(task1_df, OneHotVectorizer.from_dataframe(task1_df))
    def get_vectorizer(self):
        return self._vectorizer
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    def __len__(self):
        return self._target_size
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words)
        job_type_index = \
        self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)
        return {'x_data': top_10_vector,
                'y_target': job_type_index}
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

### One-Hot Encoding Vectorizer

In [115]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab
    def vectorize(self, top_10):
        one_hot = np.zeros(len(self.top_10_words_vocab), dtype=np.float32)
        for token in top_10.split(" "):
            if token not in string.punctuation:
                one_hot[self.top_10_words_vocab.lookup_token(token)] = 1
        return one_hot
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        top_10_words_vocab = Vocabulary(add_unk=True)
        job_type_new_vocab = Vocabulary(add_unk=False)
        # Add job_types
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)
        # Add top words if count > provided count
        word_counts = Counter()
        for top_10 in task1_df.top_10_words:
            for word in top_10.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                top_10_words_vocab.add_token(word)
        return cls(top_10_words_vocab, job_type_new_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = Vocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab = Vocabulary.from_serializable(contents['job_type_new_vocab'])
        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)
    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

### Dataset for CNN model

In [116]:
# Dataset for CNN by using the top 10 words
class CNNDataset(Dataset):
    def __init__(self, task1cnn_df, vectorizer):
        self.task1cnn_df = task1cnn_df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, task1cnn_df.job_descr)) + 2
        

        self.train_df = self.task1cnn_df[self.task1cnn_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1cnn_df[self.task1cnn_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1cnn_df[self.task1cnn_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = task1cnn_df.job_type_new.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.job_type_new_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1CNN_csv):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        train_task1cnn_df = task1cnn_df[task1cnn_df.split=='train']
        return cls(task1cnn_df, TopPretrainedVectorizer.from_dataframe(train_task1cnn_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, task1CNN_csv, vectorizer_filepath):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task1CNN_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return TopPretrainedVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        top_10_words_vector = \
            self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)

        job_type_new_index = \
            self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)

        return {'x_data': top_10_words_vector,
                'y_target': job_type_new_index}

    def get_num_batches(self, batch_size):

        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

### Pre-trained GloVe Vectorizer & Domain Specific Vectorizer

In [117]:
# top 10 word pretrained Vectorizer
class TopPretrainedVectorizer(object): 
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        indices = [self.top_10_words_vocab.begin_seq_index]
        indices.extend(self.top_10_words_vocab.lookup_token(token) 
                       for token in top_10_words.split(" "))
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.top_10_words_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, task1cnn_df, cutoff=25):
        job_type_new_vocab = CNNVocabulary()        
        for job_type in sorted(set(task1cnn_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)

        word_counts = Counter()
        for top_10 in task1cnn_df.top_10_words:
            for token in top_10.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        top_10_words_vocab = CNNSequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                top_10_words_vocab.add_token(word)
        
        return cls(top_10_words_vocab, job_type_new_vocab)

    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = \
            CNNSequenceVocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab =  \
            CNNVocabulary.from_serializable(contents['job_type_new_vocab'])

        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)

    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

In [None]:
# General utilities
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def load_glove_from_file(glove_filepath): 
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def load_domain_from_file(domain_filepath):
    word_to_index = {}
    embeddings = []
    with open(domain_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_glove_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

def make_domain_embedding_matrix(domain_filepath, words):
    word_to_idx, domain_embeddings = load_domain_from_file(domain_filepath)
    embedding_size = domain_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = domain_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

# Task 1 Binary Document Classification

## Feed-Forward Neural Network Model 

#### `One-Hot Embeddings`

In [1446]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        return cls(task1_df, OneHotVectorizer.from_dataframe(task1_df))
    def get_vectorizer(self):
        return self._vectorizer
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    def __len__(self):
        return self._target_size
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words)
        job_type_index = \
        self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)
        return {'x_data': top_10_vector,
                'y_target': job_type_index}
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

In [1447]:
# Vocabulary
class Vocabulary(object):
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                                for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = 1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    def __len__(self):
        return len(self._token_to_idx)

# Dataloader
from torch.utils.data import DataLoader
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, 
                            shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [1448]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab
    def vectorize(self, top_10):
        one_hot = np.zeros(len(self.top_10_words_vocab), dtype=np.float32)
        for token in top_10.split(" "):
            if token not in string.punctuation:
                one_hot[self.top_10_words_vocab.lookup_token(token)] = 1
        return one_hot
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        top_10_words_vocab = Vocabulary(add_unk=True)
        job_type_new_vocab = Vocabulary(add_unk=False)
        # Add job_types
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)
        # Add top words if count > provided count
        word_counts = Counter()
        for top_10 in task1_df.top_10_words:
            for word in top_10.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                top_10_words_vocab.add_token(word)
        return cls(top_10_words_vocab, job_type_new_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = Vocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab = Vocabulary.from_serializable(contents['job_type_new_vocab'])
        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)
    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

In [1449]:
# Perceptron Classifier
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardClassifier(nn.Module):
    def __init__(self, num_features):
        super(FeedForwardClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
    
    def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

In [1450]:
# Initial Setup
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=15,
    model_state_file='model0.pth',
    task1_csv="ass02_task01.csv",
    save_dir='model_storage/ffnn/',
    vectorizer_file='vectorizer0.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    cuda=True,
    device='cuda',
)

In [1451]:
# Training preparation/initialization
import torch
import torch.optim as optim
import pandas as pd

def make_train_state(args):
    return {'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': 1,
            'test_acc': 1}

train_state = make_train_state(args)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# dataset and vectorizer
dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv)
vectorizer = dataset.get_vectorizer()
# model
classifier = FeedForwardClassifier(num_features=len(vectorizer.top_10_words_vocab))
classifier = classifier.to(args.device)
# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [1452]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [1453]:
# Training
import numpy as np
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    # Iterate over training dataset
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is 5 steps:
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index + 1)
        # step 4. use loss to produce gradients
        loss.backward()
        # step 5. use optimizer to take gradient step
        optimizer.step()
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # step 1. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        # step 2. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

In [1457]:
# Evaluation
dataset.set_split('test')
batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [1458]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.610
Test Accuracy: 68.75


#### Feed Forward Neural Network Method 2

In [1550]:
# Dataset
from torch.utils.data import Dataset
class FeedFowardDataset(Dataset):
    def __init__(self, task1_df, vectorizer):
        self.task1_df = task1_df
        self._vectorizer = vectorizer
        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, task1_df.top_10_words)) + 2

        self.train_df = self.task1_df[self.task1_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1_df[self.task1_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1_df[self.task1_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')
        
        # Class weights
        class_counts = task1_df.job_type_new.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.job_type_new_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1_csv):
        task1_df = pd.read_csv(task1_csv)
        train_task1_df = task1_df[task1_df.split=='train']
        return cls(task1_df, OneHotVectorizer.from_dataframe(train_task1_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, task1_csv, vectorizer_filepath):
        task1_df = pd.read_csv(task1_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task1_csv, vectorizer)
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return OneHotVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        top_10_vector = \
        self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)
        job_type_index = \
        self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)

        return {'x_data': top_10_vector,
                'y_target': job_type_index}
                
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

# Dataloader
from torch.utils.data import DataLoader
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, 
                            shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [1551]:
# Vocabulary
class Vocabulary(object):
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                                for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = 1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    def __len__(self):
        return len(self._token_to_idx)

In [1552]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [1553]:
# Vectorizer
from collections import Counter
import string
class OneHotVectorizer(object):
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        indices = [self.top_10_words_vocab.begin_seq_index]
        indices.extend(self.top_10_words_vocab.lookup_token(token) 
                       for token in top_10_words.split(" "))
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.top_10_words_vocab.mask_index

        return out_vector
    @classmethod
    def from_dataframe(cls, task1_df, cutoff=15):
        job_type_new_vocab = Vocabulary(add_unk=False)        
        for job_type in sorted(set(task1_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)

        word_counts = Counter()
        for top_10 in task1_df.top_10_words:
            for token in top_10.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        top_10_words_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                top_10_words_vocab.add_token(word)
        
        return cls(top_10_words_vocab, job_type_new_vocab)
    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = Vocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab = Vocabulary.from_serializable(contents['job_type_new_vocab'])
        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)
    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

In [1554]:
# Perceptron Classifier
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardClassifier(nn.Module):
    def __init__(self,embedding_size, num_embeddings,
                 hidden_dim, num_classes, dropout_p, 
                 pretrained_embeddings=None, padding_idx=0):
        super(FeedForwardClassifier, self).__init__()
        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    
    def forward(self, x_in, apply_softmax=False):
        # embed and permute so features are channels
        features = self.emb(x_in).permute(0, 2, 1)

        # features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)
        
        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [1555]:
# helper function
import torch
import torch.optim as optim
import pandas as pd

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': 1,
            'test_acc': 1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

# def compute_accuracy(y_pred, y_target):
#     y_target = y_target.cpu()
#     y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
#     n_correct = torch.eq(y_pred_indices, y_target).sum().item()
#     return n_correct / len(y_pred_indices) * 100
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [1556]:
# General utilities
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def load_glove_from_file(glove_filepath): 
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def load_domain_from_file(domain_filepath):
    word_to_index = {}
    embeddings = []
    with open(domain_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_glove_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

def make_domain_embedding_matrix(domain_filepath, words):
    word_to_idx, domain_embeddings = load_domain_from_file(domain_filepath)
    embedding_size = domain_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = domain_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

#### `One-Hot Embeddings`

In [1571]:
# Initial Setup
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=15,
    model_state_file='model1.pth',
    task1_csv="ass02_task01.csv",
    save_dir='model_storage/ffnn/one_hot',
    vectorizer_file='vectorizer1.json',
    # model hyperparameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_classes=2,
    # Training hyperparameters
    batch_size=128,
    dropout_p=0.2,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    cuda=True,
    device='cuda',    
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/ffnn/one_hot/vectorizer1.json
	model_storage/ffnn/one_hot/model1.pth
Using CUDA: False


In [1572]:
# Initializations
if args.reload_from_files:
# dataset and vectorizer
    dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddings or randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Using one hot embeddings")
    embeddings = None

# model
classifier = FeedForwardClassifier(embedding_size=args.embedding_size, 
                                    num_embeddings=len(vectorizer.top_10_words_vocab),
                                    hidden_dim=args.hidden_dim, 
                                    # num_classes=args.num_classes, 
                                    num_classes=len(vectorizer.job_type_new_vocab),
                                    dropout_p=args.dropout_p,
                                    pretrained_embeddings=embeddings,
                                    padding_idx=0)
# classifier = classifier.to(args.device)
# # loss and optimizer
# loss_func = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

Using one hot embeddings


In [1425]:
len(vectorizer.job_type_new_vocab)

2

In [1573]:
# Training loop
classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)                                           
                        
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is 5 steps:
            # step 1. zero the gradients
            optimizer.zero_grad()
            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
            # step 3. compute the loss
            print(y_pred.shape)
            print(batch_dict['y_target'])  
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch-running_loss) / (batch_index + 1)
            # step 4. use loss to produce gradients
            loss.backward()
            # step 5. use optimizer to take gradient step
            optimizer.step()
            # compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0, set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
            # step 2. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # step 3. compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([128, 2])
tensor([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0])
torch.Size([128, 2])
tensor([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 1])
torch.Size([128, 2])
tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 

In [1618]:
# Evaluation
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'])
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.778
Test Accuracy: 67.71


#### `Pretrained GloVe Embeddings`

In [1557]:
# Initial Setup
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=15,
    model_state_file='model2.pth',
    task1_csv="ass02_task01.csv",
    save_dir='model_storage/ffnn/glove',
    vectorizer_file='vectorizer2.json',
    # model hyperparameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_classes=2,
    # Training hyperparameters
    batch_size=128,
    dropout_p=0.2,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    cuda=True,
    device='cuda',    
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/ffnn/glove/vectorizer2.json
	model_storage/ffnn/glove/model2.pth
Using CUDA: False


In [1558]:
# Initializations
args.use_glove = True
if args.reload_from_files:
# dataset and vectorizer
    dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddings or randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Using one hot embeddings")
    embeddings = None

# model
classifier = FeedForwardClassifier(embedding_size=args.embedding_size, 
                                    num_embeddings=len(vectorizer.top_10_words_vocab),
                                    hidden_dim=args.hidden_dim, 
                                    # num_classes=args.num_classes, 
                                    num_classes=len(vectorizer.job_type_new_vocab),
                                    dropout_p=args.dropout_p,
                                    pretrained_embeddings=embeddings,
                                    padding_idx=0)

# len(vectorizer.job_type_new_vocab)

Using pre-trained glove embeddings


In [1559]:
# Training loop
classifier = classifier.to(args.device)
  
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)                                           
# import numpy as np
# for epoch_index in range(args.num_epochs):
#     train_state['epoch_index'] = epoch_index
#     # Iterate over training dataset
#     # setup: batch generator, set loss and acc to 0, set train mode on
#     dataset.set_split('train')
#     batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
#     running_loss = 0.0
#     running_acc = 0.0
#     classifier.train()
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is 5 steps:
            # step 1. zero the gradients
            optimizer.zero_grad()
            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
            # step 3. compute the loss
            print(y_pred.shape)
            print(batch_dict['y_target'])  
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch-running_loss) / (batch_index + 1)
            # step 4. use loss to produce gradients
            loss.backward()
            # step 5. use optimizer to take gradient step
            optimizer.step()
            # compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0, set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
            # step 2. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # step 3. compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([128, 2])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0])
torch.Size([128, 2])
tensor([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 1])
torch.Size([128, 2])
tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 

In [1565]:
# Evaluation
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'])
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [1567]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.753
Test Accuracy: 69.79


In [1568]:
# Inference and Classifying new data points
import re
# Preprocess the reviews
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text


In [1570]:
# make prediction
def predict_job_type_new(top_10_words, classifier, vectorizer, max_length):
    top_10_words = preprocess_text(top_10_words)
    vectorized_top_10_words = \
        torch.tensor(vectorizer.vectorize(top_10_words, vector_length=max_length))
    result = classifier(vectorized_top_10_words.unsqueeze(0), apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    predicted_job_type_new = vectorizer.job_type_new_vocab.lookup_index(indices.item())

    return {'job_type': predicted_job_type_new, 
            'probability': probability_values.item()}

def get_samples():
    samples = {}
    for cat in dataset.val_df.job_type_new.unique():
        samples[cat] = dataset.val_df.top_10_words[dataset.val_df.job_type_new==cat].tolist()[:5]
    return samples

val_samples = get_samples()


classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_job_type_new(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        print("Prediction: {} (p={:0.2f})".format(prediction['job_type'],
                                                  prediction['probability']))
        print("\t + Sample: {}".format(sample))
    print("-"*30 + "\n")

True Category: Full Time
Prediction: Full Time (p=0.70)
	 + Sample: calculation foreign streamlining jane cullen appeal arise pricing taxation oriented
Prediction: Full Time (p=0.66)
	 + Sample: shine wips whip tracker spotlight rush testimonials soar sees resourceful
Prediction: Full Time (p=0.69)
	 + Sample: transcript legalpersonnel lack jgrasso chained admission julie charities bill genuinely
Prediction: Full Time (p=0.68)
	 + Sample: establishments consistant captive canteen camps attentive diplomatic deliverable served majority
Prediction: Full Time (p=0.69)
	 + Sample: tipper economix diligent ot combination afternoon overtime induction white report
------------------------------

True Category: Other
Prediction: Full Time (p=0.70)
	 + Sample: sunglass hut gabbana dolce chanel prada oakley ray push rest
Prediction: Full Time (p=0.67)
	 + Sample: lloyd hall blockers removing sponsors towers tackle etl streams consultation
Prediction: Full Time (p=0.66)
	 + Sample: tidying wynyard

#### `Domain Specific Embeddings`

In [1502]:
# Initial Setup
from argparse import Namespace
args = Namespace(
    # Data and path information
    frequency_cutoff=15,
    model_state_file='model3.pth',
    task1_csv="ass02_task01.csv",
    save_dir='model_storage/ffnn/domain',
    vectorizer_file='vectorizer3.json',
    # model hyperparameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_classes=2,
    # Training hyperparameters
    batch_size=128,
    dropout_p=0.,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    cuda=True,
    device='cuda',    
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/ffnn/domain/vectorizer3.json
	model_storage/ffnn/domain/model3.pth
Using CUDA: False


In [1503]:
# Initializations
args.use_domain_embeddings = True
if args.reload_from_files:
# dataset and vectorizer
    dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = FeedFowardDataset.load_dataset_and_make_vectorizer(args.task1_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddings or randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Using one hot embeddings")
    embeddings = None

# model
classifier = FeedForwardClassifier(embedding_size=args.embedding_size, 
                                    num_embeddings=len(vectorizer.top_10_words_vocab),
                                    hidden_dim=args.hidden_dim, 
                                    # num_classes=args.num_classes, 
                                    num_classes=len(vectorizer.job_type_new_vocab),
                                    dropout_p=args.dropout_p,
                                    pretrained_embeddings=embeddings,
                                    padding_idx=0)

Using pre-trained domain specific embeddings


In [1504]:
# Training loop
classifier = classifier.to(args.device)  
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)                                           

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is 5 steps:
            # step 1. zero the gradients
            optimizer.zero_grad()
            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
            # step 3. compute the loss
            print(y_pred.shape)
            print(batch_dict['y_target'])  
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch-running_loss) / (batch_index + 1)
            # step 4. use loss to produce gradients
            loss.backward()
            # step 5. use optimizer to take gradient step
            optimizer.step()
            # compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        # setup: batch generator, set loss and acc to 0, set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'])
            # step 2. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # step 3. compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([128, 2])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0])
torch.Size([128, 2])
tensor([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 1])
torch.Size([128, 2])
tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 

In [1513]:
# Evaluation
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'])
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.766
Test Accuracy: 69.01


## CNN Conv1d Model Using Top 10 Words

In [1293]:
# Dataset for CNN by using the top 10 words
class CNNDataset(Dataset):
    def __init__(self, task1cnn_df, vectorizer):
        self.task1cnn_df = task1cnn_df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, task1cnn_df.top_10_words)) + 2
        

        self.train_df = self.task1cnn_df[self.task1cnn_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1cnn_df[self.task1cnn_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1cnn_df[self.task1cnn_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = task1cnn_df.job_type_new.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.job_type_new_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1CNN_csv):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        train_task1cnn_df = task1cnn_df[task1cnn_df.split=='train']
        return cls(task1cnn_df, TopPretrainedVectorizer.from_dataframe(train_task1cnn_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, task1CNN_csv, vectorizer_filepath):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task1CNN_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return TopPretrainedVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        top_10_words_vector = \
            self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)

        job_type_new_index = \
            self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)

        return {'x_data': top_10_words_vector,
                'y_target': job_type_new_index}

    def get_num_batches(self, batch_size):

        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [1122]:
# Vocabulary for CNN
class CNNVocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the VocabularyCNN" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<CNNVocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [1123]:
# SequenceVocabulary for CNN
class CNNSequenceVocabulary(CNNVocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(CNNSequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(CNNSequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [1124]:
# Vectorizer for pre-trained GloVe and domain specific
class TopPretrainedVectorizer(object): 
    def __init__(self, top_10_words_vocab, job_type_new_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.job_type_new_vocab = job_type_new_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        indices = [self.top_10_words_vocab.begin_seq_index]
        indices.extend(self.top_10_words_vocab.lookup_token(token) 
                       for token in top_10_words.split(" "))
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.top_10_words_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, task1cnn_df, cutoff=25):
        job_type_new_vocab = CNNVocabulary()        
        for job_type in sorted(set(task1cnn_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)

        word_counts = Counter()
        for top_10 in task1cnn_df.top_10_words:
            for token in top_10.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        top_10_words_vocab = CNNSequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                top_10_words_vocab.add_token(word)
        
        return cls(top_10_words_vocab, job_type_new_vocab)

    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = \
            CNNSequenceVocabulary.from_serializable(contents['top_10_words_vocab'])
        job_type_new_vocab =  \
            CNNVocabulary.from_serializable(contents['job_type_new_vocab'])

        return cls(top_10_words_vocab=top_10_words_vocab, job_type_new_vocab=job_type_new_vocab)

    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

In [1125]:
# CNN Classifier
class CNNClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels, 
                 hidden_dim, num_classes, dropout_p, 
                 pretrained_embeddings=None, padding_idx=0):
        super(CNNClassifier, self).__init__()

        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, 
                   out_channels=num_channels, kernel_size=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in, apply_softmax=False):
        
        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)

        features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)
        
        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [1126]:
# helper function
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [1127]:
# General utilities
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def load_glove_from_file(glove_filepath): 
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def load_domain_from_file(domain_filepath):
    word_to_index = {}
    embeddings = []
    with open(domain_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_glove_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

def make_domain_embedding_matrix(domain_filepath, words):
    word_to_idx, domain_embeddings = load_domain_from_file(domain_filepath)
    embedding_size = domain_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = domain_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

#### `One-Hot Encoding Embeddings`

In [1086]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="vectorizer1.json",
    model_state_file="model1.pth",
    save_dir="model_storage/cnn_top_10/one_hot",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=600, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0., 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn_top_10/one_hot/vectorizer1.json
	model_storage/cnn_top_10/one_hot/model1.pth
Using CUDA: False


In [1087]:
# Initializations
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddings or randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Using one hot embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.top_10_words_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using one hot embeddings


In [1088]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [1094]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.6931397517522176;
Test Accuracy: 69.27083333333333


#### `Pre-trained GloVe Embeddings`

In [1102]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="one_hot_vectorizer2.json",
    model_state_file="model2.pth",
    save_dir="model_storage/cnn_top_10/glove",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=600, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0., 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn_top_10/glove/one_hot_vectorizer2.json
	model_storage/cnn_top_10/glove/model2.pth
Using CUDA: False


In [1103]:
# Initializations
args.use_glove = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.top_10_words_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using pre-trained glove embeddings


In [1104]:
# Number of Classes
len(vectorizer.job_type_new_vocab)

2

In [1105]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [1117]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

# Test loss: 0.6933475931485494;
# Test Accuracy: 64.0625

Test loss: 0.6931898593902588;
Test Accuracy: 68.75


In [1120]:
# make prediction
def predict_job_type_new(top_10_words, classifier, vectorizer, max_length):
    top_10_words = preprocess_text(top_10_words)
    vectorized_top_10_words = \
        torch.tensor(vectorizer.vectorize(top_10_words, vector_length=max_length))
    result = classifier(vectorized_top_10_words.unsqueeze(0), apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    predicted_job_type_new = vectorizer.job_type_new_vocab.lookup_index(indices.item())

    return {'job_type': predicted_job_type_new, 
            'probability': probability_values.item()}

def get_samples():
    samples = {}
    for cat in dataset.val_df.job_type_new.unique():
        samples[cat] = dataset.val_df.top_10_words[dataset.val_df.job_type_new==cat].tolist()[:5]
    return samples

val_samples = get_samples()


classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_job_type_new(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        print("Prediction: {} (p={:0.2f})".format(prediction['job_type'],
                                                  prediction['probability']))
        print("\t + Sample: {}".format(sample))
    print("-"*30 + "\n")

True Category: Full Time
Prediction: Full Time (p=0.50)
	 + Sample: calculation foreign streamlining jane cullen appeal arise pricing taxation oriented
Prediction: Full Time (p=0.50)
	 + Sample: shine wips whip tracker spotlight rush testimonials soar sees resourceful
Prediction: Full Time (p=0.50)
	 + Sample: transcript legalpersonnel lack jgrasso chained admission julie charities bill genuinely
Prediction: Full Time (p=0.50)
	 + Sample: establishments consistant captive canteen camps attentive diplomatic deliverable served majority
Prediction: Full Time (p=0.50)
	 + Sample: tipper economix diligent ot combination afternoon overtime induction white report
------------------------------

True Category: Other
Prediction: Full Time (p=0.50)
	 + Sample: sunglass hut gabbana dolce chanel prada oakley ray push rest
Prediction: Full Time (p=0.50)
	 + Sample: lloyd hall blockers removing sponsors towers tackle etl streams consultation
Prediction: Full Time (p=0.50)
	 + Sample: tidying wynyard

#### `Domain Specific Embeddings`

In [1128]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="vectorizer3.json",
    model_state_file="model3.pth",
    save_dir="model_storage/cnn_top_10/domain",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=600, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0., 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn_top_10/domain/vectorizer3.json
	model_storage/cnn_top_10/domain/model3.pth
Using CUDA: False


In [1129]:
# Initializations
args.use_glove = False
args.use_domain_embeddings = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.top_10_words_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using pre-trained domain specific embeddings


In [1130]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [1150]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.6932318806648254;
Test Accuracy: 68.22916666666667


## CNN Conv1d Model Using Full Job Description

In [994]:
# Dataset for CNN by using the full job description
class CNNDataset(Dataset):
    def __init__(self, task1cnn_df, vectorizer):
        self.task1cnn_df = task1cnn_df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, task1cnn_df.job_description_new)) + 2
        

        self.train_df = self.task1cnn_df[self.task1cnn_df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.task1cnn_df[self.task1cnn_df.split=='val']
        self.validation_size = len(self.val_df)
        self.test_df = self.task1cnn_df[self.task1cnn_df.split=='test']
        self.test_size = len(self.test_df)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights
        class_counts = task1cnn_df.job_type_new.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.job_type_new_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, task1CNN_csv):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        train_task1cnn_df = task1cnn_df[task1cnn_df.split=='train']
        return cls(task1cnn_df, TopPretrainedVectorizer.from_dataframe(train_task1cnn_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, task1CNN_csv, vectorizer_filepath):
        task1cnn_df = pd.read_csv(task1CNN_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task1CNN_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return TopPretrainedVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        job_description_new_vector = \
            self._vectorizer.vectorize(row.job_description_new, self._max_seq_length)

        job_type_new_index = \
            self._vectorizer.job_type_new_vocab.lookup_token(row.job_type_new)

        return {'x_data': job_description_new_vector,
                'y_target': job_type_new_index}

    def get_num_batches(self, batch_size):

        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [995]:
# Vocabulary for CNN
class CNNVocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the VocabularyCNN" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<CNNVocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [996]:
# SequenceVocabulary for CNN
class CNNSequenceVocabulary(CNNVocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(CNNSequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(CNNSequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [997]:
# full job description pretrained Vectorizer for pre-trained GloVe and domain specific
class TopPretrainedVectorizer(object): 
    def __init__(self, job_description_new_vocab, job_type_new_vocab):
        self.job_description_new_vocab = job_description_new_vocab
        self.job_type_new_vocab = job_type_new_vocab

    def vectorize(self, job_description_new, vector_length=-1):
        indices = [self.job_description_new_vocab.begin_seq_index]
        indices.extend(self.job_description_new_vocab.lookup_token(token) 
                       for token in job_description_new.split(" "))
        indices.append(self.job_description_new_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.job_description_new_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, task1cnn_df, cutoff=25):
        job_type_new_vocab = CNNVocabulary()        
        for job_type in sorted(set(task1cnn_df.job_type_new)):
            job_type_new_vocab.add_token(job_type)

        word_counts = Counter()
        for top_10 in task1cnn_df.job_description_new:
            for token in top_10.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1
        
        job_description_new_vocab = CNNSequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                job_description_new_vocab.add_token(word)
        
        return cls(job_description_new_vocab, job_type_new_vocab)

    @classmethod
    def from_serializable(cls, contents):
        job_description_new_vocab = \
            CNNSequenceVocabulary.from_serializable(contents['job_description_new_vocab'])
        job_type_new_vocab =  \
            CNNVocabulary.from_serializable(contents['job_type_new_vocab'])

        return cls(job_description_new_vocab=job_description_new_vocab, job_type_new_vocab=job_type_new_vocab)

    def to_serializable(self):
        return {'job_description_new_vocab': self.job_description_new_vocab.to_serializable(),
                'job_type_new_vocab': self.job_type_new_vocab.to_serializable()}

In [998]:
# CNN Classifier
class CNNClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels, 
                 hidden_dim, num_classes, dropout_p, 
                 pretrained_embeddings=None, padding_idx=0):
        super(CNNClassifier, self).__init__()

        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, 
                   out_channels=num_channels, kernel_size=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=2),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in, apply_softmax=False):
        
        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)

        features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)
        
        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [999]:
# helper function
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [1000]:
# General utilities
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def load_glove_from_file(glove_filepath): 
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def load_domain_from_file(domain_filepath):
    word_to_index = {}
    embeddings = []
    with open(domain_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_glove_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

def make_domain_embedding_matrix(domain_filepath, words):
    word_to_idx, domain_embeddings = load_domain_from_file(domain_filepath)
    embedding_size = domain_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = domain_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

#### `One-Hot Encoding Embeddings`

In [1001]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="vectorizer1.json",
    model_state_file="model1.pth",
    save_dir="model_storage/cnn_description/one_hot",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=600, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.2, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn_description/one_hot/vectorizer1.json
	model_storage/cnn_description/one_hot/model1.pth
Using CUDA: False


In [1002]:
# Initializations
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.job_description_new_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.job_description_new_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Using one hot embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.job_description_new_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using one hot embeddings


In [1003]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [1004]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [1006]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 0.8452479839324951;
Test Accuracy: 70.05208333333333


#### `Pre-trained GloVe Embeddings`

In [1007]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="vectorizer2.json",
    model_state_file="model2.pth",
    save_dir="model_storage/cnn_description/glove",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=600, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.2, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn_description/glove/vectorizer2.json
	model_storage/cnn_description/glove/model2.pth
Using CUDA: False


In [1008]:
# Initializations
args.use_glove = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.job_description_new_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.job_description_new_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.job_description_new_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using pre-trained glove embeddings


In [1009]:
# Number of Classes
len(vectorizer.job_type_new_vocab)

2

In [1010]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [1011]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [1012]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

# Test loss: 0.6499330401420593;
# Test Accuracy: 61.458333333333336

Test loss: 0.5823986728986105;
Test Accuracy: 72.13541666666667


In [1013]:
def predict_job_type_new(job_description_new, classifier, vectorizer, max_length):
    """Predict a News category for a new title
    
    Args:
        title (str): a raw title string
        classifier (NewsClassifier): an instance of the trained classifier
        vectorizer (NewsVectorizer): the corresponding vectorizer
        max_length (int): the max sequence length
            Note: CNNs are sensitive to the input data tensor size. 
                  This ensures to keep it the same size as the training data
    """
    job_description_new = preprocess_text(job_description_new)
    vectorized_job_description_new = \
        torch.tensor(vectorizer.vectorize(job_description_new, vector_length=max_length))
    result = classifier(vectorized_job_description_new.unsqueeze(0), apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    predicted_job_type_new = vectorizer.job_type_new_vocab.lookup_index(indices.item())

    return {'job_type': predicted_job_type_new, 
            'probability': probability_values.item()}

In [1014]:
def get_samples():
    samples = {}
    for cat in dataset.val_df.job_type_new.unique():
        samples[cat] = dataset.val_df.job_description_new[dataset.val_df.job_type_new==cat].tolist()[:5]
    return samples

val_samples = get_samples()

In [1015]:
#title = input("Enter a news title to classify: ")
classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_job_type_new(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        print("Prediction: {} (p={:0.2f})".format(prediction['job_type'],
                                                  prediction['probability']))
        print("\t + Sample: {}".format(sample))
    print("-"*30 + "\n")

True Category: Full Time
Prediction: Full Time (p=0.75)
	 + Sample:  Do you want to work with a growth oriented funds management company With a pipeline of significant investments this progressive company is well positioned for the future You will enjoy working independently yet as part of a small close knit team Your key role will include Day to day financial management of reconciliations end of month processing and financial reporting Cash flow forecasting and management reporting Preparation of statutory accounts Unit pricing a foreign exchange calculation Application of taxation concepts Financial analysis and A continued focus on improvement and streamlining of systems This is a key role working as part of a high performing organisation with highly engaged staff You will enjoy a broad based role with plenty of variety and opportunity to take on further challenges as they arise This role will appeal to a CA CPA qualified Accountant who has exposure to the funds management banking o

#### `Domain Specific Embeddings`

In [1016]:
# Setting and prep work
args = Namespace(
    # Data and Path hyper parameters
    task1CNN_csv="ass02_task01.csv",
    vectorizer_file="vectorizer3.json",
    model_state_file="model3.pth",
    save_dir="model_storage/cnn_description/domain",
    # Model hyper parameters
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    embedding_size=100, 
    hidden_dim=600, 
    num_channels=100, 
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.2, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # Runtime option
    cuda=True, 
    device='cuda',
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/cnn_description/domain/vectorizer3.json
	model_storage/cnn_description/domain/model3.pth
Using CUDA: False


In [1017]:
# Initializations
args.use_domain_embeddings = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = CNNDataset.load_dataset_and_load_vectorizer(args.task1CNN_csv,
                                                           args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = CNNDataset.load_dataset_and_make_vectorizer(args.task1CNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.job_description_new_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.job_description_new_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CNNClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.job_description_new_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.job_type_new_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using pre-trained domain specific embeddings


In [1018]:
# training loop
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/10 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

In [1019]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

# Test loss: 0.6115186810493469;
# Test Accuracy: 71.61458333333333

Test loss: 0.5884544253349304;
Test Accuracy: 70.3125


# Task 2 Multi-class Document Classification

## Vanilla Elman RNN

In [1165]:
# Dataset
class RNNDataset(Dataset):
    def __init__(self, task2rnn_df, vectorizer):
        self.task2rnn_df = task2rnn_df 
        self._vectorizer = vectorizer

        self._max_seq_length = max(map(len, self.task2rnn_df.top_10_words)) + 2

        self.train_df = self.task2rnn_df[self.task2rnn_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.task2rnn_df[self.task2rnn_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.task2rnn_df[self.task2rnn_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 
                             'val': (self.val_df, self.validation_size), 
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
        # Class weights
        class_counts = self.train_df.category.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.category_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, task2RNN_csv):
        task2rnn_df = pd.read_csv(task2RNN_csv)
        train_task2rnn_df = task2rnn_df[task2rnn_df.split=='train']
        return cls(task2rnn_df, RNNVectorizer.from_dataframe(train_task2rnn_df))
        
    @classmethod
    def load_dataset_and_load_vectorizer(cls, task2RNN_csv, vectorizer_filepath):
        task2rnn_df = pd.read_csv(task2RNN_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(task2rnn_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return RNNVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        
        top_10_words_vector, vec_length = \
            self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)
        
        category_index = \
            self._vectorizer.category_vocab.lookup_token(row.category)

        return {'x_data': top_10_words_vector, 
                'y_target': category_index, 
                'x_length': vec_length}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [1166]:
# Vocabulary
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [1167]:
# SequenceVocabulary
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [1168]:
# RNN Vectorizer
class RNNVectorizer(object): 
    def __init__(self, top_10_words_vocab, category_vocab):
        self.top_10_words_vocab = top_10_words_vocab
        self.category_vocab = category_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        indices = [self.top_10_words_vocab.begin_seq_index]
        indices.extend(self.top_10_words_vocab.lookup_token(token) 
                       for token in top_10_words)
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)         
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.top_10_words_vocab.mask_index
        
        return out_vector, len(indices)

    @classmethod
    def from_dataframe(cls, task2rnn_df):
        top_10_words_vocab = SequenceVocabulary()
        category_vocab = Vocabulary()

        for index, row in task2rnn_df.iterrows():
            for top_10 in row.top_10_words:
                top_10_words_vocab.add_token(top_10)
            category_vocab.add_token(row.category)

        return cls(top_10_words_vocab, category_vocab)

    @classmethod
    def from_serializable(cls, contents):
        top_10_words_vocab = SequenceVocabulary.from_serializable(contents['top_10_words_vocab'])
        category_vocab =  Vocabulary.from_serializable(contents['category_vocab'])

        return cls(top_10_words_vocab=top_10_words_vocab, category_vocab=category_vocab)

    def to_serializable(self):
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(), 
                'category_vocab': self.category_vocab.to_serializable()}

In [1169]:
# Model - retrieving the last vector of each sequence
def column_gather(y_out, x_lengths):
    x_lengths = x_lengths.long().detach().cpu().numpy() - 1
    out = []
    for batch_index, column_index in enumerate(x_lengths):
        out.append(y_out[batch_index, column_index])

    return torch.stack(out)

In [1170]:
# Vanilla ElmanRNN model
class ElmanRNN(nn.Module):
    def __init__(self, input_size, hidden_size, batch_first=False):
        super(ElmanRNN, self).__init__()
        
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)
        
        self.batch_first = batch_first
        self.hidden_size = hidden_size

    def _initial_hidden(self, batch_size):
        return torch.zeros((batch_size, self.hidden_size))

    def forward(self, x_in, initial_hidden=None):
        if self.batch_first:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()
    
        hiddens = []

        if initial_hidden is None:
            initial_hidden = self._initial_hidden(batch_size)
            initial_hidden = initial_hidden.to(x_in.device)

        hidden_t = initial_hidden
                    
        for t in range(seq_size):
            hidden_t = self.rnn_cell(x_in[t], hidden_t)
            hiddens.append(hidden_t)
            
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens

class CategoryClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first=True, padding_idx=0, pretrained_embeddings=None):
        super(CategoryClassifier, self).__init__()
        # modify the classifier to allow for the choice of one of the three:
        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)
                                    
        self.rnn = ElmanRNN(input_size=embedding_size,
                             hidden_size=rnn_hidden_size,
                             batch_first=batch_first)
        self.fc1 = nn.Linear(in_features=rnn_hidden_size,
                         out_features=rnn_hidden_size)
        self.fc2 = nn.Linear(in_features=rnn_hidden_size,
                          out_features=num_classes)

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        x_embedded = self.emb(x_in)
        y_out = self.rnn(x_embedded)

        if x_lengths is not None:
            y_out = column_gather(y_out, x_lengths)
        else:
            y_out = y_out[:, -1, :]

        y_out = F.relu(self.fc1(F.dropout(y_out, 0.5)))
        y_out = self.fc2(F.dropout(y_out, 0.5))

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out

#### Training Routine

In [1171]:
# helper function / hyperparameters
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}


def update_train_state(args, model, train_state):
    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
         
        # If loss worsened
        if loss_t >= loss_tm1:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state


def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [1172]:
# General utilities
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def load_glove_from_file(glove_filepath): 
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def load_domain_from_file(domain_filepath):
    word_to_index = {}
    embeddings = []
    with open(domain_filepath, encoding="utf8") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_glove_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

def make_domain_embedding_matrix(domain_filepath, words):
    word_to_idx, domain_embeddings = load_domain_from_file(domain_filepath)
    embedding_size = domain_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = domain_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i
    return final_embeddings

In [1173]:
import os
os.getcwd()

'/Users/jrc/Documents/Data science/S1-2022/CITS4012 Natural Language Processing/Project2'

In [1176]:
# Setting and some prep work
args = Namespace(
    # Data and path information
    task2RNN_csv="ass02_task02.csv",
    vectorizer_file="vectorizer1.json",
    model_state_file="model1.pth",
    save_dir="model_storage/rnn/glove",
    # Model hyper parameter
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    char_embedding_size=100,
    rnn_hidden_size=600,
    # Training hyper parameter
    num_epochs=100,
    learning_rate=1e-3,
    batch_size=64,
    seed=1337,
    early_stopping_criteria=5,
    # Runtime hyper parameter
    cuda=True,
    device='cuda',
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))
    
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/rnn/glove/vectorizer1.json
	model_storage/rnn/glove/model1.pth
Using CUDA: False


### Pre-trained GloVe embedding

In [1177]:
# Initializations
args.use_glove = True
# args.use_domain_embeddings = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = RNNDataset.load_dataset_and_load_vectorizer(args.task2RNN_csv, 
                                                              args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = RNNDataset.load_dataset_and_make_vectorizer(args.task2RNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CategoryClassifier(embedding_size=args.char_embedding_size, 
                               num_embeddings=len(vectorizer.top_10_words_vocab),
                               num_classes=len(vectorizer.category_vocab),
                               rnn_hidden_size=args.rnn_hidden_size,
                               padding_idx=vectorizer.top_10_words_vocab.mask_index,
                               pretrained_embeddings=embeddings)

Using pre-trained glove embeddings


In [1178]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'], 
                                x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
    
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on

        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = classifier(x_in=batch_dict['x_data'], 
                                x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier, 
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break
            
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/21 [00:00<?, ?it/s]

split=val:   0%|          | 0/2 [00:00<?, ?it/s]

In [1179]:
# compute the loss & accuracy on the test set using the best available model
classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'],
                         x_lengths=batch_dict['x_length'])
    
    # # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [1181]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

# Test loss: 3.3487893740336103;
# Test Accuracy: 7.03125

Test loss: 4.097150802612305;
Test Accuracy: 19.791666666666668


In [1183]:
# Inference
def predict_category(top_10_words, classifier, vectorizer):
    vectorized_type, vec_length = vectorizer.vectorize(top_10_words)
    vectorized_type = torch.tensor(vectorized_type).unsqueeze(dim=0)
    vec_length = torch.tensor([vec_length], dtype=torch.int64)
    
    result = classifier(vectorized_type, vec_length, apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    
    index = indices.item()
    prob_value = probability_values.item()

    predicted_category = vectorizer.category_vocab.lookup_index(index)

    return {'category': predicted_category, 'probability': prob_value, 'top_10_words': top_10_words}

ex1 = split_task2.iloc[100,]
ex2 = split_task2.iloc[500,]
ex3 = split_task2.iloc[1000,]
for top_10_words in [ex1, ex2, ex3]:
    print('Predicted label is:', predict_category(top_10_words.top_10_words, classifier, vectorizer)['category'])
    print('True label is:', top_10_words.category)
    print('Probability is:', predict_category(top_10_words.top_10_words, classifier, vectorizer)['probability'], '\n')

Predicted label is: Accounting
True label is: Accounting
Probability is: 0.17699715495109558 

Predicted label is: Design & Architecture
True label is: Design & Architecture
Probability is: 0.5028868913650513 

Predicted label is: Government & Defence
True label is: Hospitality & Tourism
Probability is: 0.15045621991157532 



### Pre-trained domain specific embedding

In [1185]:
# Setting and some prep work
args = Namespace(
    # Data and path information
    task2RNN_csv="ass02_task02.csv",
    vectorizer_file="vectorizer2.json",
    model_state_file="model2.pth",
    save_dir="model_storage/rnn/domain",
    # Model hyper parameter
    glove_filepath='glove.6B.100d.txt', 
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    char_embedding_size=100,
    rnn_hidden_size=400,
    # Training hyper parameter
    num_epochs=100,
    learning_rate=1e-3,
    batch_size=64,
    seed=1337,
    early_stopping_criteria=5,
    # Runtime hyper parameter
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))
    
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/rnn/domain/vectorizer2.json
	model_storage/rnn/domain/model2.pth
Using CUDA: False


In [1186]:
# Initializations
args.use_domain_embeddings = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = RNNDataset.load_dataset_and_load_vectorizer(args.task2RNN_csv, 
                                                              args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = RNNDataset.load_dataset_and_make_vectorizer(args.task2RNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe, domain_embeddingsor randomly initialized embeddings
if args.use_glove: 
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained glove embeddings")
elif args.use_domain_embeddings:
    words = vectorizer.top_10_words_vocab._token_to_idx.keys()
    embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                       words=words)
    print("Using pre-trained domain specific embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = CategoryClassifier(embedding_size=args.char_embedding_size, 
                               num_embeddings=len(vectorizer.top_10_words_vocab),
                               num_classes=len(vectorizer.category_vocab),
                               rnn_hidden_size=args.rnn_hidden_size,
                               padding_idx=vectorizer.top_10_words_vocab.mask_index,
                               pretrained_embeddings=embeddings)

Using pre-trained domain specific embeddings


In [1187]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
    
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(x_in=batch_dict['x_data'], 
                                x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
    
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on

        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = classifier(x_in=batch_dict['x_data'], 
                                x_lengths=batch_dict['x_length'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier, 
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break
            
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/21 [00:00<?, ?it/s]

split=val:   0%|          | 0/2 [00:00<?, ?it/s]

In [1188]:
# compute the loss & accuracy on the test set using the best available model
classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'],
                         x_lengths=batch_dict['x_length'])
    
    # # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [1189]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

# Test loss: 3.5385095675786333;
# Test Accuracy: 5.989583333333333

Test loss: 3.8599623044331866;
Test Accuracy: 16.40625


In [1190]:
# Inference
def predict_category(top_10_words, classifier, vectorizer):
    vectorized_type, vec_length = vectorizer.vectorize(top_10_words)
    vectorized_type = torch.tensor(vectorized_type).unsqueeze(dim=0)
    vec_length = torch.tensor([vec_length], dtype=torch.int64)
    
    result = classifier(vectorized_type, vec_length, apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    
    index = indices.item()
    prob_value = probability_values.item()

    predicted_category = vectorizer.category_vocab.lookup_index(index)

    return {'category': predicted_category, 'probability': prob_value, 'top_10_words': top_10_words}

ex1 = split_task2.iloc[100,]
ex2 = split_task2.iloc[500,]
ex3 = split_task2.iloc[1000,]
for top_10_words in [ex1, ex2, ex3]:
    print('Predicted label is:', predict_category(top_10_words.top_10_words, classifier, vectorizer)['category'])
    print('True label is:', top_10_words.category)
    print('Probability is:', predict_category(top_10_words.top_10_words, classifier, vectorizer)['probability'], '\n')

Predicted label is: Real Estate & Property
True label is: Accounting
Probability is: 0.19581517577171326 

Predicted label is: Design & Architecture
True label is: Design & Architecture
Probability is: 0.34595853090286255 

Predicted label is: Hospitality & Tourism
True label is: Hospitality & Tourism
Probability is: 0.11304925382137299 



## Gated Recurrent Unit (GRU)

In [None]:
class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping"""

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary
        
        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [None]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [None]:
class GRUVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""    
    def __init__(self, top_10_words_vocab, category_vocab):
        """
        Args:
            top_10_words_vocab (SequenceVocabulary): maps words to integers
            category_vocab (Vocabulary): maps categories to integers
        """
        self.top_10_words_vocab = top_10_words_vocab
        self.category_vocab = category_vocab

    def vectorize(self, top_10_words, vector_length=-1):
        """Vectorize a top_10_words into a vector of observations and targets
        
        The outputs are the vectorized top_10_words split into two vectors:
            top_10_words[:-1] and top_10_words[1:]
        At each timestep, the first vector is the observation and the second vector is the target. 
        
        Args:
            top_10_words (str): the top_10_words to be vectorized
            vector_length (int): an argument for forcing the length of index vector
        Returns:
            a tuple: (from_vector, to_vector)
            from_vector (numpy.ndarray): the observation vector 
            to_vector (numpy.ndarray): the target prediction vector
        """
        indices = [self.top_10_words_vocab.begin_seq_index] 
        indices.extend(self.top_10_words_vocab.lookup_token(token) for token in top_10_words)
        indices.append(self.top_10_words_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices) - 1

        from_vector = np.zeros(vector_length, dtype=np.int64)         
        from_indices = indices[:-1]
        from_vector[:len(from_indices)] = from_indices
        from_vector[len(from_indices):] = self.top_10_words_vocab.mask_index

        to_vector = np.zeros(vector_length, dtype=np.int64)
        to_indices = indices[1:]
        to_vector[:len(to_indices)] = to_indices
        to_vector[len(to_indices):] = self.top_10_words_vocab.mask_index
        
        return from_vector, to_vector

    @classmethod
    def from_dataframe(cls, top_10_words_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            surname_df (pandas.DataFrame): the surname dataset
        Returns:
            an instance of the SurnameVectorizer
        """
        top_10_words_vocab = SequenceVocabulary()
        category_vocab = Vocabulary()

        for index, row in top_10_words_df.iterrows():
            for char in row.top_10_words:
                top_10_words_vocab.add_token(char)
            category_vocab.add_token(row.category)

        return cls(top_10_words_vocab, category_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate the vectorizer from saved contents
        
        Args:
            contents (dict): a dict holding two vocabularies for this vectorizer
                This dictionary is created using `vectorizer.to_serializable()`
        Returns:
            an instance of SurnameVectorizer
        """
        top_10_words_vocab = SequenceVocabulary.from_serializable(contents['top_10_words_vocab'])
        cat_vocab =  Vocabulary.from_serializable(contents['category_vocab'])

        return cls(top_10_words_vocab=top_10_words_vocab, category_vocab=cat_vocab)

    def to_serializable(self):
        """ Returns the serializable contents """
        return {'top_10_words_vocab': self.top_10_words_vocab.to_serializable(), 
                'category_vocab': self.category_vocab.to_serializable()}

In [None]:
class GRUDataset(Dataset):
    def __init__(self, top_10_words_df, vectorizer):
        """
        Args:
            surname_df (pandas.DataFrame): the dataset
            vectorizer (SurnameVectorizer): vectorizer instatiated from dataset
        """
        self.top_10_words_df = top_10_words_df 
        self._vectorizer = vectorizer

        self._max_seq_length = max(map(len, self.top_10_words_df.top_10_words)) + 2

        self.train_df = self.top_10_words_df[self.top_10_words_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.top_10_words_df[self.top_10_words_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.top_10_words_df[self.top_10_words_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 
                             'val': (self.val_df, self.validation_size), 
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, top_10_words_csv):
        """Load dataset and make a new vectorizer from scratch
        
        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        
        top_10_words_df = pd.read_csv(top_10_words_csv)
        return cls(top_10_words_df, GRUVectorizer.from_dataframe(top_10_words_df))
        
    @classmethod
    def load_dataset_and_load_vectorizer(cls, top_10_words_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer. 
        Used in the case in the vectorizer has been cached for re-use
        
        Args:
            surname_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of SurnameDataset
        """
        top_10_words_df = pd.read_csv(top_10_words_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(top_10_words_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file
        
        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of SurnameVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return GRUVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json
        
        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point: (x_data, y_target, class_index)
        """
        row = self._target_df.iloc[index]
        
        from_vector, to_vector = \
            self._vectorizer.vectorize(row.top_10_words, self._max_seq_length)
        
        category_index = \
            self._vectorizer.category_vocab.lookup_token(row.category)

        return {'x_data': from_vector, 
                'y_target': to_vector, 
                'class_index': category_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [None]:
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim

class JobDescriptionGenerationModel(nn.Module):
    def __init__(self, top_10_words_embedding_size, top_10_words_vocab_size, num_categories,
                 rnn_hidden_size, batch_first=True, padding_idx=0, dropout_p=0.5):
        """
        Args:
            char_embedding_size (int): The size of the character embeddings
            char_vocab_size (int): The number of characters to embed
            num_nationalities (int): The size of the prediction vector 
            rnn_hidden_size (int): The size of the RNN's hidden state
            batch_first (bool): Informs whether the input tensors will 
                have batch or the sequence on the 0th dimension
            padding_idx (int): The index for the tensor padding; 
                see torch.nn.Embedding
            dropout_p (float): the probability of zeroing activations using
                the dropout method.  higher means more likely to zero.
        """
        super(JobDescriptionGenerationModel, self).__init__()
        
        self.top_10_words_emb = nn.Embedding(num_embeddings=top_10_words_vocab_size,
                                     embedding_dim=top_10_words_embedding_size,
                                     padding_idx=padding_idx)

        self.category_emb = nn.Embedding(num_embeddings= num_categories,
                                       embedding_dim=rnn_hidden_size)

        self.rnn = nn.GRU(input_size=top_10_words_embedding_size, 
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)
        
        self.fc = nn.Linear(in_features=rnn_hidden_size, 
                            out_features=top_10_words_vocab_size)
        
        self._dropout_p = dropout_p

    def forward(self, x_in, category_index, apply_softmax=False):
        """The forward pass of the model
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, max_seq_size)
            nationality_index (torch.Tensor): The index of the nationality for each data point
                Used to initialize the hidden state of the RNN
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, char_vocab_size)
        """
        x_embedded = self.top_10_words_emb(x_in)
        
        # hidden_size: (num_layers * num_directions, batch_size, rnn_hidden_size)
        # L,N,F
        category_embedded = self.category_emb(category_index).unsqueeze(0)

        y_out, _ = self.rnn(x_embedded, category_embedded)

        # N,L,F
        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc(F.dropout(y_out, p=self._dropout_p))
                         
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
            
        return y_out

In [None]:
def sample_from_model(model, vectorizer, categories, sample_size=20, 
                      temperature=1.0):
    """Sample a sequence of indices from the model
    
    Args:
        model (JobDescriptionGenerationModel): the trained model
        vectorizer (SurnameVectorizer): the corresponding vectorizer
        nationalities (list): a list of integers representing nationalities
        sample_size (int): the max length of the samples
        temperature (float): accentuates or flattens 
            the distribution. 
            0.0 < temperature < 1.0 will make it peakier. 
            temperature > 1.0 will make it more uniform
    Returns:
        indices (torch.Tensor): the matrix of indices; 
        shape = (num_samples, sample_size)
    """
    num_samples = len(categories)
    begin_seq_index = [vectorizer.top_10_words_vocab.begin_seq_index 
                       for _ in range(num_samples)]
    begin_seq_index = torch.tensor(begin_seq_index, 
                                   dtype=torch.int64).unsqueeze(dim=1)
    indices = [begin_seq_index]
    category_indices = torch.tensor(categories, dtype=torch.int64).unsqueeze(dim=0)
    h_t = model.category_emb(category_indices)
    
    for time_step in range(sample_size):
        x_t = indices[time_step]
        x_emb_t = model.top_10_words_emb(x_t)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices

In [None]:
def decode_samples(sampled_indices, vectorizer):
    """Transform indices into the string form of a surname
    
    Args:
        sampled_indices (torch.Tensor): the inidces from `sample_from_model`
        vectorizer (SurnameVectorizer): the corresponding vectorizer
    """
    decoded_top_10_words = []
    vocab = vectorizer.top_10_words_vocab
    
    for sample_index in range(sampled_indices.shape[0]):
        top_10_words = ""
        for time_step in range(sampled_indices.shape[1]):
            sample_item = sampled_indices[sample_index, time_step].item()
            if sample_item == vocab.begin_seq_index:
                continue
            elif sample_item == vocab.end_seq_index:
                break
            else:
                top_10_words += vocab.lookup_index(sample_item)
        decoded_top_10_words.append(top_10_words)
    return decoded_top_10_words

In [None]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.
    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better
    
    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
         
        # If loss worsened
        if loss_t >= loss_tm1:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def normalize_sizes(y_pred, y_true):
    """Normalize tensor sizes
    
    Args:
        y_pred (torch.Tensor): the output of the model
            If a 3-dimensional tensor, reshapes to a matrix
        y_true (torch.Tensor): the target predictions
            If a matrix, reshapes to be a vector
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

In [None]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.
    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better
    
    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
         
        # If loss worsened
        if loss_t >= loss_tm1:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def normalize_sizes(y_pred, y_true):
    """Normalize tensor sizes
    
    Args:
        y_pred (torch.Tensor): the output of the model
            If a 3-dimensional tensor, reshapes to a matrix
        y_true (torch.Tensor): the target predictions
            If a matrix, reshapes to be a vector
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

### Pre-trained GloVe embedding For GRU

In [None]:
# Setting and some prep work
args = Namespace(
    # Data and path information
    task2RNN_csv="ass02_task02.csv",
    vectorizer_file="vectorizer1.json",
    model_state_file="model1.pth",
    save_dir="model_storage/GRU/glove",
    # Model hyper parameter
    glove_filepath='glove.6B.100d.txt',
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    char_embedding_size=100,
    rnn_hidden_size=32,
    # Training hyper parameter
    num_epochs=100,
    learning_rate=1e-3,
    batch_size=64,
    seed=1337,
    early_stopping_criteria=5,
    # Runtime hyper parameter
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))


if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

In [None]:
# Initializations
args.use_glove = True
# args.use_domain_embeddings = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = GRUDataset.load_dataset_and_load_vectorizer(args.task2RNN_csv, 
                                                              args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = GRUDataset.load_dataset_and_make_vectorizer(args.task2RNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use pretrained GloVe embeddings
words = vectorizer.top_10_words_vocab._token_to_idx.keys()
embeddings = make_glove_embedding_matrix(glove_filepath=args.glove_filepath, 
                                    words=words)
print("Using pre-trained glove embeddings")

model = JobDescriptionGenerationModel(top_10_words_embedding_size=args.char_embedding_size,
                               top_10_words_vocab_size=len(vectorizer.top_10_words_vocab),
                               num_categories=len(vectorizer.category_vocab),
                               rnn_hidden_size=args.rnn_hidden_size,
                               padding_idx=vectorizer.top_10_words_vocab.mask_index,
                               dropout_p=0.5)

#### Training Loop

In [None]:
mask_index = vectorizer.top_10_words_vocab.mask_index

model = model.to(args.device)


optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)
train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        model.train()
        
        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = model(x_in=batch_dict['x_data'], 
                           category_index=batch_dict['class_index'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)


            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the  running loss and running accuracy
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        model.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = model(x_in=batch_dict['x_data'], 
                           category_index=batch_dict['class_index'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

            # compute the  running loss and running accuracy
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            
            # Update bar
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=model, 
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break
            
        # move model to cpu for sampling
        
        categories = np.random.choice(np.arange(len(vectorizer.category_vocab)), replace=True, size=2)
        model = model.cpu()
        sampled_top_10_words = decode_samples(
            sample_from_model(model, vectorizer, categories=categories), 
            vectorizer)
        
        sample1 = "{}->{}".format(vectorizer.category_vocab.lookup_index(categories[0]), 
                                  sampled_top_10_words[0])
        sample2 = "{}->{}".format(vectorizer.category_vocab.lookup_index(categories[1]), 
                                  sampled_top_10_words[1])
        epoch_bar.set_postfix(sample1=sample1, 
                              sample2=sample2)
        # move model back to whichever device it should be on
        model = model.to(args.device)
        
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
        
except KeyboardInterrupt:
    print("Exiting loop")

In [None]:
# compute the loss & accuracy on the test set using the best available model

model.load_state_dict(torch.load(train_state['model_filename']))

model = model.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_acc = 0.
model.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = model(x_in=batch_dict['x_data'], 
                   category_index=batch_dict['class_index'])

    # compute the loss
    loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)
    
    # compute the running loss and running accuracy
    running_loss += (loss.item() - running_loss) / (batch_index + 1)
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss 
train_state['test_acc'] = running_acc 

In [None]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

In [None]:
# Inference
def predict_category(top_10_words, model, vectorizer):
    vectorized_top_10_words, vec_length = vectorizer.vectorize(top_10_words)
    vectorized_top_10_words = torch.tensor(vectorized_top_10_words).unsqueeze(dim=0)
    vec_length = torch.tensor([vec_length], dtype=torch.int64)
    
    result = model(vectorized_top_10_words, vec_length, apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    
    index = indices.item()
    prob_value = probability_values.item()

    predicted_category = vectorizer.category_vocab.lookup_index(index)

    return {'category': predicted_category, 'probability': prob_value, 'top_10_words': top_10_words}

ex1 = split_task2.iloc[100,]
ex2 = split_task2.iloc[500,]
ex3 = split_task2.iloc[1000,]
for top_10_words in [ex1, ex2, ex3]:
    print('Predicted label is:', predict_category(top_10_words.top_10_words, model, vectorizer)['category'])
    print('True label is:', top_10_words.category)
    print('Probability is:', predict_category(top_10_words.top_10_words, model, vectorizer)['probability'], '\n')

### Pre-trained Domain Specific Embedding For GRU

In [None]:
# Setting and some prep work
args = Namespace(
    # Data and path information
    task2RNN_csv="ass02_task02.csv",
    vectorizer_file="vectorizer2.json",
    model_state_file="model2.pth",
    save_dir="model_storage/GRU/domian",
    # Model hyper parameter
    glove_filepath='glove.6B.100d.txt',
    domain_filepath='domain_embeddings.txt',
    use_glove=False,
    use_domain_embeddings=False,
    char_embedding_size=100,
    rnn_hidden_size=32,
    # Training hyper parameter
    num_epochs=100,
    learning_rate=1e-3,
    batch_size=64,
    seed=1337,
    early_stopping_criteria=5,
    # Runtime hyper parameter
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))


if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

In [None]:
# Initializations
args.use_domain_embeddings = True
if args.reload_from_files:
    # training from a checkpoint
    dataset = GRUDataset.load_dataset_and_load_vectorizer(args.task2RNN_csv, 
                                                              args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = GRUDataset.load_dataset_and_make_vectorizer(args.task2RNN_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use pretrained domain specific embeddings
words = vectorizer.top_10_words_vocab._token_to_idx.keys()
embeddings = make_domain_embedding_matrix(domain_filepath=args.domain_filepath, 
                                    words=words)
print("Using pre-trained domain specific embeddings")

model = JobDescriptionGenerationModel(top_10_words_embedding_size=args.char_embedding_size,
                               top_10_words_vocab_size=len(vectorizer.top_10_words_vocab),
                               num_categories=len(vectorizer.category_vocab),
                               rnn_hidden_size=args.rnn_hidden_size,
                               padding_idx=vectorizer.top_10_words_vocab.mask_index,
                               dropout_p=0.5)

#### Training Loop

In [None]:
mask_index = vectorizer.top_10_words_vocab.mask_index

model = model.to(args.device)


optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)
train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        model.train()
        
        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = model(x_in=batch_dict['x_data'], 
                           category_index=batch_dict['class_index'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)


            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the  running loss and running accuracy
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        model.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = model(x_in=batch_dict['x_data'], 
                           category_index=batch_dict['class_index'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

            # compute the  running loss and running accuracy
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            
            # Update bar
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=model, 
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break
            
        # move model to cpu for sampling
        
        categories = np.random.choice(np.arange(len(vectorizer.category_vocab)), replace=True, size=2)
        model = model.cpu()
        sampled_top_10_words = decode_samples(
            sample_from_model(model, vectorizer, categories=categories), 
            vectorizer)
        
        sample1 = "{}->{}".format(vectorizer.category_vocab.lookup_index(categories[0]), 
                                  sampled_top_10_words[0])
        sample2 = "{}->{}".format(vectorizer.category_vocab.lookup_index(categories[1]), 
                                  sampled_top_10_words[1])
        epoch_bar.set_postfix(sample1=sample1, 
                              sample2=sample2)
        # move model back to whichever device it should be on
        model = model.to(args.device)
        
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
        
except KeyboardInterrupt:
    print("Exiting loop")

In [None]:
# compute the loss & accuracy on the test set using the best available model

model.load_state_dict(torch.load(train_state['model_filename']))

model = model.to(args.device)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_acc = 0.
model.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = model(x_in=batch_dict['x_data'], 
                   category_index=batch_dict['class_index'])

    # compute the loss
    loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)
    
    # compute the running loss and running accuracy
    running_loss += (loss.item() - running_loss) / (batch_index + 1)
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss 
train_state['test_acc'] = running_acc 

In [None]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

In [None]:
# Inference
def predict_category(top_10_words, model, vectorizer):
    vectorized_nar, vec_length = vectorizer.vectorize(top_10_words)
    vectorized_nar = torch.tensor(vectorized_nar).unsqueeze(dim=0)
    vec_length = torch.tensor([vec_length], dtype=torch.int64)
    
    result = model(vectorized_nar, vec_length, apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    
    index = indices.item()
    prob_value = probability_values.item()

    predicted_category = vectorizer.category_vocab.lookup_index(index)

    return {'category': predicted_category, 'probability': prob_value, 'top_10_words': top_10_words}

ex1 = split_task2.iloc[100,]
ex2 = split_task2.iloc[500,]
ex3 = split_task2.iloc[1000,]
for top_10_words in [ex1, ex2, ex3]:
    print('Predicted label is:', predict_category(top_10_words.top_10_words, model, vectorizer)['category'])
    print('True label is:', top_10_words.category)
    print('Probability is:', predict_category(top_10_words.top_10_words, model, vectorizer)['probability'], '\n')