# Extra Functionalities:

In [4]:
# Generate word2vec file from glove files:(Run only once, once the word2vec file is generated,we can use that directly)
import numpy as np
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import os

glove_input_file = 'Dataset/glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'

if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# Load word2vec embedded file:
word2vec_output_file = 'Dataset/glove.6B.100d.txt.word2vec'
embed = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
# PDF to txt: (Resumes)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convertPDFToText(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string

# Complete Approach:

In [4]:
# Cleaning the resume:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	return tokens

# load the document
filename = 'Dataset/resumes/tech/train_resume1.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pratikcr7/nltk_data...


['Nazli', 'Uzgur', 'Education', 'May', 'Bachelor', 'Science', 'Computer', 'Science', 'Carnegie', 'Mellon', 'University', 'Pittsburgh', 'Relevant', 'Coursework', 'Great', 'Theoretical', 'Ideas', 'Computer', 'Science', 'Principles', 'Functional', 'Programming', 'Introduction', 'Computer', 'Systems', 'Fundamentals', 'Programming', 'Computer', 'Science', 'Principles', 'Imperative', 'Computation', 'Concepts', 'Mathematics', 'May', 'Game', 'Design', 'Minor', 'Carnegie', 'Mellon', 'University', 'Pittsburgh', 'Skills', 'Programming', 'Languages', 'Limited', 'Proficieny', 'Python', 'Java', 'SML', 'Spoken', 'Languages', 'Fluent', 'Turkish', 'English', 'Conversational', 'Proficiency', 'Japanese', 'Experience', 'NLP', 'Been', 'working', 'Natural', 'Language', 'Processing', 'program', 'January', 'Mecidiyekoy', 'EnglishTurkish', 'translator', 'company', 'June', 'Karate', 'Cofounder', 'president', 'team', 'Oct', 'Anime', 'Manga', 'President', 'club', 'Oct', 'PR', 'Public', 'relations', 'tour', 'guide

[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Build Vocabulary from the training and testing data:

from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from gensim.models import Word2Vec
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	return tokens

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	# load doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# update counts
	vocab.update(tokens)

# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip any reviews in the test set
		if is_trian and filename.startswith('test'):
			continue
		if not is_trian and not filename.startswith('test'):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# add doc to vocab
		add_doc_to_vocab(path, vocab)

# define vocab
vocab = Counter()
# add all docs to vocab
process_docs('Dataset/resumes/tech/', vocab, True)
process_docs('Dataset/resumes/nontech/', vocab, True)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

# keep tokens with a min occurrence
min_occurance = 1
tokens = [k for k,c in vocab.items() if c >= min_occurance]
print(len(tokens))

# save list to file
def save_list(lines, filename):
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# open file
	file = open(filename, 'w')
	# write text
	file.write(data)
	# close file
	file.close()

# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

311
[('FL', 6), ('Computer', 5), ('Ocala', 5), ('Science', 4), ('years', 4), ('children', 4), ('Team', 4), ('May', 3), ('Programming', 3), ('Languages', 3), ('Japanese', 3), ('team', 3), ('Oct', 3), ('President', 3), ('College', 3), ('students', 3), ('Vanguard', 3), ('High', 3), ('School', 3), ('ages', 3), ('families', 3), ('holiday', 3), ('Carnegie', 2), ('Mellon', 2), ('University', 2), ('Pittsburgh', 2), ('Principles', 2), ('Skills', 2), ('Fluent', 2), ('Turkish', 2), ('translator', 2), ('June', 2), ('Skiing', 2), ('introduced', 2), ('new', 2), ('Smith', 2), ('Northampton', 2), ('MA', 2), ('National', 2), ('Honor', 2), ('Spanish', 2), ('EXPERIENCE', 2), ('Softball', 2), ('Summers', 2), ('sessions', 2), ('including', 2), ('Designed', 2), ('progress', 2), ('parents', 2), ('Anchor', 2)]
311


In [10]:
# Word2Vec training:
from gensim.models import Word2Vec

# load training data
positive_docs = process_docs('Dataset/resumes/tech/', vocab, True)
negative_docs = process_docs('Dataset/resumes/nontech/', vocab, True)
sentences = negative_docs + positive_docs
print('Total training sentences: %d' % len(sentences))
 
# train word2vec model
model = Word2Vec(sentences, size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))
 
# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

Total training sentences: 2
Vocabulary size: 51


In [13]:
# Binary CNN Classifier for Tech and NonTech Resumes:
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc, vocab):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# filter out tokens not in vocab
	tokens = [w for w in tokens if w in vocab]
	tokens = ' '.join(tokens)
	return tokens

# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	documents = list()
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip any reviews in the test set
		if is_trian and filename.startswith('test'):
			continue
		if not is_trian and not filename.startswith('test'):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# load the doc
		doc = load_doc(path)
		# clean doc
		tokens = clean_doc(doc, vocab)
		# add to list
		documents.append(tokens)
	return documents

# load embedding as a dict
def load_embedding(filename):
	# load embedding into memory, skip first line
	file = open(filename,'r')
	lines = file.readlines()[1:]
	file.close()
	# create a map of words to vectors
	embedding = dict()
	for line in lines:
		parts = line.split()
		# key is string word, value is numpy array for vector
		embedding[parts[0]] = asarray(parts[1:], dtype='float32')
	return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
	# total vocabulary size plus 0 for unknown words
	vocab_size = len(vocab) + 1
	# define weight matrix dimensions with all 0
	weight_matrix = zeros((vocab_size, 100))
	# step vocab, store vectors using the Tokenizer's integer mapping
	for word, i in vocab.items():
		weight_matrix[i] = embedding.get(word)
	return weight_matrix

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
# To remove duplicates convert the list into a set.
vocab = set(vocab)

# load all training reviews
tech_docs = process_docs('Dataset/resumes/tech/', vocab, True)
nontech_docs = process_docs('Dataset/resumes/nontech/', vocab, True)
train_docs = tech_docs + nontech_docs

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
num_samples_train_tech = 1
num_samples_train_nontech = 1
num_samples_test_tech = 1
num_samples_test_nontech = 1
ytrain = array([0 for _ in range(num_samples_train_tech)] + [1 for _ in range(num_samples_train_nontech)])

# load all test reviews
tech_docs = process_docs('Dataset/resumes/tech/', vocab, False)
nontech_docs = process_docs('Dataset/resumes/nontech/', vocab, False)
test_docs = tech_docs + nontech_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(num_samples_test_tech)] + [1 for _ in range(num_samples_test_nontech)])

# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)

# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 263, 100)          29700     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 259, 128)          64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 129, 128)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 16512)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 16513     
Total params: 110,341
Trainable params: 80,641
Non-trainable params: 29,700
_________________________________________________________________
None
Epoch 1/10
 - 0s - loss: 8.0590 - accuracy: 0.5000
Epoch 2/10
 - 0s - loss: 8.0590 - accuracy: 0.5000
Epo