In [2]:
# Import packages
import os
import re, string, unicodedata
import inflect
import pandas as pd
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
from os import listdir
from os.path import isfile, join
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import numpy as np

In [4]:
# `remove_stopwords` reads the NLTK stop-word corpus, so fetch it here too;
# otherwise a fresh kernel fails with a LookupError on the first call.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mwizasimbeye/anaconda3/envs/dlr/lib/nltk_data..
[nltk_data]     .
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Data Preprocessing
Our data is in .doc and .docx format, so we first convert the files to .txt to make preprocessing easier.
The method convert_resumes() converts all the resumes found in each subfolder of the provided folder. Note: the preprocessor uses a macOS tool called **textutil**. Linux users should use **pandoc** instead, for example: *pandoc -s example.docx -o output.txt*

In [3]:
# Define our method.
def convert_resumes(path):
    """Convert the .doc/.docx resumes under `path` to plain text.

    For every immediate subdirectory of `path`, runs
    `textutil -convert txt` on the files matching `*.doc*` in it.

    Args:
        path: Directory whose immediate subdirectories hold the resumes.

    Returns:
        The status string "Converted files to .txt". The exit status of
        `textutil` is not inspected, so the string is returned even when a
        conversion fails.
    """
    import shlex  # local import: only needed to build the shell command

    subdirs = [
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    ]
    for subdir in subdirs:
        # Quote the directory so spaces, quotes and other shell
        # metacharacters are passed literally. The glob is appended outside
        # the quotes so the shell still expands it.
        os.system('textutil -convert txt ' + shlex.quote(subdir) + '/*.doc*')
    return "Converted files to .txt"

In [4]:
# Read and preprocess the resume. 
def process_resume(filename):
    """Load a resume text file and return it as cleaned, lower-cased text.

    The text is stripped of a fixed set of stray characters, lower-cased,
    has every run of whitespace collapsed to a single space, and finally has
    any square-bracketed span (e.g. "[1]") removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        The cleaned text as a single string. Removing a bracketed span can
        leave two adjacent spaces behind.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'rt') as handle:
        text = handle.read()
    # Strip the "common hex values". These look like leftover bytes of UTF-8
    # punctuation (TODO confirm). "\x9c" was previously written as the
    # literal text "0x9c", which never matched the intended character.
    for stray in ("\xe2", "\x80", "\x93", "\x99", "\x9c"):
        text = text.replace(stray, "")
    text = text.lower()
    # One whitespace collapse is enough: the steps above never add whitespace.
    text = ' '.join(text.split())
    # Raw string avoids the invalid-escape warning the plain literal raised.
    return re.sub(r'\[[^]]*\]', '', text)

In [5]:
def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens left empty.

    Every character that is neither a word character nor whitespace is
    deleted from each token; tokens that become the empty string are
    omitted from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

In [6]:
def replace_numbers(words):
    """Spell out every digit-only token in a list of tokenized words.

    Tokens for which `str.isdigit()` is true are replaced by the result of
    inflect's `number_to_words`; all other tokens are kept unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

In [7]:
def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the tokens that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Load the stop-word list once instead of once per token, and use a set
    # so each membership test is constant time.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [8]:
def normalize(words):
    """Run the token-cleaning pipeline over a list of tokenized words.

    Applies, in order: punctuation removal, number spelling, and stop-word
    removal, and returns the resulting list.
    """
    pipeline = (remove_punctuation, replace_numbers, remove_stopwords)
    for step in pipeline:
        words = step(words)
    return words

#### Read data & create dataframe
After the data has been read, we need to create a data frame of all the features.

In [21]:
# Build one row per candidate resume: candidate name, job title, tokens.
# Each subdirectory of `path` is a job title; each .txt file inside it is a
# candidate's resume, named after the candidate.
path = 'data/'
col_names = ['candidate_name', 'job_title', 'resume']
# Text files in a job folder that are not candidate resumes.
non_resume_files = {"Success.txt", "Job Description.txt"}

# Sorted so the row order does not depend on the filesystem's listing order.
dirs = sorted(
    os.path.join(path, o)
    for o in os.listdir(path)
    if os.path.isdir(os.path.join(path, o))
)

# Collect plain records and build the frame once; growing a DataFrame row by
# row with `.loc` re-allocates on every insert.
records = []
for job_dir in dirs:
    job = os.path.basename(job_dir)
    candidates = sorted(
        f for f in listdir(job_dir)
        if isfile(join(job_dir, f))
        and f.endswith('.txt')
        and f not in non_resume_files
    )
    for resume_file in candidates:
        tokens = normalize(process_resume(join(job_dir, resume_file)).split())
        records.append({
            'candidate_name': os.path.splitext(resume_file)[0],
            'job_title': job,
            'resume': tokens,
        })

data = pd.DataFrame(records, columns=col_names)

Preview of new dataset

In [31]:
# Show a bounded preview rather than rendering the entire frame.
data.head(10)

Unnamed: 0,candidate_name,job_title,resume
0,Ardiela Dramat,administration manager,"[confidentiality, clause, information, concern..."
1,Jeanine Adant,administration manager,"[confidentiality, clause, information, concern..."
2,Joan Nicolene Esterhuizen,administration manager,"[confidentiality, clause, information, concern..."
3,Rozina Scheepers,administration manager,"[confidentiality, clause, information, concern..."
4,Samantha Jennings,administration manager,"[information, concerning, candidate, furnished..."
5,Sonja Krog,administration manager,"[confidentiality, clause, information, concern..."
6,Surelda Schlebusch,administration manager,"[position, applied, new, business, sales, cros..."
7,Gwendolene Margaret Matjan,assistant accountant,"[pdf15, one, zero, obj, typecatalogpages, two,..."
8,Hallesheen Leshae Moos,assistant accountant,"[confidentiality, clause, information, concern..."
9,Luan Benjamin,assistant accountant,"[confidentiality, clause, information, concern..."


Save dataframe

In [34]:
# Write the frame to disk. The index is written as the first, unnamed column
# (pandas' default for to_csv).
output_csv = 'resume_challenge.csv'
data.to_csv(output_csv)