# Preprocessing Job Text

* pandas
* re
* numpy

## Importing libraries 

In [1]:
# Code to import libraries as you need in this assessment, e.g.,
import pandas as pd
import re
import numpy as np
import os
import nltk.data
from nltk.tokenize import sent_tokenize, word_tokenize
from __future__ import division
from itertools import chain
from nltk.probability import *
from collections import defaultdict
import pickle

### 1.1 Examining and loading data
- Examine the data folder, including the categories and job advertisement txt documents, etc. Explain your findings here, e.g., number of folders and format of txt files, etc.
- Load the data into proper data structures and get it ready for processing.
- Extract webIndex and description into proper data structures.


In [2]:
# Load every file under data/ in one call and look at what the returned object exposes.
from sklearn.datasets import load_files # for loading multiple files
text = load_files(r"data/") # load all files in the directory named data
text.keys() # inspect the keys of the dataset (the cell output lists data, filenames, target_names, target, DESCR)

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
text.target_names # category names of the loaded dataset (four job categories in the cell output)

['Accounting_Finance', 'Engineering', 'Healthcare_Nursing', 'Sales']

### 1.2 Pre-processing data
Perform the required text pre-processing steps.

# Creating a corpus / one document 
- Get directory paths
- Create lists of id and texts
- Load in as one file

In [4]:
# One directory per job category; each is scanned for advertisement .txt files.
dir_path = ["./data/Accounting_Finance/", "./data/Engineering/", "./data/Healthcare_Nursing/", "./data/Sales/"]
article_ids = []  # file-name stems (e.g. "Job_00239"), parallel to article_txts
article_txts = []  # raw text of each advertisement, parallel to article_ids
for category_dir in dir_path:
    # Sorted so that articles are loaded in ascending file-name order within each category.
    for filename in sorted(os.listdir(category_dir)):
        if not filename.endswith(".txt"):
            continue  # only the .txt files are advertisements
        # The article ID is the part of the file name before the first '.'.
        article_ids.append(filename.split(".")[0])
        path = os.path.join(category_dir, filename)  # e.g. './data/Sales/<name>.txt'
        # NOTE(review): 'unicode_escape' also interprets backslash escape sequences found in
        # the text; presumably chosen to get past bytes that are not valid UTF-8 - confirm.
        # The with-statement closes the file, so no explicit close() is needed.
        with open(path, "r", encoding="unicode_escape") as f:
            article_txts.append(f.read())

In [5]:
print("Article ID:", article_ids[7]) # display the ID of the article stored at index 7
print("Article txt:\n", article_txts[7]) # display the raw text of that same article

Article ID: Job_00239
Article txt:
 Title: Pensions Administrator
Webindex: 71852020
Company: Hillman Saunders
Description: Are you a proven Pensions Administrator looking to work for one of the leading third party administrators in the UK? Then I have an excellent opportunity for you. The primary responsibility will be to provide a full pension's administration service to clients and customers in an accurate, efficient and timely manner. This would include the processing and settling of transfers and leavers of the pension scheme, as well as dealing with all types of retirement queries, including quotations, settlements and manual calculations. To be considered for this role you NEED to have good working knowledge of the above pension's administration duties; this will preferably come from working in a third party pension administrator. You will also need to have a high level of numeracy and literacy as this may be tested at interview stage. In return you will receive a very competiti

In [6]:
# Captures everything after "Description: " up to the end of that line
# ('.' does not match a newline).
_DESCRIPTION_RE = re.compile(r"Description: (.+)")

# Token pattern from the assignment, compiled once instead of once per article.
_TOKEN_RE = re.compile(r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Za-z]\.)+        # abbreviations, e.g. U.S.A.
      #| \w*[\$£]?(?:\d+(?:,\d+)?)+(?:\.\d+)?%?\w*  # numbers, currency and percentages, e.g. $12.40, 82%
      | [A-Za-z]+(?:[-'][A-Za-z]*)?        # words with optional internal hyphens and apostrophes
    ''')


def tokenizeRawData(article):
    """Tokenize the description line of one raw job advertisement.

    The text following "Description: " (to the end of that line) is
    lower-cased and split into tokens: abbreviations such as "u.s.a." and
    alphabetic words with an optional internal hyphen or apostrophe part.
    Digits and other punctuation are dropped.

    Args:
        article: Raw advertisement text as a single string.

    Returns:
        list[str]: The lower-cased tokens in order of appearance, or an empty
        list when the article has no non-empty "Description: " line.
    """
    match = _DESCRIPTION_RE.search(article)
    if match is None:
        # No description to tokenize; return an empty token list so the result
        # stays aligned with the list of articles instead of failing on an
        # unbound variable.
        return []

    description_text = match.group(1).lower()  # convert all words to lowercase

    # findall with a pattern that has no capturing groups returns the matched
    # tokens, which is what a non-gap RegexpTokenizer does with this pattern.
    return _TOKEN_RE.findall(description_text)

In [7]:
  # Creating a list of tokenized articles
# Tokenize every raw article, keeping the same order as article_txts
tokenized_articles = list(map(tokenizeRawData, article_txts))

In [8]:
# Sanity check: display the tokens produced for the article at index 7
tokenized_articles[7]

['are',
 'you',
 'a',
 'proven',
 'pensions',
 'administrator',
 'looking',
 'to',
 'work',
 'for',
 'one',
 'of',
 'the',
 'leading',
 'third',
 'party',
 'administrators',
 'in',
 'the',
 'uk',
 'then',
 'i',
 'have',
 'an',
 'excellent',
 'opportunity',
 'for',
 'you',
 'the',
 'primary',
 'responsibility',
 'will',
 'be',
 'to',
 'provide',
 'a',
 'full',
 "pension's",
 'administration',
 'service',
 'to',
 'clients',
 'and',
 'customers',
 'in',
 'an',
 'accurate',
 'efficient',
 'and',
 'timely',
 'manner',
 'this',
 'would',
 'include',
 'the',
 'processing',
 'and',
 'settling',
 'of',
 'transfers',
 'and',
 'leavers',
 'of',
 'the',
 'pension',
 'scheme',
 'as',
 'well',
 'as',
 'dealing',
 'with',
 'all',
 'types',
 'of',
 'retirement',
 'queries',
 'including',
 'quotations',
 'settlements',
 'and',
 'manual',
 'calculations',
 'to',
 'be',
 'considered',
 'for',
 'this',
 'role',
 'you',
 'need',
 'to',
 'have',
 'good',
 'working',
 'knowledge',
 'of',
 'the',
 'above',
 "

#### Function to show term frequency throughout each preprocessing step

In [9]:
def showTerms():
    """Return the term-frequency distribution over all tokenized articles.

    Reads the module-level ``tokenized_articles`` at call time, so calling it
    again after a preprocessing step reflects the effect of that step.

    Returns:
        FreqDist: occurrence count of every token across all articles.
    """
    # FreqDist consumes the chained iterator directly, so neither an
    # intermediate word list nor a separate vocabulary set is needed.
    return FreqDist(chain.from_iterable(tokenized_articles))


term_fd = showTerms()
term_fd.most_common(20)


[('and', 8309),
 ('the', 6487),
 ('to', 6265),
 ('a', 4699),
 ('of', 4630),
 ('in', 3290),
 ('for', 2832),
 ('with', 2306),
 ('will', 2021),
 ('you', 2011),
 ('be', 1869),
 ('is', 1793),
 ('as', 1425),
 ('this', 1393),
 ('an', 1361),
 ('are', 1332),
 ('experience', 1276),
 ('on', 1216),
 ('have', 1114),
 ('or', 1088)]

## Removing words with < 2 characters
- #4 on Assignment

In [10]:
# Flat list of every token across all articles, captured before the
# short-word filter below is applied.
# NOTE(review): not referenced again in the visible part of the notebook.
all_words = list(chain.from_iterable(tokenized_articles))

def removeLessThanTwoWords(article):
    """Return the tokens of ``article`` that are at least two characters long."""
    kept_tokens = []
    for token in article:
        if len(token) < 2:
            continue  # single-character (or empty) tokens are dropped
        kept_tokens.append(token)
    return kept_tokens

# Apply the short-word filter to every article, then recompute the term
# frequencies to see the effect of this step.
tokenized_articles = [removeLessThanTwoWords(article) for article in tokenized_articles]
term_fd = showTerms()
term_fd.most_common(20)

[('and', 8309),
 ('the', 6487),
 ('to', 6265),
 ('of', 4630),
 ('in', 3290),
 ('for', 2832),
 ('with', 2306),
 ('will', 2021),
 ('you', 2011),
 ('be', 1869),
 ('is', 1793),
 ('as', 1425),
 ('this', 1393),
 ('an', 1361),
 ('are', 1332),
 ('experience', 1276),
 ('on', 1216),
 ('have', 1114),
 ('or', 1088),
 ('sales', 1030)]

# Removing words from stopwords_en.txt
- #5 on Assignment

In [11]:
# Read the stop-word file; splitlines() yields one list entry per line.
with open('./stopwords_en.txt') as f:
    stopwords = f.read().splitlines()

# Membership is tested once per token, so use a set (O(1) lookup) rather than
# scanning the whole list each time. `stopwords` itself stays a list.
stopword_set = set(stopwords)

# filter out stop words
tokenized_articles = [[w for w in article if w not in stopword_set] for article in tokenized_articles]
term_fd = showTerms()
term_fd.most_common(20)

[('experience', 1276),
 ('sales', 1030),
 ('role', 946),
 ('work', 861),
 ('business', 832),
 ('team', 789),
 ('working', 719),
 ('job', 688),
 ('care', 675),
 ('skills', 669),
 ('company', 614),
 ('client', 594),
 ('management', 572),
 ('manager', 517),
 ('support', 501),
 ('uk', 496),
 ('service', 480),
 ('excellent', 455),
 ('development', 430),
 ('required', 399)]

# Removing Words that appear once in tokenized_articles
- #6 on assignment

In [12]:
# Count how often each token occurs across the whole corpus
term_freq = FreqDist(chain.from_iterable(tokenized_articles))

# Tokens whose corpus-wide count is exactly one
less_freq_words_term = {token for token, count in term_freq.items() if count == 1}

# Drop those tokens from every article
tokenized_articles = [
    [token for token in article if token not in less_freq_words_term]
    for article in tokenized_articles
]
term_fd = showTerms()
term_fd.most_common(20)


[('experience', 1276),
 ('sales', 1030),
 ('role', 946),
 ('work', 861),
 ('business', 832),
 ('team', 789),
 ('working', 719),
 ('job', 688),
 ('care', 675),
 ('skills', 669),
 ('company', 614),
 ('client', 594),
 ('management', 572),
 ('manager', 517),
 ('support', 501),
 ('uk', 496),
 ('service', 480),
 ('excellent', 455),
 ('development', 430),
 ('required', 399)]

# Removing top 50 Document Frequency Words
- #7 on Assignment

In [13]:
# Document frequency: the number of articles in which each word occurs.
doc_freq = defaultdict(int)

for article in tokenized_articles:
    # Count each word at most once per article
    for word in set(article):
        doc_freq[word] += 1

# Top 50 words by descending document frequency. Ties are broken
# alphabetically: the dict's insertion order comes from iterating sets of
# strings, which can differ between interpreter runs, so relying on it would
# make the words chosen at the cut-off non-reproducible.
top_50_doc_freq_words = sorted(doc_freq, key=lambda word: (-doc_freq[word], word))[:50]

# Remove these words from each article (set for O(1) membership tests)
top_50_lookup = set(top_50_doc_freq_words)
tokenized_articles = [[word for word in article if word not in top_50_lookup] for article in tokenized_articles]
term_fd = showTerms()
term_fd.most_common(20)


[('care', 675),
 ('design', 337),
 ('engineering', 336),
 ('customer', 335),
 ('home', 291),
 ('ensure', 290),
 ('engineer', 285),
 ('financial', 279),
 ('staff', 271),
 ('systems', 267),
 ('time', 254),
 ('quality', 250),
 ('key', 244),
 ('requirements', 239),
 ('opportunities', 238),
 ('project', 236),
 ('environment', 235),
 ('career', 235),
 ('candidates', 233),
 ('nursing', 228)]

## Saving required outputs
Save the vocabulary, bigrams and job advertisement txt as per specification.
- vocab.txt

In [14]:
# Vocabulary terms in alphabetical order
lterm_fd = sorted(term_fd)
# Same counts as term_fd, but keyed in alphabetical order
sorted_term_fd = {term: term_fd[term] for term in lterm_fd}
with open("vocab.txt", "w") as f:
    # One "term:index" line per vocabulary entry; the index is the term's
    # position in the alphabetical ordering, starting at 0.
    f.writelines(f"{term}:{index}\n" for index, term in enumerate(sorted_term_fd))

#### Preprocessed Descriptions and the Original Text File

In [15]:
# Serialize the preprocessed token lists with pickle (binary format, not CSV)
with open('tokenized_articles.pkl', 'wb') as f:
    pickle.dump(tokenized_articles, f)   

# Also pickle `text`, the object returned by load_files earlier in the notebook
with open('jobsBlob.pkl', 'wb') as f:
    pickle.dump(text, f)