In [37]:
import os
from io import StringIO

from pdfminer.high_level import extract_text
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import re
import pymongo

In [38]:
# Directory scanned for the 'fake' resumes (processed with status 'bad').
badones = './badones'
# Directory scanned for the 'authentic' resumes (processed with status 'good').
goodones = './goodones'

In [39]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook itself.
mg_host = os.environ.get("MONGO_HOST")
mg_username = os.environ.get("MONGO_USER")
mg_password = os.environ.get("MONGO_PW")

# Fail fast on an incomplete environment: otherwise a missing variable is
# formatted into the URI below as the literal text "None".
_missing_vars = [
    var_name
    for var_name, value in (
        ("MONGO_HOST", mg_host),
        ("MONGO_USER", mg_username),
        ("MONGO_PW", mg_password),
    )
    if value is None
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# NOTE(review): the user name and password are interpolated verbatim, so any
# reserved URI characters in them must already be percent-escaped in the
# environment variables -- confirm against the deployment configuration.
client = pymongo.MongoClient(f'mongodb://{mg_username}:{mg_password}@{mg_host}')

# Collection 'cvs' of database 'badtranslator'; used by insert_mongo().
cvs = client.badtranslator.cvs
print(cvs)

Collection(Database(MongoClient(host=['10.115.22.20:27017'], document_class=dict, tz_aware=False, connect=True), 'badtranslator'), 'cvs')


## 1. Extraction, preprocessing and saving of the 'fake' resumes

### Utility functions

In [57]:
def read_pdf(fp):
    """Extract the text of a PDF file with pdfminer.

    Parameters
    ----------
    fp : str or path-like
        Path of the PDF file; it is opened in binary mode.

    Returns
    -------
    str or bool
        The text of all pages concatenated, or ``False`` when ``PDFDocument``
        cannot be built from the file. Callers must test the result with
        ``is not False`` because an empty string is a valid extraction result.
    """
    output_string = StringIO()

    with open(fp, 'rb') as in_file:
        parser = PDFParser(in_file)
        try:
            doc = PDFDocument(parser)
        except Exception:
            # Building the document fails on certain PDF files; such files are
            # reported to the caller as unreadable. ``Exception`` (rather than
            # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
            return False

        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The converter writes the text of each processed page to output_string.
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return output_string.getvalue()

# Runs of two or more space characters.
multiple_spaces = re.compile(r' {2,}')
# Two line feeds separated only by optional whitespace (i.e. blank lines).
linebreaks_regex = re.compile(r'\n\s*\n')
# Any character outside the whitelist of letters, digits, basic punctuation and
# whitespace. The upper-case range must be ``A-Z``: ``A-z`` also spans the ASCII
# characters between 'Z' and 'a' ([ \ ] ^ _ `), which would slip through.
non_standard_chars = re.compile(r'[^a-zA-Z0-9.,!?/:;@&<>\"\'\s]')
# A parenthesised annotation made of letters, dots and spaces, e.g. "(Senior Dev)".
# Despite its name it does not match e-mail addresses.
email_regex_good = re.compile(r'\([A-Za-z. ]+\)')
# An e-mail address immediately followed by ".pdf"; group 1 is the address.
email_regex_bad = re.compile(r'([\w.-]+@[\w.-]+\.\w+)\.pdf')
# Everything before ".pdf"; group 1 is the file stem. The former ``[^(pdf)]+``
# was a character class excluding the letters p, d, f and the parentheses, so
# stems containing those characters were truncated or not matched at all.
email_regex = re.compile(r'(.+)\.pdf')

def cleanup(text):
    """Apply minimal cleaning to extracted resume text.

    Characters matched by ``non_standard_chars`` are removed, runs of spaces
    are collapsed to a single space and blank lines are collapsed to a single
    line feed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Strip first: removing a character can itself leave two spaces or two
    # line feeds next to each other (e.g. "a – b" -> "a  b"), so the collapsing
    # steps must run afterwards to be effective.
    text = non_standard_chars.sub('', text)
    text = multiple_spaces.sub(' ', text)
    # !! The new lines (LF) can be simple or double. This is not apparent in
    # the Jupyter Notebook.
    text = linebreaks_regex.sub("\n", text)
    return text

def insert_mongo(cv_record):
    """Insert a batch of resume records into the ``cvs`` collection.

    Parameters
    ----------
    cv_record : list of dict
        The records to insert. (The argument was previously ignored in favour
        of the global ``list_of_cvs``; callers pass that same list, so their
        behaviour is unchanged.)

    Returns
    -------
    The value returned by ``cvs.insert_many``, or ``None`` when there is
    nothing to insert.
    """
    # insert_many() rejects an empty document list, so an empty batch is a no-op.
    if not cv_record:
        return None
    return cvs.insert_many(cv_record)
    

def append_list_cvs(document, status):
    """Parse one resume PDF and append a record for it to the global ``list_of_cvs``.

    Nothing is appended when the PDF cannot be read, when the file name does
    not match the pattern expected for ``status``, or when ``status`` is
    neither ``'bad'`` nor ``'good'``.

    Parameters
    ----------
    document : object with ``path`` and ``name`` attributes
        The file to process; callers pass entries yielded by ``os.scandir``.
    status : str
        ``'bad'``: the record is keyed by the e-mail address found in the file
        name. ``'good'``: the record is keyed by the name taken from the file
        name, after removing parenthesised annotations and ``', '`` sequences.
    """
    # Parse the PDF once (it used to be parsed twice, doubling the run time).
    raw_text = read_pdf(document.path)
    if raw_text is False:
        return

    clean_text = cleanup(raw_text)

    if status == 'bad':
        m = re.search(email_regex_bad, document.name)
        if m is not None:
            list_of_cvs.append({
                "email": m.group(1),
                "cv": clean_text,
                "status": status,
            })

    elif status == 'good':
        stem_source = email_regex_good.sub("", document.name)
        stem_source = stem_source.replace(', ', '')
        m = re.search(email_regex, stem_source)
        if m is not None:
            list_of_cvs.append({
                "name": m.group(1),
                "cv": clean_text,
                "status": status,
            })
    

In [58]:
# Start from a fresh list: append_list_cvs() appends to this global.
list_of_cvs = []

with os.scandir(badones) as it:
    for entry in it:
        # is_file must be *called*; the bare bound method is always truthy, so
        # directories were not being skipped.
        if entry.is_file():
            append_list_cvs(entry, 'bad')

result = insert_mongo(list_of_cvs)

## 2. Extraction, preprocessing and saving of the 'authentic' resumes

In [59]:
# Start from a fresh list: append_list_cvs() appends to this global.
list_of_cvs = []

with os.scandir(goodones) as it:
    for entry in it:
        # is_file must be *called*; the bare bound method is always truthy, so
        # directories were not being skipped.
        if entry.is_file():
            append_list_cvs(entry, 'good')

result = insert_mongo(list_of_cvs)