# In [17]:
from pdfminer.high_level import extract_text
import cohere
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher
import re
from IPython.display import display, Markdown

# Extracting text from pdf as is
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path`` as produced by pdfminer.

    No post-processing is applied here; the value is whatever
    ``pdfminer.high_level.extract_text`` returns for the given path.
    """
    raw_text = extract_text(pdf_path)
    return raw_text

# Cleaning up the text for better processing in the later stages
def clean_text(text):
    """Normalise raw resume text into one compact line.

    Steps, in order:
      1. collapse every run of whitespace (newlines included) to one space,
      2. drop "Page X of Y" markers,
      3. replace bullet characters with a space,
      4. remove the hyphen from numbers written as ``+46-`` plus nine digits,
      5. collapse whitespace again and trim both ends.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text, containing no newlines and no consecutive spaces.
    """
    # Collapse first so a page marker split across lines still matches below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+ of \d+', '', text)
    text = text.replace('•', ' ')
    # Only the exact "+46-" + nine digits form is rewritten; other phone
    # layouts (spaces, other country codes) are left untouched.
    text = re.sub(r'\+46-\d{9}', lambda m: m.group().replace('-', ''), text)
    # Removing markers and bullets leaves gaps behind; collapse them so the
    # result never contains runs of spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Main part of the script - taking out important data.
# Load the `en_core_web_lg` spaCy pipeline once at module level; it is reused by
# extract_info() and to build the skill phrase patterns below.
nlp = spacy.load("en_core_web_lg")
# E-mail address: local part, "@", domain labels, then a dot and a
# top-level domain of at least two letters.
EMAIL_PATTERN = (
    r'[a-zA-Z0-9._%+-]+'
    r'@'
    r'[a-zA-Z0-9.-]+'
    r'\.[a-zA-Z]{2,}'
)
# Phone number: every group may be preceded by one separator ("-", "." or
# whitespace). The pattern is loose enough to also match other digit runs.
PHONE_PATTERN = (
    r'(?:(?:\+|00)(?:\d{1,3}))?'   # optional country code introduced by "+" or "00"
    r'[-.\s]?'
    r'(?:\(?\d{1,4}\)?)?'          # optional area code, optionally parenthesised
    r'[-.\s]?\d{1,4}'
    r'[-.\s]?\d{1,4}'
    r'[-.\s]?\d{1,9}'
)

# Job keywords with their relevance score, written as (keyword, score) pairs.
# zip() transposes the pairs into the two parallel columns, which are then
# keyed by column name; the DataFrame is built from that dict.
skills_list = dict(zip(
    ('Keyword/Skill', 'Relevance Score'),
    map(list, zip(
        ('Test Engineer', 10),
        ('System Crash Testing', 8),
        ('Safety Systems', 7),
        ('Testing', 7),
        ('Analysis', 6),
        ('Data Collection', 5),
        ('Signal Analysis', 5),
        ('Measurement Techniques', 5),
        ('Rig Development', 4),
        ('PC Skills', 4),
        ('Collaboration', 4),
        ('Problem-Solving', 4),
        ('Creativity', 3),
        ('Change Management', 3),
        ('Swedish', 3),
        ('English', 3),
        ("Driver's License", 2),
        ('Structured', 2),
        ('Analytical', 2),
        ('Flexibility', 2),
        ('Multi-Tasking', 2),
    )),
))
skills_df = pd.DataFrame(data=skills_list)

# Build one phrase pattern per skill keyword. Iterating the dict itself would
# yield only its two column names, so the keyword column is indexed explicitly.
skills_patterns = [nlp(skill) for skill in skills_list['Keyword/Skill']]
matcher = PhraseMatcher(nlp.vocab)
# NOTE(review): the `None` second argument plus star-unpacked patterns looks
# like the older `add(key, on_match, *docs)` call form - confirm the installed
# spaCy version still accepts it.
matcher.add("Skills", None, *skills_patterns)

def extract_info(text):
    """Pull a candidate name, e-mail addresses and a phone number out of text.

    Args:
        text: Resume text to analyse.

    Returns:
        A dict with three keys:
          "names":  list holding the text of the first PERSON entity spaCy
                    reports, or an empty list when there is none;
          "emails": list of every EMAIL_PATTERN match in ``text``;
          "phones": the longest PHONE_PATTERN match as a string, or "" when
                    nothing in ``text`` matches.
    """
    # Process text with spaCy
    doc = nlp(text)

    # Name: only the first PERSON entity is kept, so stop at the first hit.
    first_person = next(
        (ent.text for ent in doc.ents if ent.label_ == "PERSON"),
        None,
    )
    names = [] if first_person is None else [first_person]

    # Phone number: PHONE_PATTERN also picks up dates from later sections, so
    # the longest candidate is taken as the phone number (a heuristic).
    # default="" keeps max() from raising ValueError when nothing matches.
    phone = max(re.findall(PHONE_PATTERN, text), key=len, default="")

    return {
        "names": names,
        "emails": re.findall(EMAIL_PATTERN, text),
        "phones": phone,
    }

def extract_skill(text, reqs, api_key=None):
    """Ask the Cohere chat endpoint to score the resume against job keywords.

    The reply is rendered as Markdown via IPython's ``display``; nothing is
    returned.

    Args:
        text: Resume text inserted into the prompt.
        reqs: Job keywords and their relevance; inserted as ``str(reqs)``.
        api_key: Optional Cohere API key. When omitted, the ``CO_API_KEY``
            environment variable is used, and only if that is unset does the
            function fall back to the key embedded below.

    Returns:
        None.
    """
    import os

    # SECURITY(review): this credential is committed in source. It is kept
    # solely as a last-resort fallback so existing calls keep working; rotate
    # it and supply the key through `api_key` or CO_API_KEY instead.
    legacy_key = "YOUImGXa6Wg0XgfsFdM0c2iIFOX3fM8UDMKzdGYZ"
    resolved_key = api_key or os.environ.get("CO_API_KEY") or legacy_key

    co = cohere.Client(resolved_key)
    prompt = (
        "Following is my resume:\n"
        + text
        + "\n<end of resume>\n"
        "I am applying for a job which has the keywords and their respective "
        "percieved relevance in this dataframe:\n"
        + str(reqs)
        + "\nestimate how likely a match I am (give a score out of 100) to get "
        "the job taking into account my skills and projects. "
        "Do not sugarcoat the score \n"
        " Note: Do not add any work experience, roles, or responsibilities that "
        "I have not held. Recommend changes to my resume, and highlight those "
        "changes in italics, use markdown for formatting."
    )
    response = co.chat(message=prompt)
    display(Markdown(response.text))
    return None

# Run the pipeline on one resume: extract the text, clean it, pull out the
# contact details, then request a match assessment.
filename = "AbhishekKolekar_Resume.pdf"
textFromResume = clean_text(extract_text_from_pdf(filename))
infoFromResume = extract_info(textFromResume)
# The prompt built by extract_skill describes the requirements as a dataframe,
# so the DataFrame is passed rather than the raw dict it was built from.
# extract_skill displays its result and returns None, so skills_match is None.
skills_match = extract_skill(textFromResume, skills_df)