In [22]:
import spacy

# Load the "en_core_web_md" spaCy pipeline once; later cells use it through
# the global name `nlp` (entity ruler setup, extract_entities, nlp.to_disk).
nlp = spacy.load("en_core_web_md")

In [23]:
# Add an "entity_ruler" component to the shared pipeline and load its patterns
# from ./data/skills.jsonl.
# NOTE(review): the "SKILL" label that extract_entities filters on presumably
# comes from these patterns - confirm against the JSONL file.
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk("./data/skills.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x70b04ecd2f10>

In [13]:
import re, fitz

def text_from_pdf(file_bytes: bytes) -> str:
    """Return the cleaned text of a PDF supplied as raw bytes.

    The bytes are opened as an in-memory PDF with ``fitz``, the text of every
    page is concatenated (no separator is inserted between pages) and the
    result is passed through ``preprocess_text``.

    Args:
        file_bytes: Content of the PDF file.

    Returns:
        The preprocessed text, or ``""`` if anything raised along the way
        (the error is printed, not re-raised).
    """
    try:
        page_texts = []
        with fitz.open(stream=file_bytes, filetype="pdf") as document:
            for page in document:
                page_texts.append(page.get_text())
        raw_text = "".join(page_texts)
        return preprocess_text(raw_text)
    except Exception as e:
        # Best-effort extraction: report the failure and fall back to "".
        print(f"[Error]: Failed to extract text: {e}")
        return ""

def preprocess_text(text: str) -> str:
    """Normalise raw extracted text into one clean, single-spaced line.

    Every run of characters other than word characters, ``@``, ``.``, ``|``,
    ``’``, ``'`` and ``-`` is replaced by a single space; whitespace is then
    collapsed and leading/trailing whitespace is stripped.

    Earlier revisions also loaded ``en_core_web_sm`` and built a
    stop-word-filtered string on every call, but the regex was then applied
    to the original ``text``, so that result was always discarded. The dead
    (and expensive) model load has been removed; the returned value is
    unchanged. Stop words are intentionally kept: the date-range matching in
    ``extract_entities`` looks for the word "to", which a stop-word filter
    would strip.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text. An empty or all-punctuation input yields ``""``.
    """
    # NOTE: inside a character class ``|`` is a literal, not alternation, so
    # pipe characters are preserved by this pattern.
    res = re.sub(r"[^\w@.|’|'|-]+", " ", text)
    res = re.sub(r"\s+", " ", res).strip()

    return res


Functional Resume Sample John W. Smith 2002 Front Range Way Fort Collins CO 80525 jwsmith@colostate.edu Career Summary Four years experience in early childhood development with a diverse background in the care of special needs children and adults. Adult Care Experience Determined work placement for 150 special needs adult clients. Maintained client databases and records. Coordinated client contact with local health care professionals on a monthly basis. Managed 25 volunteer workers. Childcare Experience Coordinated service assignments for 20 part-time counselors and 100 client families. Oversaw daily activity and outing planning for 100 clients. Assisted families of special needs clients with researching financial assistance and healthcare. Assisted teachers with managing daily classroom activities. Oversaw daily and special student activities. Employment History 1999-2002 Counseling Supervisor The Wesley Center Little Rock Arkansas. 1997-1999 Client Specialist Rainbow Special Care Cen

In [15]:
def extract_text(path):
    """Read the PDF file at ``path`` and return its cleaned text.

    The file is read in binary mode and its bytes are handed to
    ``text_from_pdf``, whose result is returned unchanged.
    """
    with open(path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    cleaned_text = text_from_pdf(raw_bytes)
    return cleaned_text

In [36]:
# Extract and clean the text of a sample resume; `text` is reused by the
# extract_entities call in a later cell.
text = extract_text("../pdfs/test3.pdf")
print(text)

1 of 2 Juan Jose Carin Data Scientist Mountain View CA 94041 650-336-4590 | juanjose.carin@gmail.com linkedin.com in juanjosecarin | juanjocarin.github.io Professional Profile Passionate about data analysis and experiments mainly focused on user behavior experience and engagement with a solid background in data science and statistics and extensive experience using data insights to drive business growth. Education 2016 University of California Berkeley Master of Information and Data Science GPA 3.93 Relevant courses Machine Learning Machine Learning at Scale Storing and Retrieving Data Field Experiments Applied Regression and Time Series Analysis Exploring and Analyzing Data Data Visualization and Communication Research Design and Applications for Data Analysis 2014 Universidad Politécnica de Madrid M.S. in Statistical and Computational Information Processing GPA 3.69 Relevant courses Data Mining Multivariate Analysis Time Series Neural Networks and Statistical Learning Regression and P

In [24]:

# NOTE(review): `entities` is not assigned anywhere above this cell; it is only
# assigned in a later cell (entities = extract_entities(text)), so this cell
# fails on a fresh Restart & Run All. The output recorded below also shows
# labels (ORG, PERSON, DATE, ...) that the extract_entities defined in this
# notebook never produces - this looks like a leftover cell; confirm and remove.
for key, value in entities.items():
    print(key, ":", value)

ORG : ['Functional Resume Sample', 'Fort Collins CO', 'Childcare Experience Coordinated', 'The Wesley Center Little Rock', 'Cowell Elementary', 'Arkansas Education University', 'Elementary Education']
PERSON : ['John W. Smith', 'Dean']
DATE : ['Four years', 'monthly', 'daily', 'daily', 'daily', '1999-2002', '1997-1999', '1996-1997', '1998']
EXPERIENCE : ['experience in early']
CARDINAL : ['150', '25', '20', '100', '100', '3.8', '3.5', '3.4']
SKILL : ['databases']
GPE : ['Arkansas', 'Arkansas', 'Conway']
FAC : ['Little Rock Little Rock AR BS']


In [37]:
def extract_entities(text):
    """Collect skills and a rough total of years of experience from resume text.

    Skills are the texts of all entities that the module-level ``nlp``
    pipeline labels ``SKILL`` (document order, duplicates kept).

    Experience is estimated by summing ``end - start`` over every four-digit
    year range written as ``YYYY-YYYY`` or ``YYYY to YYYY`` (an optional single
    space is allowed on either side of the separator). Ranges are not
    de-duplicated or checked for overlap, and any range in the text counts,
    whatever section it appears in.

    Compared with the earlier version, two false-positive sources are closed:

    * a range is only matched when it is not embedded in a longer digit run
      (previously ``80525-1234`` matched as ``0525-1234``);
    * reversed ranges (end before start) are ignored instead of being
      subtracted from the total.

    Args:
        text: Resume text, typically the output of ``extract_text``.

    Returns:
        dict with two keys: ``"SKILLS"`` (list of str) and ``"EXPERIENCE"``
        (empty list, or a one-element list like ``["5 years"]`` when the
        total is positive).
    """
    entities = {"SKILLS": [],
                "EXPERIENCE": []}

    doc = nlp(text)
    entities["SKILLS"].extend(
        ent.text for ent in doc.ents if ent.label_ == "SKILL"
    )

    # Lookarounds keep the four-digit years from matching inside longer
    # numbers such as ZIP+4 codes or phone numbers.
    year_range = r"(?<!\d)(\d{4}) ?(?:-|to) ?(\d{4})(?!\d)"
    total_experience = 0
    for match in re.finditer(year_range, text):
        start, end = int(match.group(1)), int(match.group(2))
        if end < start:
            # Not a plausible period; skip rather than subtract years.
            continue
        total_experience += end - start

    if total_experience > 0:
        entities["EXPERIENCE"].append(str(total_experience) + " years")
    return entities



In [38]:
# Run entity extraction on the resume text loaded above and print each
# category ("SKILLS", "EXPERIENCE") with the values found for it.
entities = extract_entities(text)
for key, value in entities.items():
    print(key, ":", value)

SKILLS : ['data analysis', 'data science', 'business', 'Time Series', 'Finance', 'Finance', 'mobile', 'R', 'Python', 'Tableau', 'Git', 'Storm', 'Bash', 'D3.js', 'DATA SCIENCE', 'R', 'anomaly detection', 'MySQL', 'design', 'R', 'Python', 'OpenCV', 'TensorFlow', 'monitoring', 'motion detection', 'algorithms', 'variables', 'Hadoop', 'Python', 'API', 'R', 'mobile', 'logistic regression', 'smoothing']
EXPERIENCE : ['2 years']


In [39]:
# Save the pipeline (including the entity_ruler added earlier in the notebook)
# to the ./output directory.
nlp.to_disk("./output")