# PDF Data Extraction

In [1]:
# Load nessesary libraries
import os
import pandas as pd
import PyPDF2
import re 
import numpy as np

In [11]:
# Build a pdf extractor
def extract_resume_data(file_path):
    resume_data = []
    
    if file_path.endswith('.pdf'):
        category = input('Enter Category: ')
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            text = ''
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text()
            resume_data.append({'text': text, 'category': category,'candidate_resume_name': os.path.basename(file_path)})
            
    else:
        for root, dirs, files in os.walk(file_path):
            for file in files:
                if file.endswith('.pdf'):
                    with open(os.path.join(root, file), 'rb') as f:
                        category = os.path.basename(root)
                        text = ''
                        pdf_reader = PyPDF2.PdfReader(f)
                        for page_num in range(len(pdf_reader.pages)):
                            page = pdf_reader.pages[page_num]
                            text += page.extract_text()
                        data = pd.DataFrame({"text": [text], "category": [category],'candidate_resume_name': file})
                        resume_data.append(data)
                        
    return pd.concat(resume_data, ignore_index=True)


In [12]:
# Fill the past and exract the data
file_path = r"C:\Users\pcc\Desktop\data"
resume_data = extract_resume_data(file_path)

In [13]:
resume_data.head()

Unnamed: 0,text,category,candidate_resume_name
0,ACCOUNTANT\nSummary\nFinancial Accountant spec...,ACCOUNTANT,10554236.pdf
1,STAFF ACCOUNTANT\nSummary\nHighly analytical a...,ACCOUNTANT,10674770.pdf
2,ACCOUNTANT\nProfessional Summary\nTo obtain a ...,ACCOUNTANT,11163645.pdf
3,SENIOR ACCOUNTANT\nExperience\nCompany Name\n ...,ACCOUNTANT,11759079.pdf
4,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...,ACCOUNTANT,12065211.pdf


In [14]:
# Clean the text
def clean_resume_text(text):
    text=text.str.lower().str.strip()
    text=text.str.replace(r'[^a-zA-Z0-9\s\n]',"",regex=True)
    text=text.str.replace(r'\s+',' ',regex=True)
    return text

resume_data["cleaned_text"]=clean_resume_text(resume_data["text"])


In [16]:
# Extract the Sections
keywords=['summary','education','executive profile',
          'professional profile','accomplishments',
          'personal profile','work background',
          'academic profiles', 'qualifications',
          'other activities', 'experience','interests',
          'skills','achievements','publications','publication',
          'certifications','workshops','internships','trainings',
          'overview','objective','hobbies','jobs','position of responsibility'
]

resume_list=[]

def extract_content(text,keywords):
    content={}
    indices=[]
    keys=[]
    
    for key in keywords:
        key_start=text.find(key)
        if key_start != -1:
            next_key_index=text.find(key,key_start+1)
            if next_key_index !=-1:
                content[key]=text[key_start+len(key):next_key_index].strip()
            else:
                content[key]=text[key_start+len(key):].strip()
    return content     

for i in resume_data["cleaned_text"]:
    text=i
    parse_section_content=extract_content(text,keywords)
    resume_list.append(parse_section_content)
    
section_df=pd.DataFrame(resume_list)
section_df

Unnamed: 0,summary,education,accomplishments,experience,interests,skills,certifications,workshops,qualifications,achievements,...,objective,professional profile,hobbies,publications,publication,personal profile,executive profile,other activities,trainings,internships
0,financial accountant specializing in financial...,northern maine community college 1994 associat...,served on a tiger team which identified and re...,company name july 2011 to november 2012 accoun...,american society of military comptrollers addi...,accounting general accounting accounts payable...,certified defense financial manager cdfm may 2...,to participate and contribute to accounting po...,,,...,,,,,,,,,,
1,highly analytical and detailoriented professio...,bachelor of science accounting may 2010 univer...,,staff accountant january 2014 to october 2014 ...,alpha sigma phi officer and chair positions ja...,highlights dba quick books mas sage software m...,,,,,...,,,,,,,,,,
2,to obtain a position in a fastpaced business o...,computer applications specialist certificate p...,,accountant january 2011 to november 2015 compa...,,and attributes attributes selfmotivated and ho...,,,intermediate word advanced excel powerpoint in...,,...,,,,,,,,,,
3,,emory university goizueta business school 5 20...,,company name june 2011 to current senior accou...,fulton county casa board of directors member t...,by drafting over forty memorandums that summar...,and awards fulton county casa board of directo...,,,by the highest performing junior year accounti...,...,,,,,,,,,,
4,senior accountant who completes accounting act...,bachelor of business administration accounting...,found material misstatement in prepaid propert...,in full life cycle of general ledger accountin...,world travelphotographygolfsoccermoviefashion ...,aderantcms excel quickbooks pro sql access pea...,certified public accountant new york state dat...,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,i am a highly motivated educator and selfstart...,university of south florida 2002 bachelor of s...,student development education strategies goal ...,tenured hillsborough alliance for black school...,,that have helped a plethora of teachers and st...,,centered on learning goals classroom managemen...,,,...,s for all lessonsprojects and communicated wit...,,,,,,,,company name august 2006 to june 2009 girls jv...,
2480,to be employed as an administrative assistant ...,al software and the internet supervised an ave...,administrative assistantsales representative m...,creating excel databases and powerpoint presen...,met regularly with parents and guardians to di...,will benefit the company technicallyadept job ...,skills accounting administrative basic billing...,fostered team collaboration between students t...,,,...,s for all lessons units and projects to studen...,,,,,,,,,
2481,highly ethical dependable and diligent expert ...,university of phoenix 2007 master of arts,designed effective lesson plans focused on age...,company name june 2012 to current teacher desi...,,conflict resolution course development critica...,,,,,...,s by facilitating courses using my curriculum ...,,,,,,,,,
2482,talented early education professional with div...,professional with diverse experience in planni...,,in planning and implementing various activitie...,,cpr creative thinking delivery first aid polic...,,,,,...,,,,,,,,,,


In [18]:
# Drop unnessesary features
resume_data[["education","skills"]]=section_df[["education","skills"]]
resume_data=resume_data.drop(columns=["text","cleaned_text"],axis=1)
resume_data.head()

Unnamed: 0,category,candidate_resume_name,education,skills
0,ACCOUNTANT,10554236.pdf,northern maine community college 1994 associat...,accounting general accounting accounts payable...
1,ACCOUNTANT,10674770.pdf,bachelor of science accounting may 2010 univer...,highlights dba quick books mas sage software m...
2,ACCOUNTANT,11163645.pdf,computer applications specialist certificate p...,and attributes attributes selfmotivated and ho...
3,ACCOUNTANT,11759079.pdf,emory university goizueta business school 5 20...,by drafting over forty memorandums that summar...
4,ACCOUNTANT,12065211.pdf,bachelor of business administration accounting...,aderantcms excel quickbooks pro sql access pea...


In [19]:
# create a csv of extracted data
resume_data.to_csv("C:/Users/pcc/Desktop/resume_data.csv")