# Libraries

In [171]:
import docx2txt
from PyPDF2 import PdfReader, PdfFileWriter, PdfFileMerger

import spacy
import en_core_web_sm
from spacy.matcher import Matcher

import re
import nltk

import pandas as pd

from nltk.corpus import stopwords
from spacy import displacy


import string

In [172]:
nltk.download('stopwords')
spacy.load('en_core_web_sm')
spacy.load('en_core_web_lg')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfont\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<spacy.lang.en.English at 0x1fc8a5b1e90>

# Extract Text.

In [173]:
#Extracting text from DOCX
def doctotext(m):
    temp = docx2txt.process(m)
    resume_text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    text = ' '.join(resume_text)
    return (text)
    
#Extracting text from PDF
def pdftotext(m):
    # pdf file object
    # you can find find the pdf file with complete code in below
    pdfFileObj = open(m, 'rb')

    # pdf reader object
    pdfFileReader = PdfReader(pdfFileObj)

    # number of pages in pdf
    num_pages = len(pdfFileReader.pages)

    currentPageNumber = 0
    text = ''

    # Loop in all the pdf pages.
    while(currentPageNumber < num_pages ):

        # Get the specified pdf page object.
        pdfPage = pdfFileReader.pages[currentPageNumber]

        # Get pdf page text.
        text = text + pdfPage.extract_text()

        # Process next page.
        currentPageNumber += 1

    return (text)

# Document.

In [174]:
text = pdftotext("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")

# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# nlp_big = spacy.load('en_core_web_lg')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

# displacy.render(nlp(text),style="ent",jupyter=True)

# Name

In [175]:
def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    
    matcher.add('NAME', [pattern])
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text
print('Name: ',extract_name(pdftotext("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")))

Name:  Sickit Learn


# Qualification.

In [176]:
nlp_text = nlp(text)

# Sentence Tokenizer
nlp_text = [sent.text.strip() for sent in nlp_text.sents]

In [178]:
# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S', 
            'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S', 
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 
            'SSLC', 'SSC' 'HSC', 'CBSE', 'ICSE', 'X', 'XII',
            'BACHELOR', 'MASTER', 'PHD', 'BACHELORS', 'MASTERS', 'Ph.D.',
            'Licenciatura', 'Ingeniería', 'Maestría', 'Maestria',
            'Maestra', 'Maestro', 'Doctorado', 'Doctora', 'Doctor', 'Licenciado', 'Licenciada',
            'Ingeniero', 'Ingeniera', 'Maestrante', 'Doctorante', 'L'
            'Lic', 'Ing'
        ]

# Make every word in EDUCATION lowercase.
EDUCATION = [x.lower() for x in EDUCATION]

bachelor_subjects = [
    "Computer Science",
    "Physics",
    "Chemistry",
    "Biology",
    "Mathematics",
    "Engineering",
    "Psychology",
    "English Literature",
    "History",
    "Sociology",
    "Economics",
    "Political Science",
    "Business Administration",
    "Marketing",
    "Accounting",
    "Finance",
    "Nursing",
    "Environmental Science",
    "Art",
    "Music",
    "Film Studies",
    "Philosophy",
    "Anthropology",
    "Communications",
    "Languages",
    "Geography",
    "Architecture",
    "Urban Planning",
    "Graphic Design",
    "Journalism",
    "Criminal Justice",
    "Law",
    "International Relations",
    "Sports Science",
    "Theater",
    "Dance",
    "Religious Studies",
    "Information Technology",
    "Health Sciences",
    "Social Work",
    "Public Health",
    "Nutrition",
    "Linguistics",
    "Human Resources",
    "Hospitality Management",
    "Tourism",
    "Fashion Design",
    "Interior Design"
]

# Lemmatization of every word in bachelor_subjects
bachelor_subjects = [nlp(text) for text in bachelor_subjects]

# For each sentence in bachelor_subjects keep the lemma of each word and keep them as strings.
bachelor_subjects_lemma = [' '.join([word.lemma_ for word in subject]) for subject in bachelor_subjects]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}

    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
                tex = re.sub(r'[?|$|.|!|,\']', r'', tex)
                if tex.lower() in EDUCATION and tex not in STOPWORDS:
                    edu['Sentence'] = text
                    edu['Education'] = tex

                    # Find the GPA.
                    edu['GPA'] = re.findall(r'\b\d\.\d\b', text)
                    
                    text_lemma = [nlp(word) for word in text.split()]

                    for word in text_lemma:
                        for single in word:
                            if single.lemma_ in bachelor_subjects_lemma:
                                edu['Subject'] = single.lemma_
                                
                    # Find dates using regex.
                    # If you want to extract the year from the text, change the regex to r'(\d{4})'
                    edu['Years'] = re.findall(r'(20\d{2}|19\d{2})', text)
    
    return edu
print('Qualification: ', extract_education(text))

Qualification:  {'Sentence': "Experience\nEDUCATION\n2014-2020\nBachelor's degree in Economics.", 'Education': 'Bachelors', 'GPA': [], 'Subject': 'economic', 'Years': ['2014', '2020']}


# Email

In [179]:
def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)
print('Mail id: ',extract_email_addresses(text))

Mail id:  ['c.fontana95@gmail.com']


# Phone Number

In [180]:
def extract_mobile_number(resume_text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)
    
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return number
        else:
            return number
print('Mobile Number: ',extract_mobile_number(text))

Mobile Number:  3934444659


# Skills

In [181]:
general_skills_list = [
    "Communication Skills",
    "Verbal Communication",
    "Written Communication",
    "Presentation Skills",
    "Public Speaking",
    "Interpersonal Skills",
    "Negotiation Skills",
    "Listening Skills",
    "Persuasion Skills",
    "Teamwork",
    "Collaboration",
    "Leadership",
    "Conflict Resolution",
    "Relationship Building",
    "Networking",
    "Analytical Skills",
    "Critical Thinking",
    "Problem Solving",
    "Research Skills",
    "Data Analysis",
    "Quantitative Analysis",
    "Qualitative Analysis",
    "Decision Making",
    "Attention to Detail",
    "Logical Reasoning",
    "Technical Skills",
    "Computer Literacy",
    "Programming Languages",
    "Software Proficiency",
    "Web Development",
    "Database Management",
    "Information Technology",
    "Troubleshooting",
    "Systems Administration",
    "Network Security",
    "Creativity",
    "Innovation",
    "Graphic Design",
    "Artistic Skills",
    "Photography",
    "Video Editing",
    "Content Creation",
    "Writing Skills",
    "Copywriting",
    "Proofreading and Editing",
    "Organization",
    "Time Management",
    "Project Management",
    "Planning and Coordination",
    "Multitasking",
    "Prioritization",
    "Detail Orientation",
    "Meeting Deadlines",
    "Resource Management",
    "Customer Service",
    "Client Management",
    "Relationship Management",
    "Conflict Resolution (customer-facing)",
    "Sales Skills",
    "Account Management",
    "Marketing Skills",
    "Market Research",
    "Advertising",
    "Social Media Marketing",
    "Search Engine Optimization (SEO)",
    "Language Skills",
    "Bilingualism",
    "Translation",
    "Interpretation",
    "Financial Skills",
    "Accounting",
    "Financial Analysis",
    "Budgeting",
    "Financial Planning",
    "Risk Management",
    "Teaching and Training",
    "Instructional Design",
    "Curriculum Development",
    "Tutoring",
    "Mentoring",
    "Project Coordination",
    "Event Planning",
    "Event Management",
    "Logistics",
    "Supply Chain Management",
    "Research and Development",
    "Scientific Methodology",
    "Lab Techniques",
    "Experimental Design",
    "Statistical Analysis",
    "Problem Diagnosis",
    "Troubleshooting (Technical)",
    "Maintenance and Repair",
    "Equipment Handling",
    "Mechanical Skills",
    "Health and Safety",
    "First Aid",
    "CPR",
    "Occupational Health and Safety",
    "Risk Assessment"
]


In [182]:
# Convert general skills to lowercase.
general_skills_list = [skill.lower() for skill in general_skills_list]

In [183]:
def general_skills(resume_text):
    nlp_text = nlp(resume_text)

    noun_chunks = nlp_text.noun_chunks

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # extract values
    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in general_skills_list:
            skillset.append(token)
   
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in general_skills_list:
            skillset.append(token)
    return [i.capitalize() for i in set([i.lower() for i in skillset])]
  
print ('General Skills',general_skills(text))

General Skills []


In [184]:
programming_skills_list = [
    "Programming Languages",
    "Python",
    "JavaScript",
    "Java",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "Swift",
    "Go",
    "Rust",
    "TypeScript",
    "HTML",
    "CSS",
    "SQL",
    "Shell Scripting",
    "Version Control",
    "Git",
    "SVN",
    "Continuous Integration/Continuous Deployment (CI/CD)",
    "Agile Development",
    "Test-Driven Development (TDD)",
    "Object-Oriented Programming (OOP)",
    "Functional Programming",
    "Web Development",
    "Front-end Development",
    "Back-end Development",
    "Full-Stack Development",
    "Mobile Development",
    "iOS Development",
    "Android Development",
    "Database Management",
    "Database Design",
    "Query Optimization",
    "API Development",
    "RESTful APIs",
    "Web Services",
    "Microservices",
    "Cloud Computing",
    "Amazon Web Services (AWS)",
    "Microsoft Azure",
    "Google Cloud Platform (GCP)",
    "Containerization",
    "Docker",
    "Kubernetes",
    "Server Administration",
    "Linux",
    "Windows Server",
    "Networking",
    "Security",
    "Cybersecurity",
    "Data Structures",
    "Algorithms",
    "Software Development",
    "Software Architecture",
    "Software Testing",
    "Debugging",
    "Problem Solving",
    "Code Optimization",
    "Performance Tuning",
    "Code Review",
    "Documentation",
    "Unit Testing",
    "Integration Testing",
    "System Testing",
    "Front-end Frameworks",
    "React",
    "Angular",
    "Vue.js",
    "Back-end Frameworks",
    "Django",
    "Ruby on Rails",
    "Node.js",
    "Flask",
    "ASP.NET",
    "PHP Frameworks",
    "Laravel",
    "Symfony",
    "CodeIgniter",
    "Testing Frameworks",
    "JUnit",
    "PyTest",
    "Mocha",
    "Jest",
    "Database Systems",
    "MySQL",
    "PostgreSQL",
    "Oracle",
    "MongoDB",
    "Redis",
    "Machine Learning",
    "Data Analysis",
    "Data Visualization",
    "Artificial Intelligence",
    "Natural Language Processing (NLP)",
    "Big Data",
    "Hadoop",
    "Spark",
    "Blockchain Development",
    "Internet of Things (IoT)",
    "DevOps",
    "Infrastructure as Code (IaC)",
    "Configuration Management",
    "Scripting",
    "Problem Diagnosis",
    "Technical Support",
    "API Integration",
    "Project Management",
    "Agile Methodologies",
    "Scrum",
    "Kanban",
    "Software Documentation",
    "Collaboration Tools",
    "Jira",
    "Confluence",
    "Slack",
    "Version Control Systems",
    "Git",
    "SVN",
    "Code Editors",
    "Visual Studio Code",
    "PyCharm",
    "IntelliJ IDEA",
    "Eclipse",
    "Sublime Text",
    "Atom",
    "Operating Systems",
    "Linux",
    "Windows",
    "macOS"
]


In [185]:
# Convert programming skills to lower case
programming_skills_list = [i.lower() for i in programming_skills_list]

In [186]:
def programming_skills(resume_text):
    nlp_text = nlp(resume_text)

    noun_chunks = nlp_text.noun_chunks

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]

    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in programming_skills_list:
            skillset.append(token)
   
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in programming_skills_list:
            skillset.append(token)
    return [i.capitalize() for i in set([i.lower() for i in skillset])]
  
print ('Skills',programming_skills(text))

Skills ['Agile methodologies', 'Slack', 'Mysql', 'Sql', 'Mongodb', 'Git', 'Docker', 'Postgresql', 'Jira', 'Python', 'Machine learning', 'Spark']


# Languages

In [187]:
languages_list = [
    "English",
    "Spanish",
    "French",
    "German",
    "Chinese",
    "Mandarin",
    "Arabic",
    "Hindi",
    "Portuguese",
    "Bengali",
    "Russian",
    "Japanese",
    "Lahnda",
    "Javanese",
    "Wu",
    "Telugu"]

In [188]:
# Convert languages to lower case
languages_list = [i.lower() for i in languages_list]

In [189]:
language_level = [
    "Elementary Proficiency",
    "Limited Working Proficiency",
    "Professional Working Proficiency",
    "Full Professional Proficiency",
    "Native or Bilingual Proficiency",
    "Native",
    "Advanced",
    "A1",
    "A2",
    "B1",
    "B2",
    "C1",
    "C2"
]

In [190]:
# Convert language levels to lower case
language_level = [i.lower() for i in language_level]

In [191]:
def language_skill(resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Drop every token that is equal to a special character
    tokens = [token for token in tokens if not token in string.punctuation]

    skillset = {}

    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in languages_list:
            skillset[token] = 'Level Not Specified'
            if tokens[tokens.index(token) + 1].lower() in language_level:
                skillset[token] = tokens[tokens.index(token) + 1]

    return skillset
  
print ('Languages:' ,language_skill(text))

Languages: {'Spanish': 'Native', 'English': 'Advanced', 'Portuguese': 'Level Not Specified'}


# Bonus Points



In [192]:
bonus_points_list = [
    'Projects',
    'Achievements',
    'Hobbies'
]

In [193]:
def bonus(resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Put tokens list in lower case
    tokens = [token.lower() for token in tokens]

    bonus_points = {}

    # check for one-grams (example: python)
    for bonus_piece in bonus_points_list:
        if bonus_piece.lower() in tokens:
            bonus_points[bonus_piece] = 'Yes'
        else:
            bonus_points[bonus_piece] = 'No'

    return bonus_points
  
print ('Bonus:' , bonus(text))

Bonus: {'Projects': 'Yes', 'Achievements': 'No', 'Hobbies': 'No'}


# All data

In [194]:
print('Name: ',extract_name(text))
print('Qualification: ', extract_education(text))
print('Mail id: ',extract_email_addresses(text))
print('Mobile Number: ',extract_mobile_number(text))
print ('General Skills:',general_skills(text))
print ('Programming Skills:',programming_skills(text))
print ('Languages:' ,language_skill(text))
print ('Bonus:' , bonus(text))

Name:  Sickit Learn
Qualification:  {'Sentence': "Experience\nEDUCATION\n2014-2020\nBachelor's degree in Economics.", 'Education': 'Bachelors', 'GPA': [], 'Subject': 'economic', 'Years': ['2014', '2020']}
Mail id:  ['c.fontana95@gmail.com']
Mobile Number:  3934444659
General Skills: []
Programming Skills: ['Agile methodologies', 'Slack', 'Mysql', 'Sql', 'Mongodb', 'Git', 'Docker', 'Postgresql', 'Jira', 'Python', 'Machine learning', 'Spark']
Languages: {'Spanish': 'Native', 'English': 'Advanced', 'Portuguese': 'Level Not Specified'}
Bonus: {'Projects': 'Yes', 'Achievements': 'No', 'Hobbies': 'No'}


In [195]:
el punto 2, si usas Python, y haces una Lambda en AWS que de entrada tiene un Json ó CSV
con los datos del CSV y de salida te devuelve un Json o CSV con los datos del candidato,
ya eso Ayrton puede consumirlo en la app de HR q esta haciendo con PHP Laravel

y de respuesta, es un json q diga por ej: 

Perfil: front end developer
Edad: 20 años
Tech principal: JavaScript
Framework principal: React Native
Tech secundaria: CSS
Ultima empresa donde trabajo: Amazon
Años en ultima empresa: 4.5
Idioma principal: Ingles
Nivel del idioma Principal: 8

SyntaxError: invalid non-printable character U+00A0 (14378996.py, line 3)