# Libraries

In [1]:
import docx2txt
from PyPDF2 import PdfReader, PdfFileWriter, PdfFileMerger

import spacy
import en_core_web_sm
from spacy.matcher import Matcher

import re
import nltk
import pickle

import pandas as pd

from nltk.corpus import stopwords
from spacy import displacy


import string

In [2]:
# Fetch the NLTK stop-word corpus (the log below shows it was already up to date).
nltk.download('stopwords')
# NOTE(review): neither load result is bound to a name, so these two calls only
# confirm that the models can be loaded; the pipeline that is actually used is
# loaded again (and assigned to `nlp`) in a later cell.
spacy.load('en_core_web_sm')
spacy.load('en_core_web_lg')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfont\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<spacy.lang.en.English at 0x1a23c5a16d0>

# Extract Text.

In [3]:
# Extracting text from DOCX
def doctotext(m):
    """Return the text of the DOCX file at path ``m`` as one line.

    Empty lines are dropped, tabs inside the remaining lines become single
    spaces, and the lines are joined with single spaces.
    """
    raw_text = docx2txt.process(m)
    kept_lines = []
    for line in raw_text.split('\n'):
        if line:
            kept_lines.append(line.replace('\t', ' '))
    return ' '.join(kept_lines)
    
# Extracting text from PDF
def pdftotext(m):
    """Return the concatenated text of every page of the PDF at path ``m``.

    Parameters
    ----------
    m : str
        Path of the PDF file.

    Returns
    -------
    str
        The page texts joined in page order with no separator between pages
        (an empty string for a PDF without pages).
    """
    # The context manager closes the file even when parsing fails; the
    # previous version opened the file and never closed it.
    with open(m, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        # `or ''` is a defensive guard in case a page yields no text object.
        return ''.join((page.extract_text() or '') for page in reader.pages)

# Document.

In [68]:
# Raw resume text used by most of the cells below.
# NOTE(review): absolute, user-specific path — this cell only runs on the author's machine.
text = pdftotext("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")

# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# nlp_big = spacy.load('en_core_web_lg')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

# displacy.render(nlp(text),style="ent",jupyter=True)

# Name.

In [72]:
from transformers import pipeline
from subprocess import list2cmdline
from pdfminer.high_level import extract_text
import docx2txt
import spacy
from spacy.matcher import Matcher
import time
# Start of the timed region; the matching `end` is taken at the bottom of this cell.
start = time.time()
# NOTE(review): rebinds `nlp` to a fresh load of the same spaCy model an earlier cell already loaded.
nlp = spacy.load('en_core_web_sm')
# Hugging Face checkpoint used for token classification (named-entity recognition).
model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
# org_name() below reads the "entity_group" key of each result, which the
# pipeline reports when an aggregation strategy is set.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)



def text_extraction(file):
    """
    Extract the raw text of a resume stored as PDF or Word (DOCX).

    Parameters
    ----------
    file : str
        Path of the document. A path ending in ".pdf" (any letter case) is
        read with pdfminer's ``extract_text``; every other path is handed to
        ``docx2txt.process``.

    Returns
    -------
    str or None
        The PDF text exactly as pdfminer returns it, or the DOCX text with
        tabs replaced by spaces. ``None`` when docx2txt yields no text.
    """
    # Compare case-insensitively so "CV.PDF" is not mistaken for a Word file.
    if file.lower().endswith(".pdf"):
        return extract_text(file)

    resume_text = docx2txt.process(file)
    if resume_text:
        return resume_text.replace('\t', ' ')
    return None



# Person names extraction (the function keeps its original name, `org_name`,
# so existing calls keep working).
def org_name(file):
    """Print and return the person names found in a resume file.

    The text is extracted with ``text_extraction`` and passed to the
    module-level ``token_classifier``; only entities whose ``entity_group``
    is ``"PER"`` are kept.

    Parameters
    ----------
    file : str
        Path of the PDF or DOCX resume.

    Returns
    -------
    list of str
        Distinct, non-empty person names in order of first appearance
        (empty when no text or no person entity is found). The same list is
        printed as ``Name: [...]``.
    """
    # Extract the complete text in the resume
    extracted_text = text_extraction(file)
    if not extracted_text:
        # Nothing to classify: report an empty result instead of passing None on.
        print('Name:', [])
        return []

    entities = token_classifier(extracted_text)
    # Keep the words of the entities tagged as persons ("PER").
    words = [item['word'] for item in entities if item["entity_group"] == "PER"]
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # empty strings are dropped.
    final = [word for word in dict.fromkeys(words) if word]
    print('Name:', final)
    return final


end = time.time()

# NOTE(review): org_name() is never called between `start` and `end` in this
# cell, so the figure covers the model loading and definitions above rather
# than a name extraction.
print("The time of execution of above program is :", round((end - start), 2))

The time of execution of above program is : 7.17


In [53]:
text_extraction("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")

"Cristian Fontana\n\nD a t a   S c i e n t i s t   -   I t a l i a n   C i t i z e n\nEconomist with a strong background in Data Science and Business Intelligence with more than 5 years of\nexperience,  building  quantitative  solutions,  doing  forecasts,  regression,  time  series,  machine  learning,  and\ncreating dashboards. \nAdvanced  understanding  of  statistical,  algebraic  and  other  analytical  techniques.  Highly  organized,\nmotivated and diligent. I enjoy finding problems and working on projects to resolve them holistically.\n\nExperience\n\nDATA SCIENTIST\nLEANTK (CONTRACT)\n0 1 / 2 0 2 3   -   0 6 / 2 0 2 3   ( 6   M O N T H S )\n\nSaved more than 20.000 USD annually to small startups building a financial app with AI that take inputs\nfrom multiple models and make decisions in real time. \nDevelop  input  and  assumptions  based  on  preexisting  models  to  estimate  the  costs  and  savings\nopportunities associated with varying levels of network growth and operati

In [40]:
# NOTE(review): the load log below warns about checkpoint weights that were not
# used when initialising BertForTokenClassification; presumably this checkpoint
# has no trained NER head — confirm before relying on its labels.
ner_pipeline = pipeline("ner", model="dccuchile/bert-base-spanish-wwm-uncased", tokenizer="dccuchile/bert-base-spanish-wwm-uncased")

Downloading (…)lve/main/config.json: 100%|██████████| 650/650 [00:00<00:00, 649kB/s]
Downloading pytorch_model.bin: 100%|██████████| 440M/440M [01:59<00:00, 3.69MB/s] 
Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassifi

In [54]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Tokenizer and token-classification model from the 'dslim/bert-large-NER' checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained('dslim/bert-large-NER')
bert_model = AutoModelForTokenClassification.from_pretrained('dslim/bert-large-NER')

# NOTE(review): this rebinds `nlp`, which other cells set to a spaCy pipeline,
# to a transformers NER pipeline; cells that call nlp(...) expecting a spaCy
# document therefore depend on the order in which cells were run.
nlp = pipeline('ner', model=bert_model, tokenizer=bert_tokenizer)
# The result is a list with one dict per tagged word piece (see the output below).
ner_list = nlp(text_extraction("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf"))
print(ner_list)

[{'entity': 'B-PER', 'score': 0.9973539, 'index': 1, 'word': 'C', 'start': 0, 'end': 1}, {'entity': 'B-PER', 'score': 0.61249125, 'index': 2, 'word': '##rist', 'start': 1, 'end': 5}, {'entity': 'I-PER', 'score': 0.7767754, 'index': 3, 'word': '##ian', 'start': 5, 'end': 8}, {'entity': 'I-PER', 'score': 0.9985752, 'index': 4, 'word': 'F', 'start': 9, 'end': 10}, {'entity': 'I-PER', 'score': 0.99541783, 'index': 5, 'word': '##ont', 'start': 10, 'end': 13}, {'entity': 'I-PER', 'score': 0.9936428, 'index': 6, 'word': '##ana', 'start': 13, 'end': 16}, {'entity': 'B-MISC', 'score': 0.6400754, 'index': 44, 'word': 'Data', 'start': 120, 'end': 124}, {'entity': 'I-MISC', 'score': 0.72663593, 'index': 48, 'word': 'Intelligence', 'start': 146, 'end': 158}, {'entity': 'B-MISC', 'score': 0.7665526, 'index': 223, 'word': 'A', 'start': 995, 'end': 996}, {'entity': 'B-MISC', 'score': 0.6003907, 'index': 224, 'word': '##WS', 'start': 996, 'end': 998}, {'entity': 'B-MISC', 'score': 0.8929192, 'index': 2

In [56]:
# Word pieces of the person name currently being assembled.
this_name = []
# Finished names; each one is appended as [this_name], i.e. wrapped in one extra list.
all_names_list_tmp = []

for ner_dict in ner_list:
    if ner_dict['entity'] == 'B-PER':
        # B-PER opens a person entity: start a name, closing the one in progress first.
        if len(this_name) == 0:
            this_name.append(ner_dict['word'])
        else:
            all_names_list_tmp.append([this_name])
            this_name = []
            this_name.append(ner_dict['word'])
    elif ner_dict['entity'] == 'I-PER':
        # I-PER continues the name in progress.
        this_name.append(ner_dict['word'])

# Flush the last name (appended even when it is empty).
all_names_list_tmp.append([this_name])

# NOTE(review): '##' word pieces are kept as separate items and a B-PER tag on a
# word piece splits the name in two, as the output below shows.
print(all_names_list_tmp)

[[['C']], [['##rist', '##ian', 'F', '##ont', '##ana']]]


In [45]:
def find_spanish_full_names(text, ner=None):
    """Group consecutive person tokens from an NER pipeline into full names.

    Parameters
    ----------
    text : str
        Text to analyse.
    ner : callable, optional
        Callable that maps ``text`` to a list of dicts with ``"entity"`` and
        ``"word"`` keys. Defaults to the module-level ``ner_pipeline``
        (whichever cell defined it last).

    Returns
    -------
    list of str
        One string per run of ``B-PER`` / ``I-PER`` tokens, words joined by
        single spaces. A ``B-PER`` token starts a new name, and ``##`` word
        pieces are glued onto the preceding word.
    """
    tagger = ner_pipeline if ner is None else ner
    person_names = []
    parts = []  # words of the name currently being assembled

    for entity in tagger(text):
        label = entity["entity"]
        word = entity["word"]
        is_person = label in ("B-PER", "I-PER")
        # A '##' piece continues the previous word regardless of its B-/I- tag.
        continues_word = word.startswith("##") and bool(parts)

        # Close the running name on a non-person token or on a B-PER that
        # really starts a new word.
        if parts and (not is_person or (label == "B-PER" and not continues_word)):
            person_names.append(" ".join(parts))
            parts = []

        if not is_person:
            continue
        if continues_word:
            parts[-1] += word[2:]
        else:
            parts.append(word)

    if parts:
        person_names.append(" ".join(parts))

    return person_names



In [58]:
find_spanish_full_names(text_extraction("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf"))

[]

In [60]:
find_spanish_full_names(text_extraction("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf"))

[]

In [64]:
# Use displacy to visualize the NER
# NOTE(review): going by the execution counts, `nlp` is the transformers
# pipeline at this point (rebound in the BERT cell), not a spaCy pipeline;
# presumably that is why this call raises the AttributeError shown below.
displacy.render(nlp(text_extraction("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")),jupyter=True)

AttributeError: 'dict' object has no attribute 'vocab'

In [39]:
from transformers import pipeline

# NOTE(review): the load log below reports BertForTokenClassification weights
# that are newly initialised for this checkpoint, i.e. the classification head
# is untrained — presumably why the cells using this pipeline return [].
ner_pipeline = pipeline("ner", model="bert-base-uncased", tokenizer="bert-base-uncased")

def find_person_names(text, ner=None):
    """Return the words an NER pipeline tags as belonging to a person.

    Parameters
    ----------
    text : str
        Text to analyse.
    ner : callable, optional
        Callable that maps ``text`` to a list of entity dicts. Defaults to
        the module-level ``ner_pipeline``.

    Returns
    -------
    list of str
        The ``"word"`` of every entity whose label is ``PER``, ``B-PER`` or
        ``I-PER``. The label is read from ``"entity_group"`` when present,
        otherwise from ``"entity"``.
    """
    tagger = ner_pipeline if ner is None else ner
    person_names = []
    for entity in tagger(text):
        label = entity.get("entity_group") or entity.get("entity", "")
        # Strip the B-/I- prefix: the previous exact comparison with "PER"
        # never matched the B-PER / I-PER labels.
        if label.split("-")[-1] == "PER":
            person_names.append(entity["word"])
    return person_names

find_person_names(re.sub(r'\.(?!\))', r'. ', text))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

[]

In [38]:
def find_full_names(text, ner=None):
    """Group consecutive person tokens from an NER pipeline into full names.

    Parameters
    ----------
    text : str
        Text to analyse.
    ner : callable, optional
        Callable that maps ``text`` to a list of dicts with ``"entity"`` and
        ``"word"`` keys. Defaults to the module-level ``ner_pipeline``.

    Returns
    -------
    list of str
        One string per run of person tokens (labels ``PER``, ``B-PER`` or
        ``I-PER``), words joined by single spaces. ``B-PER`` starts a new
        name and ``##`` word pieces are glued onto the preceding word.
    """
    tagger = ner_pipeline if ner is None else ner
    person_names = []
    parts = []  # words of the name currently being assembled

    for entity in tagger(text):
        label = entity["entity"]
        word = entity["word"]
        # Accept the bare label as well as its B-/I- variants.
        is_person = label.split("-")[-1] == "PER"
        continues_word = word.startswith("##") and bool(parts)

        # Close the running name on a non-person token or on a B-PER that
        # really starts a new word.
        if parts and (not is_person or (label == "B-PER" and not continues_word)):
            person_names.append(" ".join(parts))
            parts = []

        if not is_person:
            continue
        if continues_word:
            parts[-1] += word[2:]
        else:
            parts.append(word)

    if parts:
        person_names.append(" ".join(parts))

    return person_names

find_full_names(re.sub(r'\.(?!\))', r'. ', text))

[]

In [36]:
clean_function(text)

'Saved more than 20 000 USD annually to small startups building a financial app with AI that take inputs from multiple models and make decisions in real time Develop input and assumptions based on preexisting models to estimate the costs and savings opportunities associated with varying levels of network growth and operations Toolbox AWS GCP Python Git Numpy Pandas Sickit Learn Matplotlib Pipenv Statsmodels Plotly SQL No SQL Boto3 Jupyternotebook Dash Lambda S3 DynamoDB Serverless Deployment Docker Notion Figma Slack Github Spark RedShift Athena Azure Scipy PostgreSQL Reduced the churn rate by 15 in 3 months by finding the importance of the clients two first days of operating In charge of the dashboards and analytic presentations about the status of the company clients and commercial status Toolbox Mode Snowflake Python Git Excel Jira Github SQLAlchemy Jupyternotebook Numpy Pandas Scikit Learn XGBoost Scipy Maplotlib Pipenv Seaborn PostgreSQL dbt Carta Slack Looker Power BI Docker Mode

In [5]:
def extract_name(resume_text):
    """Guess the candidate's name as the first pair of adjacent proper nouns.

    Uses the module-level ``nlp`` pipeline and ``matcher``.

    Parameters
    ----------
    resume_text : str
        Plain text of the resume.

    Returns
    -------
    str or None
        Text of the first match reported by the matcher, or ``None`` when
        there is no match. This is only a heuristic: the first two
        consecutive proper nouns need not be a person's name.
    """
    nlp_text = nlp(resume_text)

    # Register the pattern only once; calling matcher.add on every invocation
    # kept re-registering the same rule.
    if 'NAME' not in matcher:
        # Heuristic: first name and last name are two consecutive proper nouns.
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
        matcher.add('NAME', [pattern])

    for _match_id, start, end in matcher(nlp_text):
        # Only the first match is used.
        return nlp_text[start:end].text
    return None
print('Name: ',extract_name(pdftotext("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")))

Name:  Sickit Learn


# Predict Profile

In [6]:
# Load pickle models.
# NOTE(review): pickle.load can execute code embedded in the file, so only load
# pickles you produced or trust. The absolute, user-specific paths also tie
# this cell to one machine.
with open('C:/Users/cfont/OneDrive/Documents/GitHub/Projects/Read Resume - NLP/src/models/model.pkl', 'rb') as f:
    predict_profile = pickle.load(f)

with open('C:/Users/cfont/OneDrive/Documents/GitHub/Projects/Read Resume - NLP/src/models/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [7]:
# Clean the resume text.
def clean_function(resumeText):
    """Normalise resume text before it is vectorised.

    Removes URLs, the tokens ``RT`` and ``cc``, hashtags, mentions, ASCII
    punctuation and non-ASCII characters, then collapses every run of
    whitespace into one space.

    Parameters
    ----------
    resumeText : str
        Raw resume text.

    Returns
    -------
    str
        The cleaned text (leading/trailing single spaces are not stripped).
    """
    # Patterns are raw strings now: sequences such as '\S' in a normal string
    # literal are invalid escapes and trigger warnings on recent Python
    # versions. The regular expressions themselves are unchanged.
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # NOTE(review): this also removes "cc" inside words ("account" -> "a ount");
    # kept as is because the cleaning output feeds the pickled vectorizer.
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # replace non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

In [8]:
# Clean and vectorize the resume text.
def new_inputs(resumes):
    """Clean every resume in ``resumes`` and return the vectorizer's transform.

    ``resumes`` must offer ``.apply`` (the notebook passes a pandas Series);
    each element goes through ``clean_function`` and the cleaned values are
    handed to the module-level ``vectorizer``.
    """
    return vectorizer.transform(resumes.apply(clean_function))

In [9]:
# Predict the profile.
print('Predicted Profile:', predict_profile.predict(new_inputs(pd.Series(text)))[0])

Predicted Profile: Data Science


# Qualification.

In [84]:
# Grab all general English stop words
STOPWORDS = set(stopwords.words('english'))

# Education degrees (English and Spanish keywords); one list entry per keyword.
# Two missing commas used to fuse neighbours through implicit string
# concatenation ('SSC' 'HSC' -> 'SSCHSC', 'L' 'Lic' -> 'LLic'), so 'SSC', 'HSC'
# and 'Lic' could never match.
# NOTE(review): 'L' is kept as its own entry because it is written that way in
# the list; confirm a bare "L" is really meant to count as a degree.
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S',
            'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
            'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII',
            'BACHELOR', 'MASTER', 'PHD', 'BACHELORS', 'MASTERS', 'Ph.D.',
            'Licenciatura', 'Ingeniería', 'Maestría', 'Maestria',
            'Maestra', 'Maestro', 'Doctorado', 'Doctora', 'Doctor', 'Licenciado', 'Licenciada',
            'Ingeniero', 'Ingeniera', 'Maestrante', 'Doctorante', 'L',
            'Lic', 'Ing',
        ]

# Make every word in EDUCATION lowercase.
EDUCATION = [x.lower() for x in EDUCATION]

bachelor_subjects = [
    "Computer Science",
    "Physics",
    "Chemistry",
    "Biology",
    "Mathematics",
    "Engineering",
    "Psychology",
    "English Literature",
    "History",
    "Sociology",
    "Economics",
    "Political Science",
    "Business Administration",
    "Marketing",
    "Accounting",
    "Finance",
    "Nursing",
    "Environmental Science",
    "Art",
    "Music",
    "Film Studies",
    "Philosophy",
    "Anthropology",
    "Communications",
    "Languages",
    "Geography",
    "Architecture",
    "Urban Planning",
    "Graphic Design",
    "Journalism",
    "Criminal Justice",
    "Law",
    "International Relations",
    "Sports Science",
    "Theater",
    "Dance",
    "Religious Studies",
    "Information Technology",
    "Health Sciences",
    "Social Work",
    "Public Health",
    "Nutrition",
    "Linguistics",
    "Human Resources",
    "Hospitality Management",
    "Tourism",
    "Fashion Design",
    "Interior Design"
]

# Run every subject name through `nlp` so that its tokens carry lemmas.
# NOTE(review): this rebinds bachelor_subjects from a list of str to a list of
# parsed documents, so re-running this cell on its own feeds those objects back
# into nlp(); it also assumes `nlp` is the spaCy pipeline when the cell runs.
bachelor_subjects = [nlp(text) for text in bachelor_subjects]

# For each subject keep the lemma of each word, joined into one string per subject.
# NOTE(review): multi-word subjects stay multi-word strings here, while
# extract_education compares single-token lemmas against this list, so those
# entries presumably never match.
bachelor_subjects_lemma = [' '.join([word.lemma_ for word in subject]) for subject in bachelor_subjects]

def extract_education(resume_text):
    """Find the education entry of a resume, print it and return it.

    The text is split into sentences with the module-level ``nlp``. Every
    whitespace-separated word is stripped of a few symbols and compared
    (lower-cased) with ``EDUCATION``; the last sentence containing a degree
    keyword wins.

    Parameters
    ----------
    resume_text : str
        Plain text of the resume.

    Returns
    -------
    dict
        Empty when no degree keyword is found. Otherwise it holds
        ``'Sentence'`` (the matching sentence), ``'Education'`` (the matched
        keyword), ``'GPA'`` (list of ``d.d`` numbers in the sentence),
        ``'Years'`` (list of 19xx/20xx years in the sentence) and, only when
        a token lemma is listed in ``bachelor_subjects_lemma``, ``'Subject'``.
        Earlier versions returned ``None`` and raised ``KeyError`` whenever a
        field was missing.
    """
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    sentences = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}

    for sentence in sentences:
        for raw_word in sentence.split():
            # Replace all special symbols (the character class also contains '|').
            word = re.sub(r'[?|$|.|!|,\']', r'', raw_word)
            # The stop-word test is case-sensitive on purpose: upper-case "ME"
            # or "BE" can be degrees while lower-case "me"/"be" are stop words.
            if word.lower() not in EDUCATION or word in STOPWORDS:
                continue

            # Start a fresh record for every hit so that fields from an earlier
            # sentence (for example its subject) cannot leak into this one.
            edu = {
                'Sentence': sentence,
                'Education': word,
                # Find the GPA.
                'GPA': re.findall(r'\b\d\.\d\b', sentence),
            }

            # Lemmatise the sentence word by word and keep the last lemma that
            # is a known subject.
            for piece in sentence.split():
                for token in nlp(piece):
                    if token.lemma_ in bachelor_subjects_lemma:
                        edu['Subject'] = token.lemma_

            # Find dates using regex.
            # If you want to extract any four-digit number instead, change the regex to r'(\d{4})'
            edu['Years'] = re.findall(r'(20\d{2}|19\d{2})', sentence)

    if not edu:
        print('Education: not found')
        return edu

    # NOTE(review): {edu['Sentence']} builds a one-element set, which is why the
    # sentence is printed inside braces; kept so the printed format is unchanged.
    print('Full sentence: ', {edu['Sentence']},
            '\nDegree: ', edu['Education'],
            '\nSubject: ', edu.get('Subject'),
            '\nGPA: ', edu['GPA'],
            '\nYears: ', edu['Years'])
    return edu

extract_education(text)


Full sentence:  {"Experience\nEDUCATION\n2014-2020\nBachelor's degree in Economics."} 
Degree:  Bachelors 
Subject:  economic 
GPA:  [] 
Years:  ['2014', '2020']


In [77]:
extract_education(text)

{'Sentence': "Experience\nEDUCATION\n2014-2020\nBachelor's degree in Economics.",
 'Education': 'Bachelors',
 'GPA': [],
 'Subject': 'economic',
 'Years': ['2014', '2020']}

In [74]:
extract_education(text)['Sentence']

"Experience\nEDUCATION\n2014-2020\nBachelor's degree in Economics."

# Email

In [11]:
def extract_email_addresses(string):
    """Return every e-mail address found in ``string``.

    Parameters
    ----------
    string : str
        Text to search. (Inside this function the parameter name hides the
        ``string`` module; the module is not needed here.)

    Returns
    -------
    list of str
        Addresses in order of appearance. The domain must contain at least
        one dot and ends with a word character or hyphen, so punctuation
        that closes a sentence ("... at a@b.com.") is no longer captured.
    """
    # local part: word characters, dots, plus and hyphen;
    # domain: dot-separated labels of word characters and hyphens.
    pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
    return pattern.findall(string)
print('Mail id: ',extract_email_addresses(text))

Mail id:  ['c.fontana95@gmail.com']


# Phone Number

In [12]:
def extract_mobile_number(resume_text):
    """Return the digits of the first phone number found in ``resume_text``.

    The pattern captures an optional country code, an optional area code,
    the exchange, the line number and an optional extension; the captured
    groups of the first match are concatenated. Returns ``None`` when the
    text contains no match.
    """
    phone_pattern = re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?')
    matches = phone_pattern.findall(resume_text)

    # Guard clause: nothing that looks like a phone number.
    if not matches:
        return None

    # Each match is a tuple of captured groups; joining them leaves only the digits.
    return ''.join(matches[0])
print('Mobile Number: ',extract_mobile_number(text))

Mobile Number:  3934444659


# Skills

In [13]:
general_skills_list = [
    "Communication Skills",
    "Verbal Communication",
    "Written Communication",
    "Presentation Skills",
    "Public Speaking",
    "Interpersonal Skills",
    "Negotiation Skills",
    "Listening Skills",
    "Persuasion Skills",
    "Teamwork",
    "Collaboration",
    "Leadership",
    "Conflict Resolution",
    "Relationship Building",
    "Networking",
    "Analytical Skills",
    "Critical Thinking",
    "Problem Solving",
    "Research Skills",
    "Data Analysis",
    "Quantitative Analysis",
    "Qualitative Analysis",
    "Decision Making",
    "Attention to Detail",
    "Logical Reasoning",
    "Technical Skills",
    "Computer Literacy",
    "Programming Languages",
    "Software Proficiency",
    "Web Development",
    "Database Management",
    "Information Technology",
    "Troubleshooting",
    "Systems Administration",
    "Network Security",
    "Creativity",
    "Innovation",
    "Graphic Design",
    "Artistic Skills",
    "Photography",
    "Video Editing",
    "Content Creation",
    "Writing Skills",
    "Copywriting",
    "Proofreading and Editing",
    "Organization",
    "Time Management",
    "Project Management",
    "Planning and Coordination",
    "Multitasking",
    "Prioritization",
    "Detail Orientation",
    "Meeting Deadlines",
    "Resource Management",
    "Customer Service",
    "Client Management",
    "Relationship Management",
    "Conflict Resolution (customer-facing)",
    "Sales Skills",
    "Account Management",
    "Marketing Skills",
    "Market Research",
    "Advertising",
    "Social Media Marketing",
    "Search Engine Optimization (SEO)",
    "Language Skills",
    "Bilingualism",
    "Translation",
    "Interpretation",
    "Financial Skills",
    "Accounting",
    "Financial Analysis",
    "Budgeting",
    "Financial Planning",
    "Risk Management",
    "Teaching and Training",
    "Instructional Design",
    "Curriculum Development",
    "Tutoring",
    "Mentoring",
    "Project Coordination",
    "Event Planning",
    "Event Management",
    "Logistics",
    "Supply Chain Management",
    "Research and Development",
    "Scientific Methodology",
    "Lab Techniques",
    "Experimental Design",
    "Statistical Analysis",
    "Problem Diagnosis",
    "Troubleshooting (Technical)",
    "Maintenance and Repair",
    "Equipment Handling",
    "Mechanical Skills",
    "Health and Safety",
    "First Aid",
    "CPR",
    "Occupational Health and Safety",
    "Risk Assessment"
]


In [14]:
# Convert general skills to lowercase.
general_skills_list = [skill.lower() for skill in general_skills_list]

In [15]:
def general_skills(resume_text):
    """Return the general skills from ``general_skills_list`` found in the text.

    Single non-stop-word tokens and whole noun chunks are compared
    (lower-cased) with the list. The hits are de-duplicated
    case-insensitively and returned capitalised; the order is the iteration
    order of a set.
    """
    doc = nlp(resume_text)

    found = []

    # One-word skills: every token that is not a stop word.
    for tok in doc:
        if tok.is_stop:
            continue
        if tok.text.lower() in general_skills_list:
            found.append(tok.text)

    # Multi-word skills: noun chunks, lower-cased and stripped.
    for chunk in doc.noun_chunks:
        phrase = chunk.text.lower().strip()
        if phrase in general_skills_list:
            found.append(phrase)

    unique_lowered = set(word.lower() for word in found)
    return [word.capitalize() for word in unique_lowered]
  
print ('General Skills',general_skills(text))

General Skills []


In [16]:
programming_skills_list = [
    "Programming Languages",
    "Python",
    "JavaScript",
    "Java",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "Swift",
    "Go",
    "Rust",
    "TypeScript",
    "HTML",
    "CSS",
    "SQL",
    "Shell Scripting",
    "Version Control",
    "Git",
    "SVN",
    "Continuous Integration/Continuous Deployment (CI/CD)",
    "Agile Development",
    "Test-Driven Development (TDD)",
    "Object-Oriented Programming (OOP)",
    "Functional Programming",
    "Web Development",
    "Front-end Development",
    "Back-end Development",
    "Full-Stack Development",
    "Mobile Development",
    "iOS Development",
    "Android Development",
    "Database Management",
    "Database Design",
    "Query Optimization",
    "API Development",
    "RESTful APIs",
    "Web Services",
    "Microservices",
    "Cloud Computing",
    "Amazon Web Services (AWS)",
    "Microsoft Azure",
    "Google Cloud Platform (GCP)",
    "Containerization",
    "Docker",
    "Kubernetes",
    "Server Administration",
    "Linux",
    "Windows Server",
    "Networking",
    "Security",
    "Cybersecurity",
    "Data Structures",
    "Algorithms",
    "Software Development",
    "Software Architecture",
    "Software Testing",
    "Debugging",
    "Problem Solving",
    "Code Optimization",
    "Performance Tuning",
    "Code Review",
    "Documentation",
    "Unit Testing",
    "Integration Testing",
    "System Testing",
    "Front-end Frameworks",
    "React",
    "Angular",
    "Vue.js",
    "Back-end Frameworks",
    "Django",
    "Ruby on Rails",
    "Node.js",
    "Flask",
    "ASP.NET",
    "PHP Frameworks",
    "Laravel",
    "Symfony",
    "CodeIgniter",
    "Testing Frameworks",
    "JUnit",
    "PyTest",
    "Mocha",
    "Jest",
    "Database Systems",
    "MySQL",
    "PostgreSQL",
    "Oracle",
    "MongoDB",
    "Redis",
    "Machine Learning",
    "Data Analysis",
    "Data Visualization",
    "Artificial Intelligence",
    "Natural Language Processing (NLP)",
    "Big Data",
    "Hadoop",
    "Spark",
    "Blockchain Development",
    "Internet of Things (IoT)",
    "DevOps",
    "Infrastructure as Code (IaC)",
    "Configuration Management",
    "Scripting",
    "Problem Diagnosis",
    "Technical Support",
    "API Integration",
    "Project Management",
    "Agile Methodologies",
    "Scrum",
    "Kanban",
    "Software Documentation",
    "Collaboration Tools",
    "Jira",
    "Confluence",
    "Slack",
    "Version Control Systems",
    "Git",
    "SVN",
    "Code Editors",
    "Visual Studio Code",
    "PyCharm",
    "IntelliJ IDEA",
    "Eclipse",
    "Sublime Text",
    "Atom",
    "Operating Systems",
    "Linux",
    "Windows",
    "macOS"
]


In [17]:
# Convert programming skills to lower case
programming_skills_list = [i.lower() for i in programming_skills_list]

In [18]:
def programming_skills(resume_text):
    """Return the skills from ``programming_skills_list`` found in the text.

    Single non-stop-word tokens and whole noun chunks are compared
    (lower-cased) with the list. The hits are de-duplicated
    case-insensitively and returned capitalised; the order is the iteration
    order of a set.
    """
    doc = nlp(resume_text)

    found = []

    # One-word skills (example: python): every token that is not a stop word.
    for tok in doc:
        if tok.is_stop:
            continue
        if tok.text.lower() in programming_skills_list:
            found.append(tok.text)

    # Multi-word skills: noun chunks, lower-cased and stripped.
    for chunk in doc.noun_chunks:
        phrase = chunk.text.lower().strip()
        if phrase in programming_skills_list:
            found.append(phrase)

    unique_lowered = set(word.lower() for word in found)
    return [word.capitalize() for word in unique_lowered]
  
print ('Skills',programming_skills(text))

Skills ['Docker', 'Slack', 'Agile methodologies', 'Postgresql', 'Python', 'Spark', 'Mysql', 'Machine learning', 'Git', 'Mongodb', 'Jira', 'Sql']


# Languages

In [19]:
languages_list = [
    "English",
    "Spanish",
    "French",
    "German",
    "Chinese",
    "Mandarin",
    "Arabic",
    "Hindi",
    "Portuguese",
    "Bengali",
    "Russian",
    "Japanese",
    "Lahnda",
    "Javanese",
    "Wu",
    "Telugu"]

In [20]:
# Convert languages to lower case
languages_list = [i.lower() for i in languages_list]

In [21]:
language_level = [
    "Elementary Proficiency",
    "Limited Working Proficiency",
    "Professional Working Proficiency",
    "Full Professional Proficiency",
    "Native or Bilingual Proficiency",
    "Native",
    "Advanced",
    "Intermediate",
    "A1",
    "A2",
    "B1",
    "B2",
    "C1",
    "C2"
]

In [22]:
# Convert language levels to lower case
language_level = [i.lower() for i in language_level]

In [28]:
def language_skill(resume_text):
    """Map every language mentioned in the resume to its proficiency level.

    Parameters
    ----------
    resume_text : str
        Plain text of the resume.

    Returns
    -------
    dict
        Keys are the language tokens as written in the text (those whose
        lower-case form is in ``languages_list``). The value is the token
        that directly follows the language when its lower-case form is in
        ``language_level``, otherwise ``'Level Not Specified'``. The first
        level found for a language is kept.
    """
    # Put a space after every period that is not followed by ')' so that
    # words glued together by a full stop are tokenised separately.
    resume_text = re.sub(r'\.(?!\))', r'. ', resume_text)

    nlp_text = nlp(resume_text)

    # Removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Drop every token that is equal to a special character
    tokens = [token for token in tokens if not token in string.punctuation]

    unspecified = 'Level Not Specified'
    skillset = {}

    # enumerate gives the real position of each occurrence. The previous
    # tokens.index(token) lookup always inspected the first occurrence and
    # raised IndexError when a language was the very last token.
    for position, token in enumerate(tokens):
        if token.lower() not in languages_list:
            continue
        skillset.setdefault(token, unspecified)
        if position + 1 >= len(tokens):
            continue
        following = tokens[position + 1]
        # Never overwrite a level that an earlier occurrence already provided.
        if skillset[token] == unspecified and following.lower() in language_level:
            skillset[token] = following

    return skillset
  
print ('Languages:' ,language_skill(text))

Languages: {'Spanish': 'Native', 'English': 'Advanced', 'Portuguese': 'Intermediate'}


# Bonus Points



In [24]:
bonus_points_list = [
    'Projects',
    'Achievements',
    'Hobbies'
]

In [25]:
def bonus(resume_text):
    """Report which bonus sections are mentioned in the resume.

    Returns a dict with one key per entry of ``bonus_points_list``; the value
    is ``'Yes'`` when the lower-cased entry equals a lower-cased
    non-stop-word token of the text, otherwise ``'No'``.
    """
    doc = nlp(resume_text)

    # Lower-cased text of every token that is not a stop word.
    lowered_tokens = [tok.text.lower() for tok in doc if not tok.is_stop]

    return {
        piece: ('Yes' if piece.lower() in lowered_tokens else 'No')
        for piece in bonus_points_list
    }
  
print ('Bonus:' , bonus(text))

Bonus: {'Projects': 'Yes', 'Achievements': 'No', 'Hobbies': 'No'}


# All data

In [85]:
# Summary cell: runs every extractor on the same resume.
# NOTE(review): it relies on names defined in many earlier cells (`text`,
# `nlp`, the skill lists, the pickled models) and on a user-specific path.
# org_name prints its own "Name:" line.
org_name("C:/Users/cfont/Downloads/Cristian Fontana - CV-Resume.pdf")
# Profile predicted from the cleaned, vectorized resume text.
print('Predicted Profile:', predict_profile.predict(new_inputs(pd.Series(text)))[0])
# extract_education prints the education summary itself.
extract_education(text)
print('Mail id: ',extract_email_addresses(text))
print('Mobile Number: ',extract_mobile_number(text))
print ('General Skills:',general_skills(text))
print ('Programming Skills:',programming_skills(text))
print ('Languages:' ,language_skill(text))
print ('Bonus:' , bonus(text))

# Add github.
# add linkedin.

Name: ['Cristian Fontana']
Predicted Profile: Data Science
Full sentence:  {"Experience\nEDUCATION\n2014-2020\nBachelor's degree in Economics."} 
Degree:  Bachelors 
Subject:  economic 
GPA:  [] 
Years:  ['2014', '2020']
Mail id:  ['c.fontana95@gmail.com']
Mobile Number:  3934444659
General Skills: []
Programming Skills: ['Docker', 'Slack', 'Agile methodologies', 'Postgresql', 'Python', 'Spark', 'Mysql', 'Machine learning', 'Git', 'Mongodb', 'Jira', 'Sql']
Languages: {'Spanish': 'Native', 'English': 'Advanced', 'Portuguese': 'Intermediate'}
Bonus: {'Projects': 'Yes', 'Achievements': 'No', 'Hobbies': 'No'}


In [27]:
El punto 2: si usas Python y haces una Lambda en AWS que de entrada recibe un JSON o CSV
con los datos del CSV y de salida te devuelve un JSON o CSV con los datos del candidato,
eso ya lo puede consumir Ayrton en la app de HR que está haciendo con PHP Laravel.

Y la respuesta es un JSON que diga, por ejemplo:

Perfil: front end developer
Edad: 20 años
Tech principal: JavaScript
Framework principal: React Native
Tech secundaria: CSS
Ultima empresa donde trabajo: Amazon
Años en ultima empresa: 4.5
Idioma principal: Ingles
Nivel del idioma Principal: 8

SyntaxError: invalid non-printable character U+00A0 (14378996.py, line 3)