# NER (Named Entity Recognition)

- Person
- Organization
- Location
- Time
- Measurements or Quantities
- String patterns like email addresses, phone numbers, or IP addresses 

In [8]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc


#Visualization
from spacy import displacy
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

#warning
import warnings 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anass\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anass\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Charger le modèle de traitement du NLP en anglais
nlp = spacy.load("en_core_web_sm")
text = "My name is Anass Nabil , I have 21 years old ."
doc = nlp(text)
 
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


**By default, the spaCy pipeline loads the part-of-speech tagger, dependency parser, and NER...**

In [10]:
displacy.render(doc, style="ent", jupyter=True)

In [11]:
string = "Antiretroviral therapy ( ART ) is recommended for all HIV-infected individuals"
doc = nlp(string)
displacy.render(doc, style="ent", jupyter=True)



In [13]:
data = pd.read_csv("Cleaned_data.csv")

In [14]:
data['Cleaned_Resume'] = data['Cleaned_Resume'].str.lower()

In [15]:
resume1 = data.Cleaned_Resume[0]
resume2 = data.Cleaned_Resume[1]
resume3 = data.Cleaned_Resume[2]
resume1

'skills programming languages python pandas numpy scipy scikitlearn matplotlib sql java javascriptjquery machine learning regression svm nave bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch d3js dcjs plotly kibana matplotlib ggplot tableau others regular expression html css angular 6 logstash kafka python flask git docker computer vision open cv and understanding of deep learningeducation details data science assurance associate data science assurance associate ernst young llp skill details javascript exprience 24 months jquery exprience 24 months python exprience 24 monthscompany details company ernst young llp description fraud investigations and dispute services assurance technology assisted review tar technology assisted review assists in accelerating the revie

In [16]:
nlp = spacy.load("en_core_web_lg")

In [17]:
doc = nlp(resume1)
displacy.render(doc, style="ent", jupyter=True)

## Extracting Text from Resumes

In [18]:
# The jobzilla skill dataset is jsonl file containing different skills that can be used to create spaCy entity_ruler.
# The data set contains label and pattern-> diferent words used to descibe skills in various resume.
skill_pattern_path = "jz_skill_patterns.jsonl"

In [35]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [19]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [20]:
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset


def unique_skills(x):
    return list(set(x))

In [21]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,Category,Resume,Cleaned_Resume
0,0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may 2013 to may 2017 be uitr...
2,2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...


In [22]:
data["skills"] = data.head(200)["Cleaned_Resume"].str.lower().apply(get_skills)
data["skills"] = data.head(200)["skills"].apply(unique_skills)
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Resume,Cleaned_Resume,skills
0,0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,"[pandas, analytics, jquery, elasticsearch, tab..."
1,1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may 2013 to may 2017 be uitr...,"[ml, keras, outlier, feature selection, dimens..."
2,2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...,"[analytics, deep learning, linux, electrical e..."
3,3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[analytics, deep learning, algorithms, visual ..."
4,4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...,"[data analysis, database, data science, data s..."


In [23]:
# options=[{"ents": "Job-Category", "colors": "#ff3232"},{"ents": "SKILL", "colors": "#56c426"}]
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
    "colors": colors,
}

In [24]:
doc = nlp(resume3)
displacy.render(doc, style="ent", jupyter=True,options=options)

In [25]:
import tika
tika.initVM()
from tika import parser
import os
import re

In [26]:
from tika import parser

def parse_pdf_with_tika(pdf_path):
  parsed_pdf = parser.from_file(pdf_path)

  if parsed_pdf:
    return parsed_pdf["content"]
  else:
    return None

In [27]:
# Example usage
pdf_file_path = "C:/Users/Anass/Desktop/CVANB.pdf"  # Test avec Mon CV
extracted_text = parse_pdf_with_tika(pdf_file_path)

if extracted_text:
  print("Extracted Text:")
  print(extracted_text)
else:
  print("Failed to extract text from PDF.")

Extracted Text:






































Untitled design


LANGAGES

Arabe : Maternelle
Français : Bilingue
Anglais : Avancée

EXPERIENCE PROFESSIONEL
07/2022 - 08/2022Stage d'initiation

Développement d´une application mobile
(Framework Flutter)

société : GM-Soft 

04/2023 - 06/2023Stage de fin d'études

Réalisation d'une application mobile pour la
localisation des pharmacies de garde
Développé l'application en utilisant Laravel
React Native , Javascript et MySQL. 
Création d’API REST pour l’accès et la
manipulation des données 
Utilisation du Web Scraping pour la collecte
des données
L'application web est hébergé sous le domaine :
pha.ma

société : Soft Hight Tech

Benni Mellal

Salé

COMPÉTENCES

PROJETS

PROFILE

FORMATION

N A B I L
ANASS

06  53  47  04  05

Anassnabil067@gmail.com

https://anass-nabil.vercel.app/

Étudiant en Licence Professionnelle à la
recherche d'un stage de projet de fin dʼétude à
partir du mois de Avril pour une période de 2
mois dans le domai

In [28]:
extracted_text = extracted_text.replace("\n"," ")
extracted_text = extracted_text.replace("[^a-zA-Z0-9]", " ");  
re.sub('\W+','', extracted_text)
extracted_text = extracted_text.lower()
print(extracted_text)

                                      untitled design   langages  arabe : maternelle français : bilingue anglais : avancée  experience professionel 07/2022 - 08/2022stage d'initiation  développement d´une application mobile (framework flutter)  société : gm-soft   04/2023 - 06/2023stage de fin d'études  réalisation d'une application mobile pour la localisation des pharmacies de garde développé l'application en utilisant laravel react native , javascript et mysql.  création d’api rest pour l’accès et la manipulation des données  utilisation du web scraping pour la collecte des données l'application web est hébergé sous le domaine : pha.ma  société : soft hight tech  benni mellal  salé  compétences  projets  profile  formation  n a b i l anass  06  53  47  04  05  anassnabil067@gmail.com  https://anass-nabil.vercel.app/  étudiant en licence professionnelle à la recherche d'un stage de projet de fin dʼétude à partir du mois de avril pour une période de 2 mois dans le domaine de lʼingénier

In [31]:
doc = nlp(extracted_text)

In [32]:
displacy.render(doc, style="ent", jupyter=True,options=options)

## 1- Using Regular expressions

The first step in resume parsing is to extract the text from resumes in various formats

In [None]:
import re

def extract_contact_number_from_resume(text):
    contact_number = None

    # Use regex pattern to find a potential contact number
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    if match:
        contact_number = match.group()

    return contact_number

In [None]:
extract_contact_number_from_resume(resume1)

In [None]:
extract_contact_number_from_resume(resume2)

In [None]:
extract_contact_number_from_resume(resume3)

### Extracting Email Address

In [None]:
import re

def extract_email_from_resume(text):
    email = None

    # Use regex pattern to find a potential email address
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)
    if match:
        email = match.group()

    return email

In [None]:
import re

def extract_skills_from_resume(text, skills_list):
    skills = []

    for skill in skills_list:
        pattern = r"\b{}\b".format(re.escape(skill))
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            skills.append(skill)

    return skills

In [None]:

skills_list = ['Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau']

extracted_skills = extract_skills_from_resume(resume1, skills_list)

if extracted_skills:
    print("Skills:", extracted_skills)
else:
    print("No skills found")

In [None]:
import re

def extract_education_from_resume(text):
    education = []

    # List of education keywords to match against
    education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master']

    for keyword in education_keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(keyword))
        match = re.search(pattern, text)
        if match:
            education.append(match.group())

    return education

In [None]:
extract_education_from_resume(resume3)

In [None]:
import spacy
from spacy.matcher import Matcher

def extract_name(resume_text):
    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)

    # Define name patterns
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # First name, Middle name, Middle name, and Last name
        # Add more patterns as needed
    ]

    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])

    doc = nlp(resume_text)
    matches = matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        return span.text

    return None


In [None]:
 resume_text = "John Doe\n\nContact Information: 123-456-7890, john.doe@example.com\n\nSkills: Python, Data Analysis, Communication\n\nEducation: Bachelor of Science in Computer Science\n\nExperience: Software Engineer at XYZ Company"

print("Resume:")
print(resume_text)

name = extract_name(resume_text)
if name:
    print("Name:", name)
else:
    print("Name not found")

contact_number = extract_contact_number_from_resume(resume_text)
if contact_number:
    print("Contact Number:", contact_number)
else:
    print("Contact Number not found")

email = extract_email_from_resume(resume_text)
if email:
    print("Email:", email)
else:
    print("Email not found")

skills_list = ['Python', 'Data Analysis', 'Machine Learning', 'Communication']
extracted_skills = extract_skills_from_resume(resume_text, skills_list)
if extracted_skills:
    print("Skills:", extracted_skills)
else:
    print("No skills found")

extracted_education = extract_education_from_resume(resume_text)
if extracted_education:
    print("Education:", extracted_education)
else:
    print("No education information found")