In [1]:
import os
import io
import spacy
import docx2txt
import constants as cs
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError


In [72]:
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            try:
                for page in PDFPage.get_pages(fh,caching=True,check_extractable=True):
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(resource_manager,fake_file_handle,codec='utf-8',laparams=LAParams())
                    page_interpreter = PDFPageInterpreter(resource_manager,converter)
                    page_interpreter.process_page(page)
                    text = fake_file_handle.getvalue()
                    yield text

                    # close open handles
                    converter.close()
                    fake_file_handle.close()
            except PDFSyntaxError:
                return
    else:
        # extract text from remote pdf file
        try:
            for page in PDFPage.get_pages(
                pdf_path,
                caching=True,
                check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )
                page_interpreter.process_page(page)

                text = fake_file_handle.getvalue()
                yield text

                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            return


In [27]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    text_raw = extract_text_from_pdf(path)
    print(text_raw)
    print('----------------------------')

<generator object extract_text_from_pdf at 0x000002B784CF13C0>
----------------------------
<generator object extract_text_from_pdf at 0x000002B784AC0CF0>
----------------------------
<generator object extract_text_from_pdf at 0x000002B784AC0820>
----------------------------
<generator object extract_text_from_pdf at 0x000002B784AC0AC0>
----------------------------
<generator object extract_text_from_pdf at 0x000002B784AC0A50>
----------------------------
<generator object extract_text_from_pdf at 0x000002B784AC0BA0>
----------------------------
<generator object extract_text_from_pdf at 0x000002B784AC05F0>
----------------------------
<generator object extract_text_from_pdf at 0x000002B78448CE40>
----------------------------
<generator object extract_text_from_pdf at 0x000002B78448CF90>
----------------------------
<generator object extract_text_from_pdf at 0x000002B78448CD60>
----------------------------
<generator object extract_text_from_pdf at 0x000002B78448CF20>
-----------------

In [40]:
def get_number_of_pages(file_name):
    try:
        if isinstance(file_name, io.BytesIO):
            # for remote pdf file
            count = 0
            for page in PDFPage.get_pages(file_name,caching=True,check_extractable=True):
                count += 1
            return count
        else:
            # for local pdf file
            if file_name.endswith('.pdf'):
                count = 0
                with open(file_name, 'rb') as fh:
                    for page in PDFPage.get_pages(
                        fh,
                        caching=True,
                        check_extractable=True
                    ):
                        count += 1
                return count
            else:
                return None
    except PDFSyntaxError:
        return None



In [28]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    noOfLines = get_number_of_pages(path)
    print(noOfLines)
    print('----------------------------')

2
----------------------------
1
----------------------------
2
----------------------------
2
----------------------------
1
----------------------------
1
----------------------------
2
----------------------------
2
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
3
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
2
----------------------------
2
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
1
----------------------------
2
----------------------------
2
----------------------------
1
----------------------------
1
------

In [4]:
def extract_text_from_docx(doc_path):
    '''
    Helper function to extract plain text from .docx files

    :param doc_path: path to .docx file to be extracted
    :return: string of extracted text
    '''
    try:
        temp = docx2txt.process(doc_path)
        text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
        return ' '.join(text)
    except KeyError:
        return ' '


In [73]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    raw_text = extract_text_from_docx(path)
    print(raw_text)
    print('----------------------------')

BadZipFile: File is not a zip file

In [5]:
def extract_text_from_doc(doc_path):
    '''
    Helper function to extract plain text from .doc files

    :param doc_path: path to .doc file to be extracted
    :return: string of extracted text
    '''
    try:
        try:
            import textract
        except ImportError:
            return ' '
        temp = textract.process(doc_path).decode('utf-8')
        text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
        return ' '.join(text)
    except KeyError:
        return ' '



In [31]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    raw_text = extract_text_from_doc(path)
    print(raw_text)
    print('----------------------------')

 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
----------------------------
 
------

In [6]:
def extract_text(file_path, extension):
    '''
    Wrapper function to detect the file extension and call text
    extraction function accordingly

    :param file_path: path of file of which text is to be extracted
    :param extension: extension of file `file_name`
    '''
    text = ''
    if extension == '.pdf':
        for page in extract_text_from_pdf(file_path):
            text += ' ' + page
    elif extension == '.docx':
        text = extract_text_from_docx(file_path)
    elif extension == '.doc':
        text = extract_text_from_doc(file_path)
    return text


In [36]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    raw_text = extract_text(path,'.pdf')
    print(raw_text)
    print('----------------------------')

  

 

Contact

mohamednassar2016@gmail.com

www.linkedin.com/in/
mohamednassar96 (LinkedIn)

Top Skills

Computer Vision

Data Analysis

Natural Language Processing (NLP)

Certifications

twitter hall of fame

Microsoft Hall of fame

Microsoft Students Stars
Competition

Yahoo hall of fame

 

Mohamed Nassar

Machine Learning Team Lead at Synapse Analytics|MSc Candidate
in Computer Communication Engineering at Cairo University | AI
Instructor
Giza

Summary

- AI Team Lead with 3 years of experience in Computer Vision

and Natural Language Processing handling the whole pipeline of

design, development, testing, and deployment into production.

Currently, Leading Azka-OCR Division in Synapse Analytics. Azka-

OCR is involved in Arabic Documents end-to-end analysis through

Document Detection, Layout Analysis, Text Extraction, OCR, and
finally Information Extraction.

Optical mark recognition (OMR)

- AI instructor.

Experience

Synapse Analytics
2 years 1 month

Machine Learning Team Le

  

 

Contact

egypt
01149948099 (Mobile)
ahmedfathi520@gmail.com

www.linkedin.com/in/ahmed-
fathi-382441133 (LinkedIn)

Top Skills

Web Development

Design

Graphics

Certifications

Certificate of completion -
Introduction to Data and Data
Science

 

Ahmed Fathi

data science enthusiast || civil engineer || future data scientist
Giza

Summary

Hello,

I Am Ahmed Fathi

Front End Developer

Develop Myself

Experience

I Work In Front-End Web Development, And I Have Experience In

Dealing With Programming And Modern Web Technologies, And I

Have Built A Lot Of Projects That Have Increased My Experience

And Trained A Lot, I Am A Fast Learner And Always Love To

National Telecommunication Institute (NTI)
Student
October 2021 - Present (3 months)

PETROJET
Trainee
June 2019 - August 2020 (1 year 3 months)

worked as a trainee civil engineer in somid project 

Education

technology 

technical, Computer Systems Networking  · (2013 - 2015)

Ain Shams University

Bachelor's degree, Civil

 Contact

6 october, giza
01113709072 (Mobile)
redamahmoud722@gmail.com

www.linkedin.com/in/
mahmoud3899 (LinkedIn)
github.com/MAHMOUDRR707
(Personal)

Top Skills

Leadership

Arduino

Electronics

Certifications

Convolutional Neural Networks

HCIA-AI

Structuring Machine Learning
Projects

Computer Vision Basics

Introduction to Internet of things

 

 

 

Mahmoud Reda

Machine Learning Engineer
Cairo

Summary

Majoring in Computer Engineering, a technology and science

enthusiast with the ability to learn rapidly and cooperate. Very

flexible and willing to be pushed to the limits when it comes to

working. Ex-intern at DevisonX as well as YAT both worked as

a software engineer as Machine Learning Engineer and Mobile

Android Developer.. Experienced in student activities as being a

DSC founder , Member at IEEE ZSB CS and 

vice chairman at IEEE ZSB RAS. 

Experience

Omdena
Junior Machine Learning Engineer
May 2021 - July 2021 (3 months)
Egypt

Spurwing
Software Engineer Intern


 Contact

El-Haram,Giza,Egypt
01060070084 (Mobile)
medny2010@gmail.com

www.linkedin.com/in/medny-2010
(LinkedIn)
www.kaggle.com/
abdelrahmanmedany (Personal)
github.com/medny (Personal)

Top Skills

Data Science

Machine Learning

Deep Learning

Certifications

AWS Academy Graduate - AWS
Academy Machine Learning
Foundations

 

 

 

AbdEl-Rahman Medany

Data Scientist
Al Jizah

Summary

Data Scientist graduated from the Faculty of Information and Artificial

Intelligence, Cairo University. and enrolled in the Artificial Intelligence

Professional (AI-Pro) training program in Machine Learning and

Artificial Intelligence at Information Technology Institute (ITI) which

was developed in partnership with the French Graduate School of

Computer Science and Advanced Technologies (EPITA).

Passionate Engineer & Thriving analyst with the ability to apply

Machine and Deep Learning techniques and algorithm development

to solve a real-world business problem. With proven success in

building 

 Top Skills

AJAX

HTML 5

CSS

Languages

English

Urdu

 

 

 

Contact

Ali Garden Block A, Street#1,
House#2 , Faisalabad, Pakistan
+923217865510 (Mobile)
sajidali2444@gmail.com

Sajid Ali

Lead Software Engineer, Full stack (.NET | MERN)
Lahore District

www.linkedin.com/in/sajidali2444
(LinkedIn)

Summary

I'm a passionate and pragmatic software engineer with 10 years

of professional experience, specializing in full-stack development

using Microsoft Technologies on the backend. Strong in object-

oriented analysis and design and experience with a wide range of

front-end and back-end frameworks. I can assist in all stages of the

software development lifecycle and deliver working software with

clean architecture and clean code.

Experience

AllZone Tech.
Principal Software Engineer
September 2019 - Present (2 years 4 months)
Lahore, Pakistan

I'm a passionate and pragmatic software engineer with 6 years of professional

experience, specializing in full-stack development using

  

 

 

Fatima Imran

|| Web developer ||
Faisalabad District

Experience

Iconsols
Intern
September 2021 - Present (4 months)
Faisalabad, Punjab, Pakistan

Contact

fatimaimran1441@gmail.com

www.linkedin.com/in/fatima-
imran-293401220 (LinkedIn)

Top Skills

JavaScript

Web Development

PHP

Languages

English

Urdu (Full Professional)

Education

Government  College University, Faisalabad

Bachelor's degree, Information Technology · (October 2020 - June 2024)

Government  College University, Faisalabad

Bs, Information Technology · (2020 - 2024)

Government  College University, Faisalabad

Bs, Information Technology · (2020 - 2024)

Page 1 of 1


----------------------------
  

 

 

Contact

Amman,jordan
00962798638109 (Mobile)
moh.momani@gmail.com

Mohammad Ibrahim Momani

Data scientist at Orange Jordan
Amman

www.linkedin.com/in/mohammad-
ibrahim-momani-50b80517
(LinkedIn)

Summary

AI, machine learning, data science and python are my passion .

Python (Programming Language)

  

 

 

لاصتالا

aliwagdy2580@gmail.com

www.linkedin.com/in/ali-wagdy-
a01510196 (LinkedIn)

Ali Wagdy

Machine Learning | Deep Learning | NLP
رصم

زجوم

تاراهملا لضفأ

machine learning

Deep Learning

Natural language processing

(NLP)

I'm a software engineer adept at analyzing datasets, machine

learning algorithms , deep learning and natural language processing

Hoping to become a valuable asset in any organization that I join

Languages

English (Professional Working)

ىبرع (Native or Bilingual)

Certifications

machine learning nanodegree 

Python 3: Programming beginner to
advanced

machine learning 

Neural Networks and Deep Learning

Contestant at ACM-ICPC

Honors-Awards

Participated at ACM Egyptian
Collegiate Programming Contest

ةربخلا

iNetworks
Machine Learning Intern
٢٠٢١ ربوتكأ‏ - Present (3 روهش)

ميلعتلا

MET

Computer Science · (2018 - 2022)

Stanford University

machine learning 

Page 1 of 1


----------------------------
  

 

 

Contact

elhamamsy.m.a@gmail.

  

 

 

Contact

01100866990 (Mobile)
muhammadalaa5@gmail.com

www.linkedin.com/in/imhmdd
(LinkedIn)

Top Skills

Architectural Design

Interior Design

Design Research

Certifications

Data Analysis Professional Udacity
Nanodegree

Data Analysis Challenger Track -
FWD

Muhammad Alaa El-Deen

Personalized Ads Assessor at TELUS International
Alexandria

Summary

My objectives are to develop my skills and gain more experience.

I am looking forward to be working in an architectural firm where I

can continue gaining knowledge and developing the technical skills I

have gained throughout my studies. My goal is to improve my career

in order to gain a better chance at being a member of the team that

will give me the opportunity for continued professional growth, also

improving myself at all times as I believe one should never stop

learning as it is the best way to become successful in the future.

Therefore I always strive towards self development in all aspects.

Experience

TELUS In

In [50]:
#[resume_section.lower() for resume_section in cs.RESUME_SECTIONS_GRAD}
x=set((map(lambda x: x.lower(), cs.RESUME_SECTIONS_GRAD)))
x

{'accomplishments',
 'career objective',
 'certifications',
 'education',
 'experience',
 'interests',
 'leadership',
 'objective',
 'professional experience',
 'projects',
 'publications',
 'skills',
 'summary'}

In [69]:
def extract_entity_sections_grad(text):
    '''
    Helper function to extract all the raw text from sections of resume
    specifically for graduates and undergraduates

    :param text: Raw text of resume
    :return: dictionary of entities
    '''
    text_split = [i.strip() for i in text.split('\n')]
    sections_set =set((map(lambda x: x.lower(), cs.RESUME_SECTIONS_GRAD)))
    sections_lst =list((map(lambda x: x.lower(), cs.RESUME_SECTIONS_GRAD)))
    # sections_in_resume = [i for i in text_split if i.lower() in sections]
    entities = {}
    key = False
    for phrase in text_split:
        if len(phrase) == 1:
            p_key = phrase
        else:
            p_key = set(phrase.lower().split()) & sections_set
        try:
            p_key = list(p_key)[0]
        except IndexError:
            pass
        if p_key in sections_lst:
            entities[p_key] = []
            key = p_key
        elif key and phrase.strip():
            entities[key].append(phrase)
    return entities


In [70]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    raw_text= extract_text(path, '.pdf')
    entities = extract_entity_sections_grad(raw_text)
    print(entities)
    print('----------------------------')

{'skills': ['Computer Vision', 'Data Analysis', 'Natural Language Processing (NLP)'], 'certifications': ['twitter hall of fame', 'Microsoft Hall of fame', 'Microsoft Students Stars', 'Competition', 'Yahoo hall of fame', 'Mohamed Nassar', 'Machine Learning Team Lead at Synapse Analytics|MSc Candidate', 'in Computer Communication Engineering at Cairo University | AI', 'Instructor', 'Giza'], 'summary': [], 'experience': ['Synapse Analytics', '2 years 1 month', 'Machine Learning Team Lead', 'May 2021\xa0-\xa0Present\xa0(8 months)', 'Cairo, Egypt', 'Deep Learning Engineer/Data Scientist.', 'December 2019\xa0-\xa0November 2021\xa0(2 years)', 'Cairo Governorate, Egypt', 'Self Employed', '9 years', '.Net Developer', 'January 2010\xa0-\xa0December 2018\xa0(9 years)', 'Security Researcher', 'January 2013\xa0-\xa0January 2016\xa0(3 years 1 month)', 'Valeo', 'Embedded Software Engineer', 'Page 1 of 2', 'July 2018\xa0-\xa0November 2018\xa0(5 months)', 'Cairo Governorate, Egypt', 'ERC Team - Faculty

{'certifications': ['Python for Data Science', 'Machine Learning', 'IBM Watson V3', 'Mathematics for Machine Learning:', 'Linear Algebra', 'Artificial Intelligence V2', 'Ashraf Elnagar', 'Business Intelligence Developer |Data Analyst', 'رصم', 'ةربخلا', 'Information Technology Institute (ITI)', 'Business Intelligence Specialist', '٢٠٢١ ربمفون\u200f\xa0-\xa0Present\xa0(نارهش)', 'Markaz El Mansoura, Ad Daqahliyah, Egypt', 'CAT Reloaded', 'Member of Data Science circle at CAT Reloaded', 'دحاو رهش دحاو ماع)\xa0٢٠٢١ سرام\u200f\xa0-\xa0٢٠٢٠ سرام\u200f)', 'TIEC', 'Tiec Ambassador', 'دحاو رهش دحاو ماع)\xa0٢٠٢١ رياني\u200f\xa0-\xa0٢٠٢٠ رياني\u200f)', 'Egypt', 'IBM', 'Artificial Intelligence Analyst Trainee', 'دحاو رهش)\xa0٢٠٢٠ ربوتكأ\u200f\xa0-\xa0٢٠٢٠ ربوتكأ\u200f)', 'ميلعتلا', 'قيزاقزلا ةعماج', 'بلاط,\xa0computer science\xa0·\xa0(2017\xa0-\xa02021)', 'Page 1 of 1']}
----------------------------
{'skills': ['12-Business', '13-Feature Engineering', '14-Data Exploration', '15-Web Scraping', '16- 

{'skills': ['Python', 'Machine Learning', 'Data Science', 'Languages', 'English (Professional Working)', 'Arabic (Native or Bilingual)'], 'certifications': ['Applied Data Science with Python', 'Specialization', 'The Data Scientist’s Toolbox', 'Applied Plotting, Charting & Data', 'Representation in Python', 'Introduction to Probability and Data', 'Applied Text Mining in Python', 'Ahmed Hindi', 'Senior Machine Learning Engineer at Indeed.com', 'Cairo'], 'summary': ['Data Scientist familiar with gathering, cleaning, and organizing data', 'for use by technical and non-technical personnel. Due to working'], 'experience': ['Indeed.com', 'Senior Machine Learning Engineer', 'August 2021\xa0-\xa0Present\xa0(5 months)', 'Upwork', 'Data Scientist/ Analyst', 'October 2016\xa0-\xa0Present\xa0(5 years 3 months)', 'https://www.upwork.com/freelancers/~01a5e81e04f550d5b5'], 'projects': ['allowed me to be adaptive, flexible, and efficient at adopting new', "technologies and getting up to speed in no tim

{'skills': ['Microsoft Power BI', 'Power bi', 'BI Analyst', 'Ali Shah'], 'summary': ['Visualization.'], 'experience': ['DeepCloud', 'Business Intelligence Analyst', 'November 2019\xa0-\xa0Present\xa0(2 years 2 months)'], 'education': ['University of Engineering and Technology, Lahore', 'BSCS,\xa0Web Page, Digital/Multimedia and Information Resources', 'Design\xa0·\xa0(2016\xa0-\xa02020)', 'Page 1 of 1']}
----------------------------
{'skills': ['English', 'Teamwork', 'Communication'], 'summary': ['IT Undergraduate, AI Enthusiast.', 'A fast learner and a good team worker, able to work in different', 'circumstances and working environments.', 'Have some working experiences in C, C++, Python, TensorFlow', 'HTML, CSS, Java Script, My SQL etc.'], 'experience': ['Freelance', 'January 2019\xa0-\xa0Present\xa0(3 years)', 'Kalmunai, Sri Lanka'], 'education': ['Rajarata University of Sri Lanka', 'Bachelor of Science - BS,\xa0Information Technology\xa0·\xa0(2017\xa0-\xa02021)', 'Aquinas College o

{'summary': [], 'skills': ['Python (Basic)', 'Graduated from Modern Academy with a Bachelor degree in'], 'certifications': ['Being unique in your career', 'Introduction to the Internet of Things', 'and Embedded Systems', 'An End to End Deep Learning', 'Training'], 'projects': ['Technology with a deep level of details and working real-world apps.', 'topics: image processing techniques, feature matching, Stereo Vision, camera', 'calibration, Epi polar', 'Geometry, Depth Measurement, segmentation, object detection, etc', 'Electro Pi', 'Deep Learning Trainee', 'October 2020\xa0-\xa0February 2021\xa0(5 months)', 'Cairo, Egypt', '• Implementing machine learning models: supervised and unsupervised.', '• implementing. Deep learning models: ANN, CNN, and RNN.', '• Tools and libraries: NumPy, Pandas, scikit-learn, OpenCV, TensorFlow,', 'Keras, etc.', '• make data preprocessing', '• working with different data analysis types', 'Enactus', 'Project Manager', 'August 2019\xa0-\xa0August 2020\xa0(1 y

{'summary': [], 'experience': ['Blessings Company', 'Web Developer', 'January 2019\xa0-\xa0Present\xa0(3 years)', 'Cairo Governorate, Egypt', 'Freelance, self-employed', 'Full Stack Web Developer', 'January 2018\xa0-\xa0Present\xa0(4 years)', 'Technocolabs', 'Computer Vision Intern', 'March 2021\xa0-\xa0May 2021\xa0(3 months)', 'Karizma', 'Shop Manager', 'June 2018\xa0-\xa0January 2019\xa0(8 months)', 'Cairo Governorate, Egypt', 'FRESH ELECTRIC FOR HOME APPLIANCES', 'Customer Service Representative', 'April 2018\xa0-\xa0June 2018\xa0(3 months)', 'Cairo Governorate, Egypt'], 'skills': ['Deep Learning', 'Full-Stack Development', 'Computer Vision', 'Languages', 'Arabic (Native or Bilingual)', 'German (Limited Working)', 'English (Full Professional)'], 'certifications': ['Improving Deep Neural Networks:', 'Hyperparameter tuning,', 'Regularization and Optimization', 'Neural Networks and Deep Learning'], 'education': ['Page 1 of 2', "Faculty of Computers and AI, Helwan's University", "Bachel

{'skills': ['Problem Solving', 'C++', 'jQuery', 'Ahmed Shaheen', 'Data Science Student', 'Al Qalyubiyah'], 'summary': ['Student'], 'experience': ['Benha University', 'Student', 'September 2018\xa0-\xa0Present\xa0(3 years 4 months)'], 'education': ['benha university faculty of science', "Associate's degree,\xa0Computer Science\xa0·\xa0(2019\xa0-\xa02022)", 'Page 1 of 1']}
----------------------------
{'skills': ['have gained throughout my studies. My goal is to improve my career', 'in order to gain a better chance at being a member of the team that', 'will give me the opportunity for continued professional growth, also', 'improving myself at all times as I believe one should never stop', 'learning as it is the best way to become successful in the future.', 'Therefore I always strive towards self development in all aspects.'], 'certifications': ['Data Analysis Professional Udacity', 'Nanodegree', 'Data Analysis Challenger Track -', 'FWD', 'Muhammad Alaa El-Deen', 'Personalized Ads Assess

In [14]:
#£__file__
# 'C:\Users\mmazhar\Downloads\CV Mining\data'
nlp = spacy.load(os.path.dirname(os.path.abspath('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data')))
# resumes = '/home/omkarpathak27/Documents/GITS/resumeparser/resumes/'
# text_raw    = extract_text(resume, '.pdf')
# text        = ' '.join(text_raw.split())
# print(text)
# for resume in os.listdir(resumes):
# text_raw = extract_text(
#     '../data/Profile (1).pdf',
#     '.pdf'
# )




In [26]:
for resume in os.listdir('C:\\Users\\mmazhar\\Downloads\\CV Mining\\pyresparser\\data'):
    #print(resume)
    path='data/'+resume
    text_raw = extract_text(path,'.pdf')
    #print(text_raw)
    entity   = extract_entity_sections_grad(text_raw)
    print(entity)
    print('----------------------------')
# if 'experience' in entity.keys():
#     doc2 = nlp(text_raw)
#     entities = {}
#     for ent in doc2.ents:
#         if ent.label_ not in entities.keys():
#             entities[ent.label_] = [ent.text]
#         else:
#             entities[ent.label_].append(ent.text)
#     for key in entities.keys():
#         entities[key] = list(set(entities[key]))
#     print(entities)
#     print(doc2.ents)

{'skills': ['Computer Vision', 'Data Analysis', 'Natural Language Processing (NLP)'], 'certifications': ['twitter hall of fame', 'Microsoft Hall of fame', 'Microsoft Students Stars', 'Competition', 'Yahoo hall of fame', 'Mohamed Nassar', 'Machine Learning Team Lead at Synapse Analytics|MSc Candidate', 'in Computer Communication Engineering at Cairo University | AI', 'Instructor', 'Giza'], 'summary': [], 'experience': ['Synapse Analytics', '2 years 1 month', 'Machine Learning Team Lead', 'May 2021\xa0-\xa0Present\xa0(8 months)', 'Cairo, Egypt', 'Deep Learning Engineer/Data Scientist.', 'December 2019\xa0-\xa0November 2021\xa0(2 years)', 'Cairo Governorate, Egypt', 'Self Employed', '9 years', '.Net Developer', 'January 2010\xa0-\xa0December 2018\xa0(9 years)', 'Security Researcher', 'January 2013\xa0-\xa0January 2016\xa0(3 years 1 month)', 'Valeo', 'Embedded Software Engineer', 'Page 1 of 2', 'July 2018\xa0-\xa0November 2018\xa0(5 months)', 'Cairo Governorate, Egypt', 'ERC Team - Faculty

{'skills': ['Web Development', 'Design', 'Graphics'], 'certifications': ['Certificate of completion -', 'Introduction to Data and Data', 'Science', 'Ahmed Fathi', 'data science enthusiast || civil engineer || future data scientist', 'Giza'], 'summary': ['Hello,', 'I Am Ahmed Fathi', 'Front End Developer', 'Develop Myself'], 'experience': ['And Trained A Lot, I Am A Fast Learner And Always Love To', 'National Telecommunication Institute (NTI)', 'Student', 'October 2021\xa0-\xa0Present\xa0(3 months)', 'PETROJET', 'Trainee', 'June 2019\xa0-\xa0August 2020\xa0(1 year 3 months)', 'worked as a trainee civil engineer in somid project'], 'education': ['technology', 'technical,\xa0Computer Systems Networking \xa0·\xa0(2013\xa0-\xa02015)', 'Ain Shams University', "Bachelor's degree,\xa0Civil Engineering\xa0·\xa0(2015\xa0-\xa02020)", 'National Telecommunication Institute (NTI)', 'Higher National Diploma,\xa0web design\xa0·\xa0(2021\xa0-\xa02021)', 'Page 1 of 1']}
----------------------------
{}
-

{'skills': [], 'leadership': ['Arduino', 'Electronics'], 'certifications': ['Convolutional Neural Networks', 'HCIA-AI', 'Structuring Machine Learning'], 'projects': ['Start an introduction on UX,UI , HTML and CSS', 'DevisionX', 'Machine Learning Intern', 'September 2020\xa0-\xa0November 2020\xa0(3 months)', 'Cairo, Egypt', 'Page 1 of 2', 'Machine Learning Engineer Internship specifically in Computer Vision', 'Working on Building AutoAI tool such as AutoKeras, KerasTuner, and Optuna', 'Build classification models and Object Detection models ( with different formats', 'like COCO,UDT , YOLO)', 'Deployment the models on the web', 'Build Script to convert between UDT format to COCO format'], 'summary': ['Majoring in Computer Engineering, a technology and science', 'enthusiast with the ability to learn rapidly and cooperate. Very', 'flexible and willing to be pushed to the limits when it comes to', 'working. Ex-intern at DevisonX as well as YAT both worked as', 'a software engineer as Machin

{'certifications': ['AWS Machine Learning Foundations', 'Intermediate Python', 'Introduction to Data Visualization', 'with Plotly in Python', 'Amr El Agoz', 'Artificial Intelligence Intern at ITI | Machine Learning Engineer |', 'Embedded Systems Engineer', 'Egypt'], 'summary': ['Fresh graduate embedded systems engineer, worked with various', 'platforms and programming languages including:', 'Introduction to Python', 'Microcontrollers: AVR, Tiva C LaunchPad, STM32F4xx, ESP8266', 'Introduction to Statistics in Python', 'NodeMCU', 'Programming languages: C, C++, Python', 'Tools: Eclipse IDE, KEIL IDE, Atmel Studio', '>Also worked with machine learning and communication protocols', 'such:', 'Analysis)', 'Machine learning algorithms: Linear Regression, Logistic', 'Regression, SVM (support vector machine), K-Means Clustering,', 'ANN (Artificial Neural Networks), PCA (Principal Component', 'Communication protocols: MQTT, HTTP', 'Platforms: AzureML, Adafruit, IFTTT'], 'experience': ['ITI', 'Ar

{'skills': ['SQL', 'C#', 'Machine Learning', 'Languages', 'English (Professional Working)', 'Arabic (Native or Bilingual)'], 'certifications': ['Big Data Engineer - Explorer Award', 'for Students 2018', 'Machine Learning Using SAS Viya', 'Big Data Engineer - Mastery Award', 'for Students 2018', 'Deep Learning Specialization', 'OFSAA - Financial Crime and', 'Compliance Management (FCCM)', 'Solution Engineer Specialist', 'Fakhraddin Alwajih', 'Data Scientist at Ibtikar Technologies', 'Egypt'], 'experience': ['Ibtikar Technologies', 'Data Scientist', 'July 2021\xa0-\xa0Present\xa0(6 months)', 'DataGear', '4 years 8 months', 'Data Scientist', 'January 2020\xa0-\xa0Present\xa0(2 years)', 'ETL Developer', 'May 2017\xa0-\xa0December 2019\xa0(2 years 8 months)', 'Faculty of Computers and Information - Cairo University', 'PhD Candidate', 'February 2017\xa0-\xa0Present\xa0(4 years 11 months)', 'Giza, Al Jizah, Egypt', 'Ibb University', 'Software Engineer | Assistant Lecturer', 'April 2014\xa0-\x

----------------------------
{'skills': [], 'certifications': ['Applied Plotting, Charting & Data', 'Representation in Python', 'Improving Deep Neural Networks:', 'Hyperparameter Tuning,', 'Regularization and Optimization', 'Convolutional Neural Networks', 'Applied Machine Learning in Python', 'Machine Learning', 'Islam Elsayed', 'AI trainee at ITI | Machine learning engineer | Machine learning', 'intern at Tekomoro', 'Alexandria'], 'summary': ['I love writing code. Ever since writing my first program in python and', 'manipulating it to produce a desired output, I have been obsessed', 'with the idea of using software to solve practical problems. I believe', 'in the power of programming to transform and improve the lives of', 'people around the world.', 'I am a data scientist, I graduated from faculty of electronic', 'engineering computer science department. I love programming. I'], 'projects': [], 'experience': ['Tekomoro', 'Machine Learning Intern', 'October 2021\xa0-\xa0Present\xa0(3

{'skills': ['Python', 'Data Analysis', 'Machine Learning', 'English (Professional Working)', 'Arabic (Native or Bilingual)'], 'certifications': ['sololearn JavaScript', 'python', 'Process Data from Dirty to Clean', 'Prepare Data for Exploration', 'Google Data Analytics Certificate', 'Ahmed Ayman', 'data analyst', 'Egypt'], 'summary': [], 'experience': ['Self-employed', 'Developer', 'January 2019\xa0-\xa0Present\xa0(3 years)', 'Egypt'], 'education': ['assuit university', "Bachelor's degree,\xa0Computer Science , Information System \xa0·\xa0(2014\xa0-\xa02018)", 'Page 1 of 1']}
----------------------------
{'skills': ['Teamwork', 'Java', 'C (Programming Language)', 'Muhammad Arslan', 'Software Quality Assurance Engineer', 'Lahore'], 'summary': ['I am learning the aesthetics of software quality assurance and', 'procuring competence in automation technologies. I am more', 'interested in seamless delivery and maintainence of projects.'], 'experience': ['Devsinc', '1 year 4 months', 'Softwar

In [71]:
# entity   = extract_entity_sections_grad(text_raw)
# if 'experience' in entity.keys():
doc2 = nlp(text_raw)
entities = {}
for ent in doc2.ents:
    if ent.label_ not in entities.keys():
        entities[ent.label_] = [ent.text]
    else:
        entities[ent.label_].append(ent.text)
for key in entities.keys():
    entities[key] = list(set(entities[key]))
print(entities)
# print(doc2.ents)


TypeError: object of type 'generator' has no len()