In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [8]:
import PyPDF2 as pdf
file = open('android-developer-1559034496.pdf', 'rb')
file

<_io.BufferedReader name='android-developer-1559034496.pdf'>

In [9]:
pdf_reader = pdf.PdfFileReader(file)
pdf_reader

<PyPDF2._reader.PdfFileReader at 0x1a514f8d9d0>

In [10]:
pdf_reader.getNumPages()
page1 = pdf_reader.getPage(0)
resumeText = page1.extractText()[:1050]
resumeText                                                                                       ##Extracted text = "resumeText"

'Android Developer\nROBERT SMITHPhone: (123) 456 78 99 \nEmail: info@qwikresume.com\nWebsite: www.qwikresume.com\nLinkedIn:\nlinkedin.com/qwikresume\nAddress: 1737 Marshville Road,\nAlabama.\nObjective\nOver 6 years of IT industry experience with 4+ years of experience as Mobile application \ndeveloper in the field of Android. Experience in developing front end applications for Android \nphones. Experience developing mobile applications on Android platform, building Custom UI using \nViews, ViewGroups, Layouts, Widgets and graphics that scale based on the screen size using 9-\npatch images, localization, testing and publishing the applications to the Android Market.\nSkills\nPython, Java, C, Javascript, Matlab, R.\nWork Experience\nAndroid Developer\nABC Corporation \xa0\xad\xa0January 2011 – March 2012  \n\uf09fEnvironment Eclipse IDE, Android Studio, GenyMotion, Java, Android SDK, Android \nDevelopment T ools (ADT), JSON, XML,\n\uf09fInvolved in the full life cycle of the project inc

### Regular Expression

In [11]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

# 1. Tokenize

In [13]:
removed = cleanResume(resumeText)
print(removed)

Android Developer ROBE SMITHPhone 123 456 78 99 Email info Website www qwikresume com LinkedIn linkedin com qwikresume Address 1737 Marshville Road Alabama Objective Over 6 years of IT industry experience with 4 years of experience as Mobile application developer in the field of Android Experience in developing front end applications for Android phones Experience developing mobile applications on Android platform building Custom UI using Views ViewGroups Layouts Widgets and graphics that scale based on the screen size using 9 patch images localization testing and publishing the applications to the Android Market Skills Python Java C Javascript Matlab R Work Experience Android Developer ABC Corporation January 2011 March 2012 Environment Eclipse IDE Android Studio GenyMotion Java Android SDK Android Development T ools ADT JSON XML Involved in the full life cycle of the project including analysis design development debugging testing and deployment Developed the app


In [15]:
import nltk
import re
from nltk.corpus import stopwords
sentences = nltk.sent_tokenize(removed)
print(sentences)

['Android Developer ROBE SMITHPhone 123 456 78 99 Email info Website www qwikresume com LinkedIn linkedin com qwikresume Address 1737 Marshville Road Alabama Objective Over 6 years of IT industry experience with 4 years of experience as Mobile application developer in the field of Android Experience in developing front end applications for Android phones Experience developing mobile applications on Android platform building Custom UI using Views ViewGroups Layouts Widgets and graphics that scale based on the screen size using 9 patch images localization testing and publishing the applications to the Android Market Skills Python Java C Javascript Matlab R Work Experience Android Developer ABC Corporation January 2011 March 2012 Environment Eclipse IDE Android Studio GenyMotion Java Android SDK Android Development T ools ADT JSON XML Involved in the full life cycle of the project including analysis design development debugging testing and deployment Developed the app']


In [16]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

words_without_stopwords = [word for word in sentences if word not in stop_words]
print(words_without_stopwords)

['Android Developer ROBE SMITHPhone 123 456 78 99 Email info Website www qwikresume com LinkedIn linkedin com qwikresume Address 1737 Marshville Road Alabama Objective Over 6 years of IT industry experience with 4 years of experience as Mobile application developer in the field of Android Experience in developing front end applications for Android phones Experience developing mobile applications on Android platform building Custom UI using Views ViewGroups Layouts Widgets and graphics that scale based on the screen size using 9 patch images localization testing and publishing the applications to the Android Market Skills Python Java C Javascript Matlab R Work Experience Android Developer ABC Corporation January 2011 March 2012 Environment Eclipse IDE Android Studio GenyMotion Java Android SDK Android Development T ools ADT JSON XML Involved in the full life cycle of the project including analysis design development debugging testing and deployment Developed the app']


In [17]:
all_words = [nltk.word_tokenize(sent) for sent in sentences]
print(all_words)


[['Android', 'Developer', 'ROBE', 'SMITHPhone', '123', '456', '78', '99', 'Email', 'info', 'Website', 'www', 'qwikresume', 'com', 'LinkedIn', 'linkedin', 'com', 'qwikresume', 'Address', '1737', 'Marshville', 'Road', 'Alabama', 'Objective', 'Over', '6', 'years', 'of', 'IT', 'industry', 'experience', 'with', '4', 'years', 'of', 'experience', 'as', 'Mobile', 'application', 'developer', 'in', 'the', 'field', 'of', 'Android', 'Experience', 'in', 'developing', 'front', 'end', 'applications', 'for', 'Android', 'phones', 'Experience', 'developing', 'mobile', 'applications', 'on', 'Android', 'platform', 'building', 'Custom', 'UI', 'using', 'Views', 'ViewGroups', 'Layouts', 'Widgets', 'and', 'graphics', 'that', 'scale', 'based', 'on', 'the', 'screen', 'size', 'using', '9', 'patch', 'images', 'localization', 'testing', 'and', 'publishing', 'the', 'applications', 'to', 'the', 'Android', 'Market', 'Skills', 'Python', 'Java', 'C', 'Javascript', 'Matlab', 'R', 'Work', 'Experience', 'Android', 'Develo

In [2]:
# Import the required Module
import tabula
# Read a PDF File
df = tabula.read_pdf("c.pdf", pages='all')[0]
# convert PDF into CSV
tabula.convert_into("c.pdf", "iplmatch.csv", output_format="csv", pages='all')
print(df)

IndexError: list index out of range

In [19]:

import spacy
nlp = spacy.load("en_core_web_sm")

names_string = removed
doc = nlp(names_string)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

LinkedIn 86 94 PERSON
Address 1737 Marshville Road Alabama Objective 119 165 WORK_OF_ART
6 years 171 178 DATE
4 years 210 217 DATE
Mobile 235 241 GPE
Android 340 347 ORG
Android 400 407 ORG
Custom UI 426 435 ORG
Views 442 447 ORG
ViewGroups Layouts Widgets 448 474 PRODUCT
9 530 531 CARDINAL
the Android Market Skills Python 601 633 ORG
January 2011 March 2012 711 734 DATE
ADT 829 832 ORG
Developed 960 969 PERSON


In [23]:
import spacy
nlp = spacy.load("en_core_web_sm")
names_stringg = names_string
doc = nlp(names_stringg)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

LinkedIn 86 94 PERSON
Address 1737 Marshville Road Alabama Objective 119 165 WORK_OF_ART
6 years 171 178 DATE
4 years 210 217 DATE
Mobile 235 241 GPE
Android 340 347 ORG
Android 400 407 ORG
Custom UI 426 435 ORG
Views 442 447 ORG
ViewGroups Layouts Widgets 448 474 PRODUCT
9 530 531 CARDINAL
the Android Market Skills Python 601 633 ORG
January 2011 March 2012 711 734 DATE
ADT 829 832 ORG
Developed 960 969 PERSON
