# Load Dataset

In [1]:
import pandas as pd

In [2]:
resumes = pd.read_csv("./resume.csv", delimiter=",")

In [3]:
resumes

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR
...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION


In [91]:
categories = dict.fromkeys(resumes["Category"])

In [92]:
categories

{'HR': None,
 'DESIGNER': None,
 'INFORMATION-TECHNOLOGY': None,
 'TEACHER': None,
 'ADVOCATE': None,
 'BUSINESS-DEVELOPMENT': None,
 'HEALTHCARE': None,
 'FITNESS': None,
 'AGRICULTURE': None,
 'BPO': None,
 'SALES': None,
 'CONSULTANT': None,
 'DIGITAL-MEDIA': None,
 'AUTOMOBILE': None,
 'CHEF': None,
 'FINANCE': None,
 'APPAREL': None,
 'ENGINEERING': None,
 'ACCOUNTANT': None,
 'CONSTRUCTION': None,
 'PUBLIC-RELATIONS': None,
 'BANKING': None,
 'ARTS': None,
 'AVIATION': None}

# Data preprocessing

In [4]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deepalimane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatize = WordNetLemmatizer()

In [6]:
corpus = []

for i in range(len(resumes)):
    resume = re.sub("^[a-zA-z]", " ", str(resumes["Resume_str"][i]))
    resume = resume.lower()
    resume = resume.split()
    
    #remove stopwords and get words to normal form
    resume = [lemmatize.lemmatize(word) for word in resume if word not in stopwords.words('english')]
    resume = " ".join(resume)
    corpus.append(resume)

In [7]:
corpus

["hr administrator/marketing associate hr administrator summary dedicated customer service manager 15+ year experience hospitality customer service management. respected builder leader customer-focused teams; strives instill shared, enthusiastic commitment customer service. highlight focused customer satisfaction team management marketing savvy conflict resolution technique training development skilled multi-tasker client relation specialist accomplishment missouri dot supervisor training certification certified ihg customer loyalty marketing segment hilton worldwide general manager training certification accomplished trainer cross server hospitality system hilton onq , micros opera pm , fidelio opera reservation system (ors) , holidex completed course seminar customer service, sale strategies, inventory control, loss prevention, safety, time management, leadership performance assessment. experience hr administrator/marketing associate hr administrator dec 2013 current company name － c

# Words to Vectors using TFIDF

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=2000)

In [52]:
X = tfidf.fit_transform(corpus).toarray()

In [53]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0332, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0364, 0.0691, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0455, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0177, 0, 0, 0, 0],
       [0, 0.0474, 0, 0, 0, 0.0276, 0, 0, 0, 0.0575, 0.0426, 0.0315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.0446, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0.025, 0.025, 0.025, 0.0458, 0.0226, 0, 0, 0.0477, 0.0177, 0, 0, 0, 0, 0, 0.0266, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0306, 0.03, 0, 0.0553, ..., 0, 0, 0, 0, 0, 0, 0.0204, 0.0247, 0, 0.0282, 0, 0, 0, 0.0547, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.071, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0691, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0585, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0.0796, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0495, 0, 0, 0, 0.0278, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0173, 0, 0, 0, 0, 0, 0, 0, 

In [54]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, 
                    formatter=dict(float=lambda x : "%.3g" % x))

In [55]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0332, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0364, 0.0691, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0455, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0177, 0, 0, 0, 0],
       [0, 0.0474, 0, 0, 0, 0.0276, 0, 0, 0, 0.0575, 0.0426, 0.0315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.0446, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0.025, 0.025, 0.025, 0.0458, 0.0226, 0, 0, 0.0477, 0.0177, 0, 0, 0, 0, 0, 0.0266, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0306, 0.03, 0, 0.0553, ..., 0, 0, 0, 0, 0, 0, 0.0204, 0.0247, 0, 0.0282, 0, 0, 0, 0.0547, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.071, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0691, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0585, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0.0796, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0495, 0, 0, 0, 0.0278, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0173, 0, 0, 0, 0, 0, 0, 0, 

In [56]:
y = resumes["Category"]

In [57]:
from sklearn.preprocessing import LabelEncoder

In [58]:
le = LabelEncoder()
y = le.fit_transform(resumes["Category"])

In [59]:
y

array([19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, ...,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6])

# Model Building (Using Decision Tress as this is mutiple classifier problem)

In [60]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(X,y, test_size=0.3 ,random_state=500)

In [61]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [62]:
model.fit(x_train, y_train)

In [63]:
y_pred = model.predict(x_test)

In [64]:
y_pred

array([ 7,  9,  7, 11, 11,  3, 19,  4, 21, 17, 13,  0,  2, 11, 23, 23, 12, 23, 19,  0,  2,  4,  7,  7, 10, 13, 22, 10,  9,  6,  7,  2, 23, 23,  8,  4, 14, 20, 17, 16, 14, 16, 19, 19, 12,  4,  9, 19,  0,  3,  1,  3,  4,  4, 23, 11, 16, 14, 17,  7,  1, 15, 19, 19,  6, 23, 12, 13,  5,  1, 15, 18, 21,  4, 23,  3, 18, 10,  2,  4,  0, 22, 14,  6, 18,  0, 15,  1,  4, 13,  1, 17, 16, 18, 20, 20, 11,  6, 17,  1, 12, 19,  6, 16,  5,  4, 18,  4, 23, 12, 14, 23, 15, 19, 11, 19,  3,  9, 18,  0, 10,  6, 15, 12, 23,  1, 16, 20,  6, 21, 15,  9, 20,  6, 23, 20, 14, 17,  1, 10, 17, 12,  1, 14,  3,  0,  0, 19,  7,  0, 22, 20, 16, 12, 15, 17, 23,  9, 17, 21, 14, 12,  5, 22,  2, 20, 21, 23,  4, 15, 22,  0, 18, 15, 13,  7,  9, 13, 17, 20,  4, 17,  6, 21,  1, 21,  9, 18, 13,  7, 15, 16, 13, 14, 15,  9, 14,  1,  5,  5, 18, 12, 20,  0, 21, 16, 15, 16, 11,  1,  4, 11, 22, 10, 14, 14, 17, 13, 16,  3, 19,  0, 18, 23, 11,  2, 19,  1,  0, 18,  9, 10, 13, 14,  1,  4, 16,  6, 17, 15, 22, 18, 20, 21, 18,  3, 13, 23,  

# Classifier Report

In [65]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75        38
           1       0.72      0.61      0.66        38
           2       0.33      0.32      0.32        19
           3       0.41      0.28      0.33        39
           4       0.12      0.16      0.14        31
           5       0.07      0.10      0.08        10
           6       0.69      0.58      0.63        38
           7       0.38      0.37      0.37        30
           8       0.33      0.14      0.20         7
           9       0.43      0.42      0.42        36
          10       0.71      0.79      0.75        28
          11       0.73      0.85      0.79        26
          12       0.46      0.35      0.40        37
          13       0.74      0.69      0.72        42
          14       0.60      0.64      0.62        33
          15       0.65      0.72      0.68        36
          16       0.56      0.61      0.58        31
          17       0.77    

# Predicted Labels (Decoded)

In [66]:
decoded = le.inverse_transform(y_pred)
print(decoded)

['BANKING' 'BUSINESS-DEVELOPMENT' 'BANKING' 'CONSTRUCTION' 'CONSTRUCTION' 'APPAREL' 'HR' 'ARTS' 'PUBLIC-RELATIONS' 'FITNESS' 'DESIGNER' 'ACCOUNTANT' 'AGRICULTURE' 'CONSTRUCTION' 'TEACHER' 'TEACHER' 'CONSULTANT' 'TEACHER' 'HR' 'ACCOUNTANT' 'AGRICULTURE' 'ARTS' 'BANKING' 'BANKING' 'CHEF' 'DESIGNER' 'SALES' 'CHEF' 'BUSINESS-DEVELOPMENT' 'AVIATION' 'BANKING' 'AGRICULTURE' 'TEACHER' 'TEACHER' 'BPO' 'ARTS' 'DIGITAL-MEDIA' 'INFORMATION-TECHNOLOGY' 'FITNESS' 'FINANCE' 'DIGITAL-MEDIA' 'FINANCE' 'HR' 'HR' 'CONSULTANT' 'ARTS' 'BUSINESS-DEVELOPMENT' 'HR' 'ACCOUNTANT' 'APPAREL' 'ADVOCATE' 'APPAREL' 'ARTS' 'ARTS' 'TEACHER' 'CONSTRUCTION' 'FINANCE' 'DIGITAL-MEDIA' 'FITNESS' 'BANKING' 'ADVOCATE' 'ENGINEERING' 'HR' 'HR' 'AVIATION' 'TEACHER' 'CONSULTANT' 'DESIGNER' 'AUTOMOBILE' 'ADVOCATE' 'ENGINEERING' 'HEALTHCARE' 'PUBLIC-RELATIONS' 'ARTS' 'TEACHER' 'APPAREL' 'HEALTHCARE' 'CHEF' 'AGRICULTURE' 'ARTS' 'ACCOUNTANT' 'SALES' 'DIGITAL-MEDIA' 'AVIATION' 'HEALTHCARE' 'ACCOUNTANT' 'ENGINEERING' 'ADVOCATE' 'ARTS

# Accuracy Report

In [67]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))

0.5482573726541555


In [84]:
sample = ["""HR SPECIALIST, US HR OPERATIONS

Summary:
Versatile media professional with background in Communications, Marketing, Human Resources and Technology.

Experience:
09/2015 – Present  
HR Specialist, US HR Operations  
- Managed communication regarding launch of Operations group, policy changes and system outages  
- Designed standard work and job aids to create training for new employees and contractors  
- Audited job postings, background checks, and drug screens  
- Conducted new hire benefits briefings  
- Liaised between HR Managers and vendors to resolve system issues  
- Provided real-time process improvement feedback  
- Rebranded US HR Operations SharePoint site  
- Led RFI/RFP for Background Check and Drug Screen vendors  

01/2014 – 05/2015  
IT, Marketing and Communications Co-op  
- Maintained and updated corporate SharePoint site with content and graphics  
- Drafted internal articles and newsletters  
- Supported internal/external event communications  
- Led launches for Digital Asset Management tools  
- Created executive presentations and video production for IT meetings  

10/2012 – 01/2014  
Relationship Coordinator / Marketing Specialist  
- Managed in-house advertising and marketing campaigns  
- Coordinated trade show activities and materials  
- Produced internal promotional content and tracked sales leads  
- Partnered with design team for branded marketing material  

09/2009 – 10/2012  
Assistant Head Teller  
- Achieved 100% internal audit score  
- Conducted ATM and teller audits  
- Promoted products and educated customers on services  
- Organized campus events and coached peers on programs  

Other:  
Senior Producer – 2014 SHU Media Exchange  
- Organized media event with industry professionals and 110+ attendees  
- Led panel discussion and post-event networking  

Education:  
2014 – MA, Corporate Communication & Public Relations – Sacred Heart University  
2013 – BA, Relational Communication – Western Connecticut State University  

Skills:  
Adobe Photoshop, ADP, Asset Management, Branding, Final Cut Pro, SharePoint, Lotus Notes, Marketing, HR, InDesign, Illustrator, Employee Development, RFP, Recruitment, Project Management, Video Production, Web Design, Microsoft Office, Quality, Content Creation, Customer Care

Label: **HR Specialist**
""","""
Summary
Detail-oriented Software Engineer with experience in full-stack development using modern frameworks and cloud infrastructure. Strong background in developing scalable web applications and cross-functional collaboration.

Experience
06/2020 – Present
Full Stack Developer
Company Name – City, State

Developed responsive web apps using React, Node.js, and MongoDB.

Implemented RESTful APIs and integrated third-party services like Stripe and Firebase.

Optimized performance and reduced page load times by 40%.

Led CI/CD automation using Jenkins and Docker.

Collaborated with UI/UX teams to translate designs into functional components.

Maintained 80%+ test coverage using Jest and Mocha.

07/2018 – 05/2020
Software Developer Intern
Company Name – City, State

Built internal tools for employee tracking using Angular and .NET Core.

Created SQL stored procedures and optimized legacy queries.

Participated in agile sprints and code reviews.

Education
2018
Bachelor of Technology: Computer Science
XYZ University – City, State

Skills
JavaScript, React, Node.js, Express, MongoDB, SQL, Git, Docker, Jenkins, REST APIs, Agile, CI/CD, Firebase""","""Summary
Customer-focused Product Manager with strong technical acumen and experience in SaaS. Known for translating customer needs into successful product features and delivering end-to-end product life cycle.

Experience
01/2021 – Present
Product Manager
Company Name – City, State

Led cross-functional teams to launch two new features, resulting in 25% user retention increase.

Defined product roadmap and prioritized backlog based on user feedback and data.

Conducted competitive analysis and market research.

Coordinated with engineering and design teams for agile development.

Measured product success through KPIs and user feedback.

08/2018 – 12/2020
Business Analyst
Company Name – City, State

Created user stories and acceptance criteria for product features.

Worked with stakeholders to define requirements and align product goals.

Managed JIRA dashboards and sprint reports.

Education
2018
Bachelor of Engineering: Information Technology
University Name – City, State

Skills
Product Roadmap, Agile, Scrum, User Stories, JIRA, Wireframing, Market Research, KPI Metrics, Product Strategy, SaaS, A/B Testing"""]

In [85]:
vectors = tfidf.transform(sample).toarray()

In [86]:
vectors

array([[0, 0.0494, 0, 0, 0, 0.0288, 0, 0, 0, 0.0599, 0.0444, 0.0329, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.0155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0.0525, 0.0518, 0.0547, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0.044, 0, 0, 0, 0, 0, 0, 0.0498, 0, 0, 0, 0, 0.046, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0.0406, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [87]:
predictions = model.predict(vectors)
print(le.inverse_transform(predictions))

['HR' 'INFORMATION-TECHNOLOGY' 'ENGINEERING']
