In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv("UpdatedResumeDataset.csv")
df

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB


In [4]:
df['Category'].value_counts()

Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: Category, dtype: int64

In [5]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

In [6]:
df['cleanedResume'] = df['Resume'].apply(lambda x: cleanResume(x))
df

Unnamed: 0,Category,Resume,cleanedResume
0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...
...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,Computer Skills Proficient in MS office Word B...
958,Testing,â Willingness to accept the challenges. â ...,Willingness to a ept the challenges Positive ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",PERSONAL SKILLS Quick learner Eagerness to lea...
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,COMPUTER SKILLS SOFTWARE KNOWLEDGE MS Power Po...


In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import FunctionTransformer

In [9]:
models = {
    'logistic_regression' : {
        'model' : LogisticRegression(max_iter=10000),
        'params' : {
            'C' : [1,5,10,15,20],
            'solver' : ['lbfgs', 'liblinear']
        }
    },
    'decision_tree' : {
        'model' : DecisionTreeClassifier(),
        'params' : {
            'criterion' : ['gini', 'entropy']
        }
    },
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'criterion' : ['gini', 'entropy'],
            'n_estimators' : [10,20,30,40,50,100]
        }
    },
#     'gaussian_nb' : {
#         'model' : GaussianNB(),
#         'params' : {}
#     },
    'svm' : {
        'model' : SVC(gamma='auto'),
        'params' : {
            'C' : [1,5,10,15,20],
            'kernel' : ['linear', 'rbf']
        }
    },
    'knn' : {
        'model' : KNeighborsClassifier(),
        'params' : {
            'n_neighbors' : [3,5,7,11,13,15,17,19,21],
            'p' : [1,2]
        }
    }
}

In [11]:
le = LabelEncoder()

In [12]:
df['new_categ'] = le.fit_transform(df['Category'])
df

Unnamed: 0,Category,Resume,cleanedResume,new_categ
0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...,6
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...,6
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...,6
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...,6
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...,6
...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,Computer Skills Proficient in MS office Word B...,23
958,Testing,â Willingness to accept the challenges. â ...,Willingness to a ept the challenges Positive ...,23
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",PERSONAL SKILLS Quick learner Eagerness to lea...,23
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,COMPUTER SKILLS SOFTWARE KNOWLEDGE MS Power Po...,23


In [13]:
le.fit(df['Category'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_name_mapping

{'Advocate': 0,
 'Arts': 1,
 'Automation Testing': 2,
 'Blockchain': 3,
 'Business Analyst': 4,
 'Civil Engineer': 5,
 'Data Science': 6,
 'Database': 7,
 'DevOps Engineer': 8,
 'DotNet Developer': 9,
 'ETL Developer': 10,
 'Electrical Engineering': 11,
 'HR': 12,
 'Hadoop': 13,
 'Health and fitness': 14,
 'Java Developer': 15,
 'Mechanical Engineer': 16,
 'Network Security Engineer': 17,
 'Operations Manager': 18,
 'PMO': 19,
 'Python Developer': 20,
 'SAP Developer': 21,
 'Sales': 22,
 'Testing': 23,
 'Web Designing': 24}

In [14]:
def trans_func(inp):
    inp = inp.apply(lambda x: cleanResume(x))
    return inp.values

In [15]:
transformer = FunctionTransformer(trans_func)

In [16]:
pipeline = Pipeline([
    ('cleaner',transformer),
    ('vectorizer',TfidfVectorizer()),
    ('model', LogisticRegression())
]) 

In [17]:
x_train, x_test, y_train, y_test = train_test_split(df['Resume'],df['new_categ'],test_size = 0.25,random_state=10,stratify=df['new_categ'])

In [18]:
pipeline.fit(x_train,y_train)

In [19]:
pipeline.score(x_test,y_test)

0.966804979253112

In [20]:
res = df['Resume'][134]
res

"â\x80¢ Good communication skill â\x80¢ Quick learner â\x80¢ Keen to find solutionsEducation Details \r\n MBA Marketing and International Business Management Pune, Maharashtra Pune University\r\n B-Tech Tech Nagpur, Maharashtra RTM Nagpur University\r\nG.M. Arts, Commerce & Science \r\n\r\nG.M. Arts, Commerce & Science\r\nSkill Details \r\nCompany Details \r\ncompany - Samarth College\r\ndescription - of Engineering        30          7        210 \r\n5      College to campus             VJ College of Pharmacy         10 days' workshop       10\r\n\r\nG.M. Arts, Commerce & Science 6          Soft Skills                                               6 days' workshop           6\r\nCollege\r\n\r\nPersonality             G.M. Institute of Agricultural 7\t\t6 days' workshop           6\r\nDevelopment                        Diploma \r\n8          Soft Skills           Samarth College of Polytechnic     20 days' workshop       20\r\n\r\nTOTAL                                              350\

In [21]:
pipeline.predict(pd.Series([res]))

array([1])

In [22]:
scores = []
for model_name, mp in models.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    pipeline = Pipeline([
    ('cleaner',transformer),
    ('vectorizer',TfidfVectorizer()),
    ('model', clf)
    ]) 
    pipeline.fit(df['Resume'],df['new_categ'])
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df1 = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df1

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.995855,"{'C': 1, 'solver': 'lbfgs'}"
1,decision_tree,0.995855,{'criterion': 'gini'}
2,random_forest,0.997927,"{'criterion': 'gini', 'n_estimators': 20}"
3,svm,0.995855,"{'C': 1, 'kernel': 'linear'}"
4,knn,0.987543,"{'n_neighbors': 3, 'p': 2}"


In [27]:
pipeline = Pipeline([
    ('cleaner',transformer),
    ('vectorizer',TfidfVectorizer()),
    ('model', RandomForestClassifier(criterion='gini',n_estimators=10))
]) 

In [28]:
x_train, x_test, y_train, y_test = train_test_split(df['Resume'],df['new_categ'],test_size = 0.25,random_state=10,stratify=df['new_categ'])

In [29]:
pipeline.fit(x_train,y_train)

In [30]:
pipeline.score(x_test,y_test)

0.979253112033195

In [31]:
import pickle
with open('model_pipe','wb') as f:
    pickle.dump(pipeline,f)

In [32]:
with open('model_pipe','rb') as f:
    mod2 = pickle.load(f)

In [33]:
mod2.predict(df['Resume'])

array([ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
       24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
       24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 22