### NLP Project - Resume Classification

#### Model Building

In [1]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import os

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB 

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

#NLP PreProcessing
import nltk
import re
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
#Declaring the Classess Name Based on Our Resumes
ModelPath = '/content/drive/MyDrive/Original_Resumes'
Resume_Classes = ['Peoplesoft resumes','React resumes','SQL Developer Lightning insight','Workday resumes','Different resumes']

In [4]:
label = []
key = []
#Created a Function which will get a resume as 'key' & its coresponding class as 'labels'
def create_training_data():
    for category in Resume_Classes:
        path = os.path.join(ModelPath,category)
        class_num = Resume_Classes.index(category)
        for resume in os.listdir(path):
            label.append(category)
            key.append(resume)            

#Calling the Function
create_training_data()        

In [5]:
#Printing the Labels & Key First 10 Only.
print('Labels: \n{}\nLength of Labels: {}\nKeys: \n{}\nLength of Keys: {}'.format(label[:10], len(label), key[:10], len(key)))

Labels: 
['Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes']
Length of Labels: 79
Keys: 
['Resume_Subha Santosh_Peoplesoft FSCM.docx', 'Peoplesoft Finance_Arun Venu.doc', 'Peoplesoft FSCM_R Ahmed.doc', 'Peoplesoft Finance_Rahul Ahuja.doc', 'Peoplesoft Admin_Priyanka Ramadoss.doc', 'Peoplesoft Admin_Varkala Vikas.docx', 'PeopleSoft DBA_Ganesh Alladi.doc', 'Peoplesoft Admin_Gangareddy.doc', 'Peoplesoft Admin_AnubhavSingh.docx', 'Peoplesoft FSCM_Murali.docx']
Length of Keys: 79


In [6]:
# Creating a Dictionary of Key and Label (It will store Resumes as key & classs as value)
labelDict = dict(zip(key, label))

In [7]:
# Now Sorting it in alphabetical order of keys
finalDict = {}
for i in sorted(labelDict.keys()):
    finalDict[i] = labelDict[i]

In [8]:
# Let's Import Features.csv in which we had did feature extraction
Features = pd.read_csv('/content/Features.csv')
Features.head()

Unnamed: 0,Names,Mobile_Number,Email,Skills,Education,Years_of_Experience,links,Universities_Names
0,kotani durga,9112345678,abc@xyz.com,"['Visual', 'Interactive', 'Usability', 'Html',...",['BTech'],3.1,"['https://www.linkedin.com/fake', 'https://www...",['Education Details: B.Tech (Computer Science...
1,developer/ react,9112345678,abc@xyz.com,"['Scrum', 'Html', 'Website', 'Technical skills...",['B-Tech'],3.2,"['https://www.linkedin.com/fake', 'https://www...",['QUALIFICATION: B-Tech from JNTU-Kakinada Uni...
2,mareedu lokesh,9112345678,abc@xyz.com,"['Sales', 'Visual', 'Database', 'Testing', 'Se...",['BTech'],2.0,"['https://www.linkedin.com/fake', 'https://www...",[]
3,kamalakar reddy,777682196,abc@xyz.com,"['Visual', 'Ubuntu', 'Html', 'Html5', 'Js', 'U...",[],3.0,"['https://www.linkedin.com/fake', 'https://www...",['TITLE : lernbook DESCRIPTION: Fortunapix wo...
4,thirupathamma balla,9112345678,abc@xyz.com,"['Engineering', 'Technical skills', 'Electrica...","[('SSC', '2014')]",2.8,"['https://www.linkedin.com/fake', 'https://www...","[""Education Course Institution Percentage Year..."


In [9]:
#Checking for NA Values
Features.isnull().sum()

Names                   0
Mobile_Number           0
Email                   0
Skills                  0
Education               0
Years_of_Experience    10
links                   0
Universities_Names      0
dtype: int64

In [10]:
# Creating a Dataframe for Modelling
Model_Features = pd.DataFrame(Features['Names'])
Model_Features['Skills'] = Features['Skills']
Model_Features['Label'] = labelDict.values()
Model_Features.head()

Unnamed: 0,Names,Skills,Label
0,kotani durga,"['Visual', 'Interactive', 'Usability', 'Html',...",Peoplesoft resumes
1,developer/ react,"['Scrum', 'Html', 'Website', 'Technical skills...",Peoplesoft resumes
2,mareedu lokesh,"['Sales', 'Visual', 'Database', 'Testing', 'Se...",Peoplesoft resumes
3,kamalakar reddy,"['Visual', 'Ubuntu', 'Html', 'Html5', 'Js', 'U...",Peoplesoft resumes
4,thirupathamma balla,"['Engineering', 'Technical skills', 'Electrica...",Peoplesoft resumes


In [11]:
# Store the file into a csv file
Model_Features.to_csv('Model_features.csv', index = None)

In [12]:
#Loading the Dataset we created
df = pd.read_csv('/content/Model_features.csv')
df.head()

Unnamed: 0,Names,Skills,Label
0,kotani durga,"['Visual', 'Interactive', 'Usability', 'Html',...",Peoplesoft resumes
1,developer/ react,"['Scrum', 'Html', 'Website', 'Technical skills...",Peoplesoft resumes
2,mareedu lokesh,"['Sales', 'Visual', 'Database', 'Testing', 'Se...",Peoplesoft resumes
3,kamalakar reddy,"['Visual', 'Ubuntu', 'Html', 'Html5', 'Js', 'U...",Peoplesoft resumes
4,thirupathamma balla,"['Engineering', 'Technical skills', 'Electrica...",Peoplesoft resumes


In [13]:
#Checking for NA Values
df.isnull().sum()

Names     0
Skills    0
Label     0
dtype: int64

##### Data cleaning

In [14]:
#Initializing the Stop Words
lemmetizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [15]:
#Function to get clean Skills
cleaned_data = []
def clean_data(text):
    text_clean = []
    text_tokens = word_tokenize(text)
    for word in text_tokens:
        if (word not in stop_words and # remove stopwords
            word not in string.punctuation): # remove punctuation
            stem_word = lemmetizer.lemmatize(word) # stemming word
            text_clean.append(stem_word)
    list_to_str = ' '.join([str(ele) for ele in text_clean])
    list_to_str = re.sub("'",'',list_to_str)   
    return list_to_str.lower() 

#Calling Function
for text in df['Skills']:
    cleaned_data.append(clean_data(text))

In [16]:
#Adding the Clean Skills to the Df
df['Cleaned Skills'] = cleaned_data
df.head()

Unnamed: 0,Names,Skills,Label,Cleaned Skills
0,kotani durga,"['Visual', 'Interactive', 'Usability', 'Html',...",Peoplesoft resumes,visual interactive usability html routing webs...
1,developer/ react,"['Scrum', 'Html', 'Website', 'Technical skills...",Peoplesoft resumes,scrum html website technical skill programming...
2,mareedu lokesh,"['Sales', 'Visual', 'Database', 'Testing', 'Se...",Peoplesoft resumes,sales visual database testing servers end user...
3,kamalakar reddy,"['Visual', 'Ubuntu', 'Html', 'Html5', 'Js', 'U...",Peoplesoft resumes,visual ubuntu html html5 js ui coding communic...
4,thirupathamma balla,"['Engineering', 'Technical skills', 'Electrica...",Peoplesoft resumes,engineering technical skill electrical program...


##### Label Encoding

In [17]:
# Label Encoder
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])
df.head()

Unnamed: 0,Names,Skills,Label,Cleaned Skills
0,kotani durga,"['Visual', 'Interactive', 'Usability', 'Html',...",1,visual interactive usability html routing webs...
1,developer/ react,"['Scrum', 'Html', 'Website', 'Technical skills...",1,scrum html website technical skill programming...
2,mareedu lokesh,"['Sales', 'Visual', 'Database', 'Testing', 'Se...",1,sales visual database testing servers end user...
3,kamalakar reddy,"['Visual', 'Ubuntu', 'Html', 'Html5', 'Js', 'U...",1,visual ubuntu html html5 js ui coding communic...
4,thirupathamma balla,"['Engineering', 'Technical skills', 'Electrica...",1,engineering technical skill electrical program...


In [18]:
#Checking the Class value count
df['Label'].value_counts()

2    22
4    21
1    20
3    14
0     2
Name: Label, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Names           79 non-null     object
 1   Skills          79 non-null     object
 2   Label           79 non-null     int64 
 3   Cleaned Skills  79 non-null     object
dtypes: int64(1), object(3)
memory usage: 2.6+ KB


In [20]:
# Store the file into a csv file
df.to_csv('Final_features.csv', index = None)

##### Train Test Split

In [21]:
#Taking only Skills & Label
x = df['Cleaned Skills'].values
y = df['Label'].values
#Splitting 80%-20%
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.20, random_state= 38)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((63,), (16,), (63,), (16,))

##### Model Building

In [22]:
#Vectoriation 
CV = CountVectorizer(max_features=20,stop_words = 'english')
x_train_cv = CV.fit_transform(x_train)
x_test_cv = CV.transform(x_test)

In [23]:
#Logistic regression and fit the model
classifier = LogisticRegression()
classifier.fit(x_train_cv,y_train)
classifier.score(x_test_cv, y_test)*100

37.5

##### Find best Algorithms using GridSearchCV

In [24]:
#Vectorizing x & y
x_cv = CV.fit_transform(x).toarray()

In [25]:
#GridSearchCV

def find_best_model_using_gridsearchcv(x,y):
    algos = {
        'Logistic' :{'Classifier': LogisticRegression(),'params':{'max_iter': [120,130,150]}
        },
        'Decision Tree': {'Classifier': DecisionTreeClassifier(),'params': {'criterion': ['gini','entropy'],'max_depth': [3,4,5]}
        },
        'Random Forest': {'Classifier': RandomForestClassifier(),'params':{'n_estimators':[100,125,150,175,200],'max_features': [4,5,6],'random_state':[7]}
        },
        'AdaBoost': {'Classifier': AdaBoostClassifier(),'params': {'n_estimators' : [10,15,20],'random_state':[7]}
        },
        'Gradient Boosting': {'Classifier': GradientBoostingClassifier(),'params': {'learning_rate':[0.1,0.01,0.2],'random_state':[7]}
        },
        'XGBM': {'Classifier': XGBClassifier(),'params': {'n_estimators' : [70,80,90,100],'max_depth': [3,4,5,7],
                                                          'learning_rate':[0.1,0.2],'random_state':[7]}
        },
        'SVM': {'Classifier': SVC(),'params': {'kernel':['rbf','poly'],'gamma':[50,100],'C':[10,15,20]}
        },
        'KNN': {'Classifier': KNeighborsClassifier(),'params': {'n_neighbors':[5,10,15,20]}
        },
        'MNB': {'Classifier': MultinomialNB(),'params': {'alpha':[0.5,1.0],'fit_prior':['True','False']}
        }
    }
    scores = []
    kfold = KFold(n_splits=5, random_state=10, shuffle=True)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['Classifier'], config['params'], cv=kfold)
        gs.fit(x,y)
        scores.append({
            'Classifier': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['Classifier','best_score','best_params'])

find_best_model_using_gridsearchcv(x_cv,y)

Unnamed: 0,Classifier,best_score,best_params
0,Logistic,0.4825,{'max_iter': 120}
1,Decision Tree,0.608333,"{'criterion': 'gini', 'max_depth': 5}"
2,Random Forest,0.533333,"{'max_features': 6, 'n_estimators': 175, 'rand..."
3,AdaBoost,0.4575,"{'n_estimators': 10, 'random_state': 7}"
4,Gradient Boosting,0.47,"{'learning_rate': 0.2, 'random_state': 7}"
5,XGBM,0.520833,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti..."
6,SVM,0.47,"{'C': 10, 'gamma': 50, 'kernel': 'poly'}"
7,KNN,0.548333,{'n_neighbors': 15}
8,MNB,0.445,"{'alpha': 0.5, 'fit_prior': 'True'}"


Inference: The Best Algorithm we got from Grid Search is Decision Tree so will Create the final Model on Decision Tree Algorithms

##### Final Model (Decision Tree)

In [26]:
#Decision Tree Classifier and fit the model
DT_classifier = DecisionTreeClassifier(criterion = 'gini', max_depth = 5)
DT_classifier.fit(x_cv,y)
#Prediction Score
DT_classifier.score(x_cv,y)*100

87.34177215189874

#### Saving the Model

In [27]:
import pickle
filename = "resume.pickle"
pickle.dump(DT_classifier,open(filename,'wb'))

In [28]:
#Prediction
DT_classifier.predict(x_cv)

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4,
       2, 4, 4, 4, 2, 4, 4, 4, 4, 4, 3, 0, 0])

In [29]:
#Function to Get the Skills as a single list for Deployment
match = []
def match_words(text):
    lower = [x.lower() for x in text]

    for temp in lower:
        f_text = temp.split()
        found = False
        for text_word in f_text:
            match.append(text_word)
                                
#Calling the Function
match_words(df['Cleaned Skills'])

#Removing Duplicates From List 
final_skills = []
for i in match:
  if i not in final_skills:
    final_skills.append(i)

#Checking length of Final List
len(final_skills)

273

In [30]:
#Adding Data Science Skills Too.
DS = ['python','sql','excel','tableau','statistics','eda','feature engineering','machine learning','deep learning',
      'nlp','time series analysis','deployment']
final_skills.extend(DS)
len(final_skills)

285

In [31]:
#saving this skills in a text file to use for deployment
for skills in final_skills:
  with open("skills.txt", "a") as text_file:
      print(skills, file=text_file)