In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('resume_data.csv')

In [3]:
df = df.drop(columns=["Unnamed: 0"],axis=1)

In [4]:
df['label'] = df['label'].map({'React Developer':0,'workday':1,'Peoplesoft':2,'SQL Developer':3})

In [5]:
df

Unnamed: 0,Text,label
0,Anubhav Kumar Singh\t\t\n\n To work in a gl...,2
1,Anubhav Kumar Singh\t\t\n\n To work in a gl...,2
2,Anubhav Kumar Singh\t\t\n\n To work in a gl...,2
3,Murali\n\nExperience Summary \n\nI have 6 year...,2
4,Murali\n\nExperience Summary \n\nI have 6 year...,2
...,...,...
75,Ramesh A\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,1
76,Seeking suitable positions in Workday HCM as ...,1
77,\nWORKDAY | HCM | FCM\nName \t\t: Kumar S.S\nR...,1
78,\nWORKDAY | HCM | FCM\nName \t\t: Kumar S.S\nR...,1


In [6]:
df['Text'][45]

'ANIL KUMAR MADDUKURI  \t\t\nSQL & MSBI Developer   \n\n\t\n\nSummary\n\n2.4 years of experience in MS SQL Server (SSMS) and creating SSIS packages, SSRS Reports by using Microsoft Business Intelligence (MSBI) tools.\nExpertise in various types of Joins and Sub Queries for writing complex queries involving multiple tables.\nHandled data manipulation and data consistency by creating Views, Triggers, and Synonyms.\nHands on experience in creation, optimization and debugging Stored Procedure and Functions.\nFamiliar in writing queries using CTE, Temporary Tables and Table Variables.\nGood experience in using Set Operators like Union, Union All, Except and Intersect to assist required data.\nExperience in manipulate the data from multiple table and report to the client using Aggregate Functions, Windows Functions and String Functions.\nWorked extensively on Data Extraction, Transformation and Loading (ETL) process in SQL Server Integration Services.\nUsed containers such as for each loop c

## Data Preprocessing

In [7]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [9]:
Lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_data(text):
    cleantext = re.sub(r'http\S+|www\.\S+', '',text)
    cleantext = re.sub(r'@\S+','',cleantext) 
    cleantext = re.sub(r'#\S+','',cleantext)
    cleantext = re.sub(r'[^\w\s]','',cleantext)
    cleantext = cleantext.lower()
    words = word_tokenize(cleantext)
    words = [Lemmatizer.lemmatize(word,pos='v') for word in words if not word in stopwords.words('english')]
    cleantext = ' '.join(words)
    return cleantext 

In [11]:
def preprocess_data2(text):
  cleantext = re.sub('[^a-zA-Z]',' ',text)
  cleantext = re.sub('\s+',' ',cleantext)
  return cleantext 

In [12]:
df = df.sample(frac=1,random_state=42)

In [13]:
df.reset_index(inplace=True,drop=True)

In [14]:
df

Unnamed: 0,Text,label
0,\n\nThirupathamma Balla\n\nSUMMARY:\n\n2.8 yea...,0
1,Anubhav Kumar Singh\t\t\n\n To work in a gl...,2
2,Kanumuru Deepak Reddy\n\n\n\nCAREER OBJECTIVE:...,0
3,\nUi-Developer/ React JS Developer \nNAME: KRI...,0
4,Tanna Sujatha \n\n\n\nOBJECTIVE\nSeeking a cha...,2
...,...,...
75,Name: Ravali P \n\n ...,0
76,\t\n\n\nName : Gopi Krishna Reddy\n\t\...,1
77,VENKATA SAIKRISHNA\n Workday Consultant\n\n\nP...,1
78,\nHaving 4.6 years of experience in PeopleSoft...,2


In [15]:
df['Text'] = df['Text'].apply(preprocess_data)

In [16]:
df['Text'] = df['Text'].apply(preprocess_data2)

In [17]:
df

Unnamed: 0,Text,label
0,thirupathamma balla summary year experience re...,0
1,anubhav kumar singh work globally competitive ...,2
2,kanumuru deepak reddy career objective secure ...,0
3,uideveloper react js developer name krishna pr...,0
4,tanna sujatha objective seek challenge role ar...,2
...,...,...
75,name ravali p curriculum vitae specialization ...,0
76,name gopi krishna reddy professional summery w...,1
77,venkata saikrishna workday consultant profssio...,1
78,years experience peoplesoft application enhan...,2


## Word2vec

In [18]:
import gensim

In [19]:
from gensim.models import Word2Vec, KeyedVectors
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [20]:
words = []
for sent in df['Text']:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [21]:
model = gensim.models.Word2Vec(words)

In [22]:
model

<gensim.models.word2vec.Word2Vec at 0x158b31bb970>

In [23]:
model.epochs

5

In [24]:
len(model.wv.index_to_key)

1003

In [25]:
model.wv['workday']

array([-0.22488979,  0.11196441,  0.06720041, -0.06941546,  0.09640287,
       -0.6350908 ,  0.29284614,  0.75958204, -0.36680874, -0.03065286,
       -0.14133239, -0.46558332, -0.08494774,  0.14881614, -0.00759282,
       -0.17757307, -0.03938251, -0.4798172 , -0.06492578, -0.6719561 ,
        0.18301588,  0.06339918,  0.14963351, -0.0738179 , -0.1612031 ,
       -0.03173562, -0.11568008, -0.25195107, -0.29700643, -0.14380136,
        0.49644375,  0.15111105,  0.16963476, -0.07985434, -0.09941012,
        0.42605165, -0.00373043, -0.4184901 , -0.2690931 , -0.595833  ,
        0.00180953, -0.27407554, -0.18670928,  0.01327765,  0.40733582,
       -0.09240649, -0.23163387,  0.09466214,  0.19189544,  0.21478583,
        0.14723969, -0.25657502, -0.06145908, -0.02818761, -0.12807241,
        0.2615253 ,  0.10718788, -0.21451439, -0.33908102,  0.0464712 ,
        0.11283215,  0.18704426, -0.02564189, -0.17700693, -0.4562658 ,
        0.11915432,  0.3900335 ,  0.4713445 , -0.41107908,  0.48

## AVG_Word2Vec

In [26]:
def avg_word_2_vec(doc):
  return np.mean([model.wv[word] for word in doc.split() if word in model.wv.index_to_key],axis=0)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
Train_df,Test_df = train_test_split(df,test_size=0.2,random_state=42)

In [29]:
x_Train,y_Train = Train_df['Text'],Train_df['label']
x_Test,y_Test = Test_df['Text'],Test_df['label']

In [30]:
x_Train_Final = []
for sent in x_Train:
  x_Train_Final.append(avg_word_2_vec(sent))

In [31]:
len(x_Train_Final)

64

In [32]:
x_Test_Final = []
for sent in x_Test:
  x_Test_Final.append(avg_word_2_vec(sent))

In [33]:
len(x_Test_Final)

16

## Model Building

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [35]:
rdm_clf = RandomForestClassifier()

In [36]:
rdm_clf.fit(x_Train_Final,y_Train)

In [37]:
train_pred = rdm_clf.predict(x_Train_Final)

In [38]:
Train_Accu = accuracy_score(y_Train,train_pred)
print('Training_Accuracy:-',Train_Accu)

Training_Accuracy:- 1.0


In [39]:
test_pred = rdm_clf.predict(x_Test_Final)

In [40]:
Test_Accu = accuracy_score(y_Test,test_pred)
print('Testing_Accuracy:-',Test_Accu)

Testing_Accuracy:- 0.875


In [52]:
print(classification_report(y_Test, test_pred))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      1.00      1.00         3
           2       1.00      0.50      0.67         4
           3       0.86      1.00      0.92         6

    accuracy                           0.88        16
   macro avg       0.90      0.88      0.86        16
weighted avg       0.90      0.88      0.86        16



## Hyperparameter Tunning using Optuna

In [41]:
import optuna
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators',50,250)
    max_depth = trial.suggest_int('max_depth',3,25)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=True,
        oob_score=True,
        random_state=42
    )
    scores = cross_val_score(model, x_Train_Final, y_Train, cv=5,scoring='accuracy').mean()

    return scores


In [43]:
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=50)

[I 2024-11-27 15:59:51,065] A new study created in memory with name: no-name-a484e2de-273f-4f4c-ac2a-fc1a1b559264
[I 2024-11-27 15:59:52,101] Trial 0 finished with value: 0.8448717948717949 and parameters: {'n_estimators': 138, 'max_depth': 25, 'min_samples_split': 19, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 0.8448717948717949.
[I 2024-11-27 15:59:53,692] Trial 1 finished with value: 0.8602564102564102 and parameters: {'n_estimators': 204, 'max_depth': 13, 'min_samples_split': 19, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8602564102564102.
[I 2024-11-27 15:59:54,483] Trial 2 finished with value: 0.953846153846154 and parameters: {'n_estimators': 57, 'max_depth': 24, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 2 with value: 0.953846153846154.
[I 2024-11-27 15:59:55,250] Trial 3 finished with value: 0.8602564102564102 and parameters: {'n_estimators': 101, 'max_depth': 3, 'min_s

In [44]:
params = study.best_params

In [45]:
study.best_trial.value

0.953846153846154

In [46]:
final_model = RandomForestClassifier(**params)

In [47]:
final_model.fit(x_Train_Final, y_Train)

In [48]:
y_pred = final_model.predict(x_Test_Final)

In [49]:
Test_Accu = accuracy_score(y_Test,y_pred)
print('Testing_Accuracy:-',Test_Accu) 

Testing_Accuracy:- 0.8125


In [50]:
print(classification_report(y_Test,y_pred))  

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      1.00      1.00         3
           2       0.67      0.50      0.57         4
           3       0.83      0.83      0.83         6

    accuracy                           0.81        16
   macro avg       0.81      0.83      0.82        16
weighted avg       0.81      0.81      0.80        16



## Prediction

In [54]:
test_pred

array([3, 0, 2, 1, 3, 2, 3, 3, 0, 1, 3, 3, 0, 3, 0, 1], dtype=int64)

In [55]:
if test_pred[4] == 0:
    print('React Developer')
elif test_pred[4] == 1:
    print('workday')
elif test_pred[4] == 2:
    print('Peoplesoft')
else:
    print('SQL Developer')

SQL Developer


## Pickle Files

In [64]:
import pickle
import joblib

In [None]:
# Pickle file for word2vec
pickle.dump(model,open('word2vec.pkl','wb'))

In [58]:
# Pickle file for Avg_word2vec
pickle.dump(avg_word_2_vec,open('avg_word_2_vec.pkl','wb')) 

In [65]:
joblib.dump(rdm_clf,'final_model.pkl')

['final_model.pkl']