In [1]:
# required libraries
import os
import pandas as pd
import numpy as np
from pdfminer import high_level

#paths
train_path = "dataset/trainResumes/"
test_path = "dataset/testResumes/"

# epty list for resumes text
train_resumes = []
test_resumes = []

# pdf2string
def pdf2string(path, resumes):
    for i in os.listdir(path):
        main_path = path+i
        text = high_level.extract_text(main_path)
        str_list = text.split()
        str_list = str_list[:]
        string = ' '.join(str_list)
        resumes.append(string)

pdf2string(train_path, train_resumes)
pdf2string(test_path, test_resumes)

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

needless_words = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
'V', 'W', 'X', 'Y', 'Z']
punctuations = list('''!()-[]{};:'"\,<>./?@#$%^&*_~''')

def text_processing(resume):   

    resume = nlp(resume)
    token_list = []
    for token in resume:
        token_list.append(token.text)

    filtered_sentence =[] 
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 

    # further filter
    filtered_sentence_2 = []
    for word in filtered_sentence:
        if word not in needless_words:
            filtered_sentence_2.append(word)

    filtered_sentence_3 = []
    for word in filtered_sentence_2:
        if word not in punctuations:
            filtered_sentence_3.append(word)
    
    Stem_words = []
    sentence = ' '.join(filtered_sentence_3)
    doc = nlp(sentence)
    for word in doc:
        Stem_words.append(word.lemma_)

    main_text = ' '.join(Stem_words)
    main_text = main_text.lower()
    return main_text

In [3]:
processed_resumes_train = []
processed_resumes_test = []

for  resume in train_resumes:
    processed_resume = text_processing(resume)
    processed_resumes_train.append(processed_resume)

for  resume in test_resumes:
    processed_resume = text_processing(resume)
    processed_resumes_test.append(processed_resume)

In [4]:
print(len(processed_resumes_train))
print(len(processed_resumes_test))

90
60


In [5]:
print(processed_resumes_train[2])

associate analyst skills certified data analyst degree electronics engineering hand experience analyze interpret datum good numerical accuracy python machine learning mysql data mining deep learning data analysis computer vision flask api predictive modeling aws scikit learn numpy statistical analysis multivariate analysis decision trees random forest xgboost nlp project work experience deep learning base pattern match auto color grade python amz loans mortgages erc analytics jun 2019 till date qualification google cloud certified handling datum employee find retention factor employee satisfaction worked closely hr team find balance beneficial employee company education b.tech b.e. electronics telecommunication nagpur university 2019


In [6]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

def dataframe(resume_list, df):
    resumes =  pd.DataFrame(resume_list, columns = ['resumes'])
    dataframe = pd.concat([df, resumes], axis = 1)
    dataframe.drop('CandidateID', axis = 1, inplace = True)
    return dataframe

train_df = dataframe(processed_resumes_train, train)
test_df = dataframe(processed_resumes_test, test)

In [7]:
train_df.head()

Unnamed: 0,Match Percentage,resumes
0,13.6,jacob smith personal profile work background a...
1,36.63,brianna williams executive profile work experi...
2,54.93,associate analyst skills certified data analys...
3,41.46,python machine learn deep learning data analys...
4,48.91,jennifer armstrong fresher computer vision mac...


In [8]:
filtered_resume_list_train = list(train_df.resumes)
filtered_resume_list_test = list(test_df.resumes)

In [9]:
vocab_size = 1000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [10]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
#tokenizing sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(filtered_resume_list_train)

In [12]:
word_index = tokenizer.word_index
total_words = len(word_index)+1 #1 for oov word
print(total_words)
print(word_index)

1897
{'<OOV>': 1, 'data': 2, 'learning': 3, 'machine': 4, 'analysis': 5, 'python': 6, '2019': 7, 'work': 8, 'datum': 9, '2020': 10, 'learn': 11, 'b': 12, 'tech': 13, 'project': 14, 'business': 15, 'science': 16, 'experience': 17, 'deep': 18, 'analytics': 19, 'model': 20, 'university': 21, 'analyst': 22, 'engineer': 23, 'base': 24, 'processing': 25, 'profile': 26, 'software': 27, 'skills': 28, 'language': 29, 'education': 30, 'natural': 31, 'intern': 32, 'college': 33, 'system': 34, 'skill': 35, 'developer': 36, 'development': 37, 'engineering': 38, 'till': 39, 'date': 40, 'intelligence': 41, 'management': 42, 'sql': 43, 'nlp': 44, 'computer': 45, 'text': 46, 'solution': 47, 'problem': 48, '2018': 49, 'history': 50, 'mining': 51, 'regression': 52, 'statistical': 53, 'visualization': 54, 'personal': 55, 'cloud': 56, 'executive': 57, 'fresher': 58, 'professional': 59, 'artificial': 60, 'create': 61, 'tableau': 62, 'analytic': 63, 'different': 64, 'technology': 65, 'application': 66, 'time

In [13]:
train_sequences = tokenizer.texts_to_sequences(filtered_resume_list_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

In [14]:
train_padded[0]

array([810, 811,  55,  26,   8,  93, 535, 184,  99,   2,  22,   4,   3,
        23, 812, 813, 235,  15, 277,  61, 323,   9, 120, 121, 100, 536,
        15, 537, 236, 538,  32,   7,  84, 132,  75, 237, 539,  34,  24,
       206,  94,  35,  30,  12,  13, 278, 324, 402,  21,  10,   6,  43,
       185,  62, 186, 187, 133, 109, 162, 122,   4,   3,  73, 814, 403,
       815,  56, 238, 816, 122,  14, 404, 540, 101, 207, 541, 542,  94,
       102,  18,   3, 543,   4,   3,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

In [15]:
validation_sequences = tokenizer.texts_to_sequences(filtered_resume_list_test)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_length)

In [16]:
validation_padded[0]

array([ 57, 114,   2,  23,  47, 371,   1,   9,  98,   9, 120, 128, 345,
         1,  55,  35,   4,   3,   2,  19,  14,  42,  27,  37, 715, 511,
        15, 410,  70,  88,  70,  88,   8,  17, 530,   1, 113,  10,  85,
         1,  64,  15,   1, 217,   9, 188,   1,  47, 128, 266, 190, 188,
       165, 365,  14, 424, 389, 246,   1,   1,  37,   1,  27,  36,  22,
         1, 119,  49, 305,   7, 168,  27,  36,  22,  14,   1,   1, 535,
         1,  37,   1,  74,  74, 677,  14, 115,  26,   1, 145,  65,  12,
        13, 348,  38,  49,   0,   0,   0,   0,   0])

In [17]:
train_padded_df = pd.DataFrame(train_padded)
test_padded_df = pd.DataFrame(validation_padded)

In [18]:
train_df = pd.concat([train_df, train_padded_df], axis = 1)
test_df = pd.concat([test_df, test_padded_df], axis = 1)

In [19]:
import re

In [20]:
# train features
train_df['words_counts'] = train_df['resumes'].apply(lambda x: len(str(x).split()))
train_df['char_counts'] = train_df['resumes'].apply(lambda x: len(str(x)))
train_df['avg_word_len'] = train_df['char_counts']/train_df['words_counts']
train_df['ml_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('machine learning', x)))
train_df['ml_engineer_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('machine learning engineer', x)))
train_df['analytics_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('analytics', x)))
train_df['degree_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('master degree', x)))
train_df['degree_counts_2'] = train_df['resumes'].apply(lambda x: len(re.findall('msc', x)))
train_df['degree_counts_3'] = train_df['resumes'].apply(lambda x: len(re.findall('degree', x)))
train_df['deep_learning_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('deep learning', x)))
train_df['tf_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('tensorflow', x)))
train_df['neural_network_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('neural network', x)))
train_df['nlp_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('natural language processing', x)))
train_df['nlp_counts_2'] = train_df['resumes'].apply(lambda x: len(re.findall('nlp', x)))
train_df['pyspark_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('pyspark', x)))
train_df['hadoop_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('hadoop', x)))
train_df['data_analysis_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('data analysis', x)))
train_df['lustering_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('clustering', x)))
train_df['lr_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('logistic regression', x)))
train_df['classification_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('classification', x)))
train_df['sk_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('sciKit learn', x)))
train_df['pytorch_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('pytorch', x)))
train_df['cnn_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('cnn', x)))
train_df['rnn_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('rnn', x)))
train_df['gans_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('gans', x)))
train_df['nltk_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('nltk', x)))
train_df['spacy_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('spacy', x)))
train_df['transformer_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('transformer', x)))
train_df['django_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('django', x)))

In [21]:
# test features
test_df['words_counts'] = test_df['resumes'].apply(lambda x: len(str(x).split()))
test_df['char_counts'] = test_df['resumes'].apply(lambda x: len(str(x)))
test_df['avg_word_len'] = test_df['char_counts']/train_df['words_counts']
test_df['ml_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('machine learning', x)))
test_df['ml_engineer_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('machine learning engineer', x)))
test_df['analytics_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('analytics', x)))
test_df['degree_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('master degree', x)))
test_df['degree_counts_2'] = test_df['resumes'].apply(lambda x: len(re.findall('msc', x)))
test_df['degree_counts_3'] = test_df['resumes'].apply(lambda x: len(re.findall('degree', x)))
test_df['deep_learning_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('deep learning', x)))
test_df['tf_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('tensorflow', x)))
test_df['neural_network_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('neural network', x)))
test_df['nlp_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('natural language processing', x)))
test_df['nlp_counts_2'] = test_df['resumes'].apply(lambda x: len(re.findall('nlp', x)))
test_df['pyspark_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('pyspark', x)))
test_df['hadoop_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('hadoop', x)))
test_df['data_analysis_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('data analysis', x)))
test_df['lustering_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('clustering', x)))
test_df['lr_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('logistic regression', x)))
test_df['classification_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('classification', x)))
test_df['sk_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('sciKit learn', x)))
test_df['pytorch_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('pytorch', x)))
test_df['cnn_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('cnn', x)))
test_df['rnn_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('rnn', x)))
test_df['gans_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('gans', x)))
test_df['nltk_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('nltk', x)))
test_df['spacy_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('spacy', x)))
test_df['transformer_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('transformer', x)))
test_df['django_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('django', x)))


In [22]:
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return round(mi_scores, 3)


In [23]:
print(train_df.head(5))

   Match Percentage                                            resumes    0  \
0             13.60  jacob smith personal profile work background a...  810   
1             36.63  brianna williams executive profile work experi...  545   
2             54.93  associate analyst skills certified data analys...  111   
3             41.46  python machine learn deep learning data analys...    6   
4             48.91  jennifer armstrong fresher computer vision mac...  857   

     1   2    3    4    5    6    7  ...  classification_counts  sk_counts  \
0  811  55   26    8   93  535  184  ...                      1          0   
1  407  14   30  326  239  546   21  ...                      0          0   
2   22  28  412    2   22  210  150  ...                      0          0   
3    4  11   18    3    2    5   45  ...                      0          0   
4  858  58   45   79    4    3   23  ...                      0          0   

   pytorch_counts  cnn_counts  rnn_counts  gans_counts  

In [24]:
train_df.shape, test_df.shape

((90, 131), (60, 130))

In [25]:
train_df.drop('resumes', axis = 1, inplace=True)
test_df.drop('resumes', axis = 1, inplace=True)

In [26]:
x = train_df.drop('Match Percentage', axis = 1)
y = train_df['Match Percentage']

In [27]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
x = scale.fit_transform(x)
test_df = scale.transform(test_df)

In [30]:
x

array([[ 2.41049604,  2.79696195, -0.39026728, ..., -0.10599979,
        -0.21566555, -0.24253563],
       [ 1.44686328,  1.18366283, -0.56807766, ..., -0.10599979,
        -0.21566555, -0.24253563],
       [-0.13131264, -0.35376331, -0.50736192, ..., -0.10599979,
        -0.21566555, -0.24253563],
       ...,
       [-0.53131115, -0.43762292, -0.3815936 , ..., -0.10599979,
        -0.21566555, -0.24253563],
       [ 0.17050441, -0.43762292, -0.59409869, ..., -0.10599979,
         4.63680925, -0.24253563],
       [-0.53131115, -0.43762292, -0.39026728, ..., -0.10599979,
        -0.21566555, -0.24253563]])

In [None]:
mi_score1 = make_mi_scores(x, y)
print(mi_score1)

In [None]:
# feature selection
from sklearn.feature_selection import SelectKBest, chi2, f_regression
selector = SelectKBest(f_regression, k=100)
X = selector.fit_transform(x, y)
test_df = selector.transform(test_df)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt


In [None]:
train_df.shape[1]

In [None]:
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal', activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

In [None]:
# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

In [None]:
NN_model.fit(x, y, epochs=100, batch_size=32)

In [31]:
from xgboost import XGBRegressor
XGBModel = XGBRegressor()
XGBModel.fit(x,y , verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
def submission(model, test):
    test1 = pd.read_csv('dataset/test.csv')
    preds = model.predict(test)
    prediction = pd.DataFrame(preds, columns = ['Match Percentage'])
    sub_df = pd.concat([test1, prediction], axis = 1)
    return sub_df

sub = submission(XGBModel, test_df)
sub.to_csv('submission file/Submission-17.csv')
print(sub.head())

     CandidateID  Match Percentage
0  candidate_014         44.282951
1  candidate_098         47.256233
2  candidate_075         24.159756
3  candidate_016         28.871172
4  candidate_131         22.501472


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV