In [1]:
# required libraries
import os
import pandas as pd
import numpy as np
from pdfminer import high_level

# Candidate tables; both are read for their CandidateID column below.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Folders that hold one "<CandidateID>.pdf" file per candidate.
train_path = "dataset/trainResumes/"
test_path = "dataset/testResumes/"

# Empty lists that the pdf2string_* helpers fill with extracted resume text.
train_resumes, test_resumes = [], []

# Candidate ids, in the same order as the CSV rows.
ids = list(train['CandidateID'])
test_ids = list(test['CandidateID'])

# pdf2string
def _pdf_to_text(pdf_path):
    """Extract the text of one PDF and collapse every whitespace run to a single space."""
    return ' '.join(high_level.extract_text(pdf_path).split())


def pdf2string_train(path, ids, resumes):
    """Append the whitespace-normalized text of each candidate's PDF to ``resumes``.

    Parameters
    ----------
    path : str
        Folder containing the PDFs. A trailing separator is optional.
    ids : iterable
        Candidate ids; each one is looked up as ``<path>/<id>.pdf``.
    resumes : list
        Output list, mutated in place. One string is appended per id, in
        the order of ``ids``.

    Returns
    -------
    None
        Deliberately nothing, so a notebook cell ending in this call does not
        echo every resume as cell output.
    """
    for candidate_id in ids:
        # os.path.join tolerates a missing trailing slash on ``path``; the
        # f-string tolerates ids that are not already strings.
        pdf_path = os.path.join(path, f'{candidate_id}.pdf')
        resumes.append(_pdf_to_text(pdf_path))


def pdf2string_test(path, test_ids, resumes):
    """Same extraction as ``pdf2string_train``, kept under its own name for existing callers."""
    pdf2string_train(path, test_ids, resumes)


# Fill train_resumes / test_resumes in place with the text of every candidate's PDF.
pdf2string_train(path=train_path, ids=ids, resumes=train_resumes)
pdf2string_test(path=test_path, test_ids=test_ids, resumes=test_resumes)


In [2]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Lower-case every extracted resume.
train_resumes_lower = [resume.lower() for resume in train_resumes]

test_resumes_lower = [resume.lower() for resume in test_resumes]

PUNCT_TO_REMOVE = string.punctuation
# Built once at definition time instead of on every call; maps each
# character of PUNCT_TO_REMOVE to "delete".
_PUNCT_TRANSLATION = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted.

    Only the ASCII punctuation in ``string.punctuation`` is stripped; all
    other characters, whitespace included, pass through unchanged.
    """
    return text.translate(_PUNCT_TRANSLATION)

# Strip punctuation from every lower-cased resume.
train_punc_removed = [remove_punctuation(resume) for resume in train_resumes_lower]

test_punc_removed = [remove_punctuation(resume) for resume in test_resumes_lower]

# English stop-word list from NLTK, stored as a set.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``STOPWORDS``.

    ``text`` is converted with ``str()`` first; the kept tokens are re-joined
    with single spaces.
    """
    kept_tokens = filter(lambda token: token not in STOPWORDS, str(text).split())
    return " ".join(kept_tokens)

# Drop stop words from every punctuation-free resume.
train_stopwords_removed = [remove_stopwords(resume) for resume in train_punc_removed]

test_stopwords_removed = [remove_stopwords(resume) for resume in test_punc_removed]

# Shared lemmatizer, plus a lookup from the first letter of a POS tag to the
# matching WordNet part-of-speech constant.
lemmatizer = WordNetLemmatizer()
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text`` using its POS tag.

    Tokens are tagged with ``nltk.pos_tag``; the first letter of each tag is
    looked up in ``wordnet_map``, falling back to ``wordnet.NOUN`` when it has
    no entry. The lemmas are returned joined by single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

# Lemmatize every stop-word-free resume; these lists are the final cleaned text.
train_lemma = [lemmatize_words(resume) for resume in train_stopwords_removed]

test_lemma = [lemmatize_words(resume) for resume in test_stopwords_removed]


In [3]:
# Display the first lemmatized training resume.
train_lemma[0]

'l n r e w f r e h e r work experience dictis make intern trainee jan 2020 apr 2020 responsible perform help decision executive summary fresher strong statistical analytic capability someone drive passion problem solve though civil engineering background always fascinate data machine learn evolve iit personal skill data analyst data mining data visualization machine learn linear regression statistical model predictive modeling sql server oracle python project extracurriculars data preprocessing python data visualization power bi academic profile railway signal determiner use relay weight system 2020 btechcivil garodia institute technosciences'

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-read the candidate tables from disk.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Add the lemmatized resume text as a 'resumes' column next to each table's
# own columns (column-wise concat on the default row index).
train_df = pd.concat([train, pd.DataFrame({'resumes': train_lemma})], axis=1)
test_df = pd.concat([test, pd.DataFrame({'resumes': test_lemma})], axis=1)

for frame in (train_df, test_df):
    print(frame.head())

# TF-IDF settings: unigrams only, vocabulary capped at 10000 terms.
tfidf_options = dict(
    max_features=10000,
    strip_accents='unicode',
    analyzer='word',
    lowercase=False,
    ngram_range=(1, 1),
    stop_words='english',
)
tfidf = TfidfVectorizer(**tfidf_options)

# Fit on the training resumes only, then reuse the fitted vectorizer for the test resumes.
tfidf_matrix_train = tfidf.fit_transform(train_df['resumes'])
tfidf_matrix_test = tfidf.transform(test_df['resumes'])
for matrix in (tfidf_matrix_train, tfidf_matrix_test):
    print(matrix.shape)

     CandidateID  Match Percentage  \
0  candidate_011             13.60   
1  candidate_113             36.63   
2  candidate_123             54.93   
3  candidate_012             41.46   
4  candidate_002             48.91   

                                             resumes  
0  l n r e w f r e h e r work experience dictis m...  
1  ellie mackey f r e h e r n e r n executive pro...  
2  f e l x w n n fresher skills project activites...  
3  jimmy gartner n g e r professional profile emp...  
4  n q u r associate analyst skill certify data a...  
     CandidateID                                            resumes
0  candidate_014  grace bailry c h n e l e r n n g e v e l p e n...
1  candidate_098  l e l e u n software engineer skill assistant ...
2  candidate_075  keiron pavard e n g n e e r personal profile w...
3  candidate_016  e l r n c e n c e j r work experience na acade...
4  candidate_131  zachary perez n e r n p r f l e k l l good kno...
(90, 1873)
(60, 1873)


In [4]:
import xgboost as XGB

In [5]:
# Regression target: the 'Match Percentage' column of the training frame.
y = train_df.loc[:, 'Match Percentage']

In [6]:
from sklearn.model_selection import GridSearchCV
# Search space: 14 learning rates x 12 tree counts x 13 depths = 2184 combinations.
xgb_grid = {
    'learning_rate': [0.1, 0.3, 0.2, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07,
                      0.005, 0.006, 0.007, 0.0075, 0.008],
    'n_estimators': list(range(100, 1201, 100)),
    'max_depth': [*range(1, 11), 12, 14, 20],
}

# Exhaustive 2-fold cross-validated search over the grid with a default XGBRegressor.
xgb = GridSearchCV(
    estimator=XGB.XGBRegressor(),
    param_grid=xgb_grid,
    cv=2,
    verbose=True,
).fit(tfidf_matrix_train, y)
xgb.best_params_

Fitting 2 folds for each of 2184 candidates, totalling 4368 fits


{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 1000}

In [67]:

# Manually chosen XGBoost hyper-parameters. This rebinds `xgb`, replacing the
# grid-search object assigned to the same name earlier in the notebook.
xgb_params = dict(
    learning_rate=0.005,
    n_estimators=700,
    objective='reg:squarederror',
    max_depth=8,
    reg_lambda=1.3,
    gamma=1,
    min_child_weight=1.5,
    max_delta_step=100,
    random_state=31,
)
xgb = XGB.XGBRegressor(**xgb_params).fit(tfidf_matrix_train, y)

In [153]:
from lightgbm import LGBMRegressor
# LightGBM regressor fitted on the TF-IDF training matrix, with a fixed seed.
lgbm_params = dict(
    num_leaves=31,
    learning_rate=0.01,
    n_estimators=1000,
    reg_lambda=2.5,
    reg_alpha=2,
    random_state=31,
)
lgbm = LGBMRegressor(**lgbm_params).fit(tfidf_matrix_train, y)

In [154]:
def submission(model, test_sentences, test_csv='dataset/test.csv'):
    """Build the submission table: the test CSV's columns plus the model's predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_sentences : object
        Features passed unchanged to ``model.predict``. Rows are presumably in
        the same order as the rows of ``test_csv`` -- verify against the caller.
    test_csv : str, optional
        Path of the CSV whose columns are placed to the left of the
        predictions. Defaults to the path this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The CSV's columns followed by a ``'Match Percentage'`` column holding
        the predictions.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of CSV rows.
        Without this check the column-wise concat would pad the shorter side
        with NaN and return a silently misaligned table.
    """
    test1 = pd.read_csv(test_csv)
    preds = model.predict(test_sentences)
    prediction = pd.DataFrame(preds, columns=['Match Percentage'])
    if len(prediction) != len(test1):
        raise ValueError(
            f"model returned {len(prediction)} predictions "
            f"but {test_csv!r} has {len(test1)} rows"
        )
    return pd.concat([test1, prediction], axis=1)

# Predict on the test TF-IDF matrix with the LightGBM model.
sub = submission(lgbm, tfidf_matrix_test)

# Create the output folder if it is missing, so to_csv cannot fail on it.
os.makedirs('submission file', exist_ok=True)
# index=False keeps the DataFrame's row index out of the file; without it the
# CSV gains an unnamed leading column in front of CandidateID.
sub.to_csv('submission file/lgbm sub.csv', index=False)
print(sub.head(10))

     CandidateID  Match Percentage
0  candidate_014         23.700393
1  candidate_098         32.999942
2  candidate_075         38.796772
3  candidate_016         35.225443
4  candidate_131         33.061261
5  candidate_056         35.019762
6  candidate_141         48.173307
7  candidate_044         55.757834
8  candidate_029         31.043669
9  candidate_120         36.911600


In [148]:
# Print the first ten rows of the submission frame (same statement as the last line of the previous cell).
print(sub.head(10))

     CandidateID  Match Percentage
0  candidate_014         23.700393
1  candidate_098         32.999942
2  candidate_075         38.796772
3  candidate_016         35.225443
4  candidate_131         33.061261
5  candidate_056         35.019762
6  candidate_141         48.173307
7  candidate_044         55.757834
8  candidate_029         31.043669
9  candidate_120         36.911600
