In [2]:
# required libraries
import os
import pandas as pd
import numpy as np
from pdfminer import high_level

#paths
train_path = "dataset/trainResumes/"
test_path = "dataset/testResumes/"

# epty list for resumes text
train_resumes = []
test_resumes = []

# pdf2string
def pdf2string(path, resumes):
    for i in os.listdir(path):
        main_path = path+i
        text = high_level.extract_text(main_path)
        str_list = text.split()
        str_list = str_list[:]
        string = ' '.join(str_list)
        resumes.append(string)

pdf2string(train_path, train_resumes)
pdf2string(test_path, test_resumes)

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

needless_words = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
'V', 'W', 'X', 'Y', 'Z']
punctuations = list('''!()-[]{};:'"\,<>./?@#$%^&*_~''')

def text_processing(resume):   

    resume = nlp(resume)
    token_list = []
    for token in resume:
        token_list.append(token.text)

    filtered_sentence =[] 
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 

    # further filter
    filtered_sentence_2 = []
    for word in filtered_sentence:
        if word not in needless_words:
            filtered_sentence_2.append(word)

    filtered_sentence_3 = []
    for word in filtered_sentence_2:
        if word not in punctuations:
            filtered_sentence_3.append(word)
    
    Stem_words = []
    sentence = ' '.join(filtered_sentence_3)
    doc = nlp(sentence)
    for word in doc:
        Stem_words.append(word.lemma_)

    main_text = ' '.join(Stem_words)
    main_text = main_text.lower()
    return main_text

In [4]:
processed_resumes_train = []
processed_resumes_test = []

for  resume in train_resumes:
    processed_resume = text_processing(resume)
    processed_resumes_train.append(processed_resume)

for  resume in test_resumes:
    processed_resume = text_processing(resume)
    processed_resumes_test.append(processed_resume)

In [5]:
print(len(processed_resumes_train))
print(len(processed_resumes_test))

90
60


In [6]:
print(processed_resumes_train[2])

associate analyst skills certified data analyst degree electronics engineering hand experience analyze interpret datum good numerical accuracy python machine learning mysql data mining deep learning data analysis computer vision flask api predictive modeling aws scikit learn numpy statistical analysis multivariate analysis decision trees random forest xgboost nlp project work experience deep learning base pattern match auto color grade python amz loans mortgages erc analytics jun 2019 till date qualification google cloud certified handling datum employee find retention factor employee satisfaction worked closely hr team find balance beneficial employee company education b.tech b.e. electronics telecommunication nagpur university 2019


In [7]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

def dataframe(resume_list, df):
    resumes =  pd.DataFrame(resume_list, columns = ['resumes'])
    dataframe = pd.concat([df, resumes], axis = 1)
    dataframe.drop('CandidateID', axis = 1, inplace = True)
    return dataframe

train_df = dataframe(processed_resumes_train, train)
test_df = dataframe(processed_resumes_test, test)

In [8]:
train_df.head()

Unnamed: 0,Match Percentage,resumes
0,13.6,jacob smith personal profile work background a...
1,36.63,brianna williams executive profile work experi...
2,54.93,associate analyst skills certified data analys...
3,41.46,python machine learn deep learning data analys...
4,48.91,jennifer armstrong fresher computer vision mac...


In [19]:
x = train_df.drop('Match Percentage', axis = 1)
y = train_df['Match Percentage']

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(x)
count_test = count_vectorizer.transform(test_df)

In [22]:
count_train

<1x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [None]:
import re

In [None]:
# train features
train_df['words_counts'] = train_df['resumes'].apply(lambda x: len(str(x).split()))
train_df['char_counts'] = train_df['resumes'].apply(lambda x: len(str(x)))
train_df['avg_word_len'] = train_df['char_counts']/train_df['words_counts']
train_df['ml_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('machine learning', x)))
train_df['ml_engineer_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('machine learning engineer', x)))
train_df['analytics_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('analytics', x)))
train_df['degree_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('master degree', x)))
train_df['degree_counts_2'] = train_df['resumes'].apply(lambda x: len(re.findall('msc', x)))
train_df['degree_counts_3'] = train_df['resumes'].apply(lambda x: len(re.findall('degree', x)))
train_df['deep_learning_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('deep learning', x)))
train_df['tf_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('tensorflow', x)))
train_df['neural_network_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('neural network', x)))
train_df['nlp_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('natural language processing', x)))
train_df['nlp_counts_2'] = train_df['resumes'].apply(lambda x: len(re.findall('nlp', x)))
train_df['pyspark_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('pyspark', x)))
train_df['hadoop_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('hadoop', x)))
train_df['data_analysis_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('data analysis', x)))
train_df['lustering_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('clustering', x)))
train_df['lr_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('logistic regression', x)))
train_df['classification_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('classification', x)))
train_df['sk_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('sciKit learn', x)))
train_df['pytorch_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('pytorch', x)))
train_df['cnn_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('cnn', x)))
train_df['rnn_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('rnn', x)))
train_df['gans_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('gans', x)))
train_df['nltk_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('nltk', x)))
train_df['spacy_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('spacy', x)))
train_df['transformer_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('transformer', x)))
train_df['django_counts'] = train_df['resumes'].apply(lambda x: len(re.findall('django', x)))

In [None]:
train_df['feature1'] = train_df['neural_network_counts'] + train_df['tf_counts'] + train_df['ml_counts']


In [None]:
# test features
test_df['words_counts'] = test_df['resumes'].apply(lambda x: len(str(x).split()))
test_df['char_counts'] = test_df['resumes'].apply(lambda x: len(str(x)))
test_df['avg_word_len'] = test_df['char_counts']/train_df['words_counts']
test_df['ml_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('machine learning', x)))
test_df['ml_engineer_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('machine learning engineer', x)))
test_df['analytics_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('analytics', x)))
test_df['degree_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('master degree', x)))
test_df['degree_counts_2'] = test_df['resumes'].apply(lambda x: len(re.findall('msc', x)))
test_df['degree_counts_3'] = test_df['resumes'].apply(lambda x: len(re.findall('degree', x)))
test_df['deep_learning_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('deep learning', x)))
test_df['tf_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('tensorflow', x)))
test_df['neural_network_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('neural network', x)))
test_df['nlp_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('natural language processing', x)))
test_df['nlp_counts_2'] = test_df['resumes'].apply(lambda x: len(re.findall('nlp', x)))
test_df['pyspark_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('pyspark', x)))
test_df['hadoop_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('hadoop', x)))
test_df['data_analysis_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('data analysis', x)))
test_df['lustering_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('clustering', x)))
test_df['lr_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('logistic regression', x)))
test_df['classification_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('classification', x)))
test_df['sk_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('sciKit learn', x)))
test_df['pytorch_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('pytorch', x)))
test_df['cnn_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('cnn', x)))
test_df['rnn_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('rnn', x)))
test_df['gans_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('gans', x)))
test_df['nltk_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('nltk', x)))
test_df['spacy_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('spacy', x)))
test_df['transformer_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('transformer', x)))
test_df['django_counts'] = test_df['resumes'].apply(lambda x: len(re.findall('django', x)))


In [None]:
test_df['feature1'] = test_df['neural_network_counts'] + test_df['tf_counts'] + test_df['ml_counts']

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
max_vocab_length = 10000
max_length = 100

text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace", 
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=None)



text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [None]:
filtered_resume_list_train = list(train_df.resumes)
filtered_resume_list_test = list(test_df.resumes)

In [None]:
filtered_resume_list_train[0]

In [None]:
import warnings
warnings.filterwarnings(action = 'ignore')
  
import gensim
from gensim.models import Word2Vec

In [None]:
text_vectorizer.adapt(filtered_resume_list_train)
text_vectorizer.adapt(filtered_resume_list_test)

In [None]:
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return round(mi_scores, 3)


In [None]:
print(train_df.head(5))

In [None]:
train_df.shape, test_df.shape

In [None]:
train_df.drop('resumes', axis = 1, inplace=True)
test_df.drop('resumes', axis = 1, inplace=True)

In [None]:
x = train_df.drop('Match Percentage', axis = 1)
y = train_df['Match Percentage']

In [None]:
mi_score1 = make_mi_scores(x, y)
print(mi_score1)

In [None]:
# feature selection
from sklearn.feature_selection import SelectKBest, chi2, f_regression
selector = SelectKBest(f_regression, k=25)
X = selector.fit_transform(x, y)
test_df = selector.transform(test_df)

In [None]:
#splitting the dataset into train and test set.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.15, random_state = 31)

In [None]:
len(x_train), len(x_test), len(y_train), len(y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor().fit(x_train, y_train)
rf_reg.score(x_test, y_test)

In [None]:
def submission(model, test):
    test1 = pd.read_csv('dataset/test.csv')
    preds = model.predict(test)
    prediction = pd.DataFrame(preds, columns = ['Match Percentage'])
    sub_df = pd.concat([test1, prediction], axis = 1)
    return sub_df

sub = submission(rf_reg, test_df)
sub.to_csv('submission file/Submission-7.csv')
print(sub.head())

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
%%time

rfcv_grid = {"n_estimators": np.arange(100, 1200, 100),
            'criterion' : ['mse', 'mae'],
           "max_depth": [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           'max_features' : ['auto', 'sqrt']}

rfcv_clf = RandomizedSearchCV(RandomForestRegressor(),
                           param_distributions = rfcv_grid,
                           cv=5,
                           n_iter=300,
                           verbose=True)

rfcv_clf.fit(x, y)

In [None]:
rfcv_clf.best_params_