In [1]:
import numpy as np 
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [2]:
resumes = pd.read_csv('data/Resume1.csv')
resumes.head()

Unnamed: 0,Job category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [3]:
resumes2 = pd.read_csv('data/Resume2.csv')
resumes2.head()

Unnamed: 0,Job category,Resume
0,Data or business analyst;Data scientist or mac...,Skills 6-8 yearsNANANANA
1,Data or business analyst;Data scientist or mac...,Skills 3-5 yearsNANANANA
2,Data or business analyst;Data scientist or mac...,Skills 3-5 yearsNANANANA
3,Data or business analyst;Data scientist or mac...,Skills 3-5 yearsNANANANA
4,Data or business analyst;Data scientist or mac...,Skills 12-14 yearsC#;JavaScript;Python;Ruby;SQ...


In [4]:
dataset= pd.concat([resumes, resumes2], axis=0)
dataset.tail()

Unnamed: 0,Job category,Resume
1031,Data or business analyst;Data scientist or mac...,Skills 6-8 yearsC#;CoffeeScript;JavaScript;SQL...
1032,Data or business analyst;Data scientist or mac...,Skills 6-8 yearsJava;SQL;Bash/ShellMySQL;SQLit...
1033,Data or business analyst;Data scientist or mac...,Skills 12-14 yearsRuby;Swift;HTML;CSS;Bash/She...
1034,Data or business analyst;Data scientist or mac...,Skills 27-29 yearsC;C++;Python;Bash/ShellNALin...
1035,Data or business analyst;Data scientist or mac...,Skills 0-2 yearsNANANANA


In [6]:
dataset['Resume']=dataset['Resume'].str.lower()
print(dataset['Resume'])

0       skills * programming languages: python (pandas...
1       education details \r\nmay 2013 to may 2017 b.e...
2       areas of interest deep learning, control syste...
3       skills â¢ r â¢ python â¢ sap hana â¢ table...
4       education details \r\n mca   ymcaust,  faridab...
                              ...                        
1031    skills 6-8 yearsc#;coffeescript;javascript;sql...
1032    skills 6-8 yearsjava;sql;bash/shellmysql;sqlit...
1033    skills 12-14 yearsruby;swift;html;css;bash/she...
1034    skills 27-29 yearsc;c++;python;bash/shellnalin...
1035                             skills 0-2 yearsnananana
Name: Resume, Length: 1739, dtype: object


In [7]:
import re
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', str(resumeText))  # remove URLs
    resumeText = re.sub('RT|cc', ' ', str(resumeText))  # remove RT and cc
    resumeText = re.sub('#\S+', '', str(resumeText))  # remove hashtags
    resumeText = re.sub('@\S+', '  ', str(resumeText))  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', str(resumeText))  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', str(resumeText)) 
    resumeText = re.sub('nananana', ' ', str(resumeText))  # NANANA
    resumeText = re.sub('\s+', ' ', str(resumeText))  # remove extra whitespace
    return resumeText

In [8]:
dataset['cleaned_resume'] = dataset['Resume'].apply(lambda x: cleanResume(x))
print (dataset['cleaned_resume'][31])

31    education details may 2013 to may 2017 b e uit...
31    skills 27 29 yearsjava javascript sql vba vb n...
Name: cleaned_resume, dtype: object


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack


tfidf = TfidfVectorizer(sublinear_tf= True, #use a logarithmic form for frequency
                       min_df = 5, #minimum numbers of documents a word must be present in to be kept
                       norm= 'l2', #ensure all our feature vectors have a euclidian norm of 1
                       ngram_range= (1,2), #to indicate that we want to consider both unigrams and bigrams.
                       stop_words ='english')
features = tfidf.fit_transform(dataset['cleaned_resume']).toarray()

In [11]:
print(features)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
train_set=["""Overall 7-10+ years experience in technology, software services and / or implementation
5-6 years+ experienced Data analyst with experience in data structure, data mapping, migration activities
Strong experience in investigating an application from a data source, data mapping into database / other sources, field mapping standpoint
Strong experience in SQL, PLSQL, Oracle SQL to query and understand data mapping syntax
Past Data Analysis experience in an area that uses ETL tools like SSIS, Informatica, Datastage
Analyst roles in Data governance project will be considered beneficial
Strong communication, workshop facilitation, requirements gathering skills by working with a range of stakeholders – business, tech application owners, various technology sub-teams within anz bank
Ability to document data analysis outcome attribute in word / excel in concise and succinct manner
Ideally from banking technology background – Need to have knowledge of 1 banking domain for ex: payments, corp / institutional, markets, Lending
For strong candidates well versed in application Data analysis – this banking domain requirement can be waived off
Past experience in kafka – event streaming, understanding pub-sub mechanisms, understand the structure of data and understanding API calls will be highly desired
Strong willed, problem solving mindset and ability to deal with ambiguity"""]

In [13]:
y=tfidf.transform(train_set)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(features,y)

In [16]:
dataset["similarity"]=cosine_sim

In [18]:
df=dataset.sort_values(by=['similarity'],ascending=False)
df.head()

Unnamed: 0,Job category,Resume,cleaned_resume,similarity
14,Data Science,"education details \r\n mca ymcaust, faridab...",education details mca ymcaust faridabad haryan...,0.11
4,Data Science,"education details \r\n mca ymcaust, faridab...",education details mca ymcaust faridabad haryan...,0.11
24,Data Science,"education details \r\n mca ymcaust, faridab...",education details mca ymcaust faridabad haryan...,0.11
34,Data Science,"education details \r\n mca ymcaust, faridab...",education details mca ymcaust faridabad haryan...,0.11
343,DevOps Engineer,technical skills key skills ms technology .net...,technical skills key skills ms technology net ...,0.109709


In [19]:
print(cosine_sim)

[[0.08091123]
 [0.02474166]
 [0.07881404]
 ...
 [0.00238835]
 [0.00141725]
 [0.06131366]]
