In [6]:
# import libraries  
import numpy as np
import pandas as pd
import nltk
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns

# spacy for basic preprocessing, optional, can use nltk as well (lemmatisation etc.)
import spacy

import warnings
warnings.filterwarnings('ignore')

## 1. Read and inspect the data

In [7]:
data = pd.read_csv("potential-talents.csv")
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [8]:
data.fit.value_counts()

Series([], Name: count, dtype: int64)

In [9]:
data['fit'].unique()

array([nan])

We see that the fit column has all values as Nan, so we can safely drop it.

In [10]:
data.drop('fit', axis=1, inplace=True)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          104 non-null    int64 
 1   job_title   104 non-null    object
 2   location    104 non-null    object
 3   connection  104 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.4+ KB


## 2. Data Cleaning and Preprocessing

In [12]:
# Checking for null values
data.isnull().sum()

id            0
job_title     0
location      0
connection    0
dtype: int64

In [13]:
# Finding unique values in each column
for col in data.columns:
    print(f'No. of unique values in {col} column = {data[col].nunique()}')

No. of unique values in id column = 104
No. of unique values in job_title column = 52
No. of unique values in location column = 41
No. of unique values in connection column = 33


Here we see that id column values are almost double the job title column values. So there is a **possibility for duplicate entries** which can be deleted.

In [14]:
# Check for duplicate entries without id column
temp = data.drop(['id'], axis=1)
temp.head()

Unnamed: 0,job_title,location,connection
0,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,People Development Coordinator at Ryan,"Denton, Texas",500+
4,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


In [15]:
temp.shape

(104, 3)

In [16]:
# Now finding duplicates and deleting them
temp.duplicated().sum()

51

As we see there are 51 columns which are duplicate of each other, we can delete these.

In [17]:
new_df = temp.drop_duplicates()
new_df.shape

(53, 3)

In [18]:
df = pd.concat([data['id'], new_df], axis=1).dropna()
df = df.reset_index(drop = True)


In [19]:
df.shape

(53, 4)

In [21]:
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


#### a) Now taking the main query column that is JOB_TITLE and preprocessing it.

In [22]:
df['job_title'].value_counts()

job_title
Aspiring Human Resources Professional                                                                                    2
2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 1
Lead Official at Western Illinois University                                                                             1
Senior Human Resources Business Partner at Heil Environmental                                                            1
Aspiring Human Resources Professional | An energetic and Team-Focused Leader                                             1
HR Manager at Endemol Shine North America                                                                                1
Human Resources professional for the world leader in GIS software                                                        1
RRP Brand Portfolio Executive at JTI (Japan Tobacco International)                                                       1
Inform

From the above we can see a number of things for **text pre-processing**:
1. Expanding abbreviations and new lines and numbers
2. Convert to lower case and remove unwanted characters like punctuation symbols from the text. - what we see is characters like | or ! or \n or opening closing brackets which need to be handled.
3. Then we will remove **stop words** and go for stemming or/and lemmatization.

In [23]:
# Expanding abbreviations and removing \n and numbers
df.replace({'job_title' : { 'HR' : 'human resources', 'GIS': 'geographic information system',
                           'EY': 'Ernst & Young Global Limited', 'MES': 'Manufacturing execution systems',
                           'CHRO' : 'chief human resources officer', 'SVP' : 'senior vice president',
                           'CSR' : 'corporate social responsibility', 'GPHR' : 'global professional in human resources',
                           'SPHR' : 'strategic and policy-making certification', 'HRIS' : 'human resources management system',  
                           '\n': '', '\w*\d\w*': ''
    }}, regex=True, inplace=True)


In [24]:
# Removing punctuation and changing to lowercase
punct_chars = set(string.punctuation)
df['job_title'] = df['job_title'].apply(lambda x: ''.join(char for char in x if char not in punct_chars))
df['job_title'] = df['job_title'].str.lower()

df['job_title'].value_counts()

job_title
aspiring human resources professional                                                                                                                                                     2
 ct bauer college of business graduate magna cum laude and aspiring human resources professional                                                                                          1
lead official at western illinois university                                                                                                                                              1
senior human resources business partner at heil environmental                                                                                                                             1
aspiring human resources professional  an energetic and teamfocused leader                                                                                                                1
human resources manager at endemol shine north ame

In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [26]:
stop_nltk = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

In [27]:
def preprocess_text(dataframe, col):
    for i in range(len(dataframe)):
        tokens = word_tokenize(dataframe.iloc[i][col])
        tokens_no_stopwords = [word for word in tokens if word not in stop_nltk]
        final_sentence = []
        for word in tokens_no_stopwords:
            final_sentence.append(lemmatiser.lemmatize(word))
        dataframe[col][i]=' '.join([str(elem) for elem in final_sentence])

In [28]:
nltk.download('punkt')
nltk.download('wordnet')
preprocess_text(df, 'job_title')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CHARU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CHARU\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
df

Unnamed: 0,id,job_title,location,connection
0,1,ct bauer college business graduate magna cum l...,"Houston, Texas",85
1,2,native english teacher epik english program korea,Kanada,500+
2,3,aspiring human resource professional,"Raleigh-Durham, North Carolina Area",44
3,4,people development coordinator ryan,"Denton, Texas",500+
4,5,advisory board member celal bayar university,"İzmir, Türkiye",500+
5,6,aspiring human resource specialist,Greater New York City Area,1
6,7,student humber college aspiring human resource...,Kanada,61
7,8,human resource senior specialist,San Francisco Bay Area,500+
8,10,seeking human resource human resourcesis gener...,Greater Philadelphia Area,500+
9,11,student chapman university,"Lake Forest, California",2


### b) Now similar preprocessing for LOCATION

In [30]:
df['location'].value_counts()

location
Houston, Texas Area                    4
Raleigh-Durham, North Carolina Area    3
Greater New York City Area             3
Austin, Texas Area                     2
Amerika Birleşik Devletleri            2
Kanada                                 2
Greater Philadelphia Area              2
Greater Atlanta Area                   2
Torrance, California                   1
Highland, California                   1
Gaithersburg, Maryland                 1
Baltimore, Maryland                    1
Milpitas, California                   1
Greater Chicago Area                   1
Houston, Texas                         1
Long Beach, California                 1
Chattanooga, Tennessee Area            1
Bridgewater, Massachusetts             1
Lafayette, Indiana                     1
Kokomo, Indiana Area                   1
Las Vegas, Nevada Area                 1
Cape Girardeau, Missouri               1
Greater Los Angeles Area               1
Los Angeles, California                1
Dallas/

In [31]:
# Removing punctuations and correcting misspelt word(s)
punct_chars = punct_chars.difference(set([',']))
df['location'] = df['location'].apply(lambda x: ''.join(char for char in x if char not in punct_chars))
df['location'] = df['location'].str.lower()
df.replace({'location' : {'amerika birleşik devletleri' : 'united states of america',
                           'kanada' : 'canada', 'türkiye' : 'turkey',
    }}, regex=True, inplace=True)
df

Unnamed: 0,id,job_title,location,connection
0,1,ct bauer college business graduate magna cum l...,"houston, texas",85
1,2,native english teacher epik english program korea,canada,500+
2,3,aspiring human resource professional,"raleighdurham, north carolina area",44
3,4,people development coordinator ryan,"denton, texas",500+
4,5,advisory board member celal bayar university,"i̇zmir, turkey",500+
5,6,aspiring human resource specialist,greater new york city area,1
6,7,student humber college aspiring human resource...,canada,61
7,8,human resource senior specialist,san francisco bay area,500+
8,10,seeking human resource human resourcesis gener...,greater philadelphia area,500+
9,11,student chapman university,"lake forest, california",2


In [32]:
preprocess_text(df, 'location')
df

Unnamed: 0,id,job_title,location,connection
0,1,ct bauer college business graduate magna cum l...,"houston , texas",85
1,2,native english teacher epik english program korea,canada,500+
2,3,aspiring human resource professional,"raleighdurham , north carolina area",44
3,4,people development coordinator ryan,"denton , texas",500+
4,5,advisory board member celal bayar university,"i̇zmir , turkey",500+
5,6,aspiring human resource specialist,greater new york city area,1
6,7,student humber college aspiring human resource...,canada,61
7,8,human resource senior specialist,san francisco bay area,500+
8,10,seeking human resource human resourcesis gener...,greater philadelphia area,500+
9,11,student chapman university,"lake forest , california",2


### b) Preprocessing for CONNECTION

In [33]:
#We need to normalize the connections to be between 0-1. We will count 500+ as 500

# Function to normalize scores to between 0-1
def normalize_score(score):
    if score == '500+ ':
        score = 500
    return float(score)/500

df['connection'] = df['connection'].apply(normalize_score)
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,ct bauer college business graduate magna cum l...,"houston , texas",0.17
1,2,native english teacher epik english program korea,canada,1.0
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088
3,4,people development coordinator ryan,"denton , texas",1.0
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0


## 3. Word Embeddings

We will now convert the words or tokens into vectors , ie, continuous-valued vectors in a high-dimensional space to capture the semantic relationships between words. The methods we will use are :
1) **TF-IDF** is a classic metric that calculates the importance of a term in a document relative to a collection of documents.
2) **Word2Vec** It is a shallow neural network-based model that learns word embeddings by predicting words in their context (CBOW) or predicting context words given a target word (Skip-gram).
3) **Glove** It is a count-based model that learns word embeddings based on the co-occurrence statistics of words in a large corpus.
4) **FastText** It represents words as the sum of their constituent subword embeddings (character n-grams) for capturing more meaning.
5) **BERT** It is a contextual embedding model that pre-trains on massive text corpora and captures word meanings in context.

### Chosen Performance metric : Cosine Similarity

We need to find how fit the candidate is for the role. As stated in the problem statement, it should be a **number**, like probability between 0-1. So for this case **cosine similarity** seems the closest needed to fit our criterion. As it is apt for text-oriented searches through which we can gauge the **similarity between two vectors**, such as query and document vectors.
As our use case is one of ranking and performing **document search** , cosine similarity is the apt metric. 
<p><i>NOTE:</i> The <u>final ranking fitness</u> score will also take into account the starring feedback loop action.</p>

#### 1) Beginning with TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer instantiation
tfidf = TfidfVectorizer()

# Create tfidf matrix
tf_matrix = tfidf.fit_transform(df['job_title'])

# Shape of matrix
print(tf_matrix.shape)

(53, 183)


In [35]:
data = pd.DataFrame(tf_matrix.toarray(), columns=tfidf.get_feature_names_out())
data

Unnamed: 0,administration,administrative,admission,advisory,always,america,analyst,analytics,army,art,...,vice,victoria,wellington,western,westfield,within,woodland,work,world,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.426332,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
def preprocessing_keywords(keywords):
    keywords = re.sub('[^a-zA-Z]', ' ', keywords) # remove punctuations and numbers
    keywords = str(keywords).lower()              # lowercase
    tokens = word_tokenize(keywords)          
    keywords_no_stopwords = [word for word in tokens if word not in stop_nltk]
    keywords = [lemmatiser.lemmatize(word=word) for word in keywords_no_stopwords]
    keywords = ' '.join(keywords)
    return keywords

#### Let us start with our search term keywords as "Aspiring human resources" 

In [37]:
keyword = 'Aspiring human resources'

# Preprocess these words
keyword_cleaned = preprocessing_keywords(keyword)
print(f'{keyword_cleaned}')

aspiring human resource


In [38]:
tfidf_keyword = tfidf.transform([keyword_cleaned])
print(tfidf_keyword.shape)

(1, 183)


In [39]:
# Now computing cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
cos_sim = cosine_similarity(tf_matrix, tfidf_keyword)

cos_data = { 'cosine_similarity' : cos_sim.ravel()}
cos_df = pd.DataFrame(cos_data)
cos_df.head()

Unnamed: 0,cosine_similarity
0,0.251387
1,0.0
2,0.76765
3,0.0
4,0.0


In [41]:
## Creating a final dataset with the performance metrics appended
final_df = pd.concat([df, cos_df], axis=1)
final_df.head()

Unnamed: 0,id,job_title,location,connection,cosine_similarity
0,1,ct bauer college business graduate magna cum l...,"houston , texas",0.17,0.251387
1,2,native english teacher epik english program korea,canada,1.0,0.0
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
3,4,people development coordinator ryan,"denton , texas",1.0,0.0
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0


In [58]:
# Let us sort the dataframe to have the job_titles which have closest fit to our keywords search query
sorted_df = final_df.sort_values(['cosine_similarity'], ascending=False)
sorted_df.head(6)

Unnamed: 0,id,job_title,location,connection,cosine_similarity
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406
21,73,aspiring human resource manager seeking intern...,"houston , texas area",0.014,0.612429
12,27,aspiring human resource management student see...,"houston , texas area",1.0,0.429728
20,72,business management major aspiring human resou...,"monroe , louisiana area",0.01,0.427448


In [49]:
sorted_df['job_title'].head(6).tolist()

['aspiring human resource professional',
 'aspiring human resource professional',
 'aspiring human resource specialist',
 'aspiring human resource manager seeking internship human resource',
 'aspiring human resource management student seeking internship',
 'business management major aspiring human resource manager']

Here we find the top 5 <b>job_titles</b> similar to <b><i>"Aspiring human resources"</b></i> are: 
* aspiring human resource professional with similarity 76.76%
* aspiring human resource specialist with similarity 67.64%
* aspiring human resource manager seeking internship human resource with similarity 61.2%
* aspiring human resource management student seeking internship with similarity 42.97%
* business management major aspiring human resource manager with similarity 42.74%

In [63]:
pd.set_option('display.max_colwidth', None)

In [60]:
## Function to do above all steps in one go
def top_n_talent(n, job_title):
    keyword_cleaned = preprocessing_keywords(job_title)
    tfidf_keyword = tfidf.transform([keyword_cleaned])
    cos_sim = cosine_similarity(tf_matrix, tfidf_keyword)
    cos_data = { 'cosine_similarity' : cos_sim.ravel()}
    cos_df = pd.DataFrame(cos_data)
    temp_df = pd.concat([df, cos_df], axis=1)
    sorted_df = temp_df.sort_values(['cosine_similarity'], ascending=False)
    talents = sorted_df['job_title'].head(n-1)
    return sorted_df.head(n)

In [65]:
top_n_talent(4, 'Aspiring human resources')

Unnamed: 0,id,job_title,location,connection,cosine_similarity
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429


#### Let us also check for keywords: "seeking human resources"

In [64]:
top_n_talent(7, 'seeking human resources')

Unnamed: 0,id,job_title,location,connection,cosine_similarity
47,99,seeking human resource position,"la vega , nevada area",0.096,0.697555
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.67434
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.627828
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.518121
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.455532
48,100,aspiring human resource manager graduating may seeking entrylevel human resource position st louis,"cape girardeau , missouri",0.206,0.382513
22,74,human resource professional,greater boston area,0.032,0.379757
