# Setup

## 1. Importing necessary libraries

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install scikit-learn

!pip install tensorflow
!pip install transformers
!pip install sentence-transformers

!pip install nltk
!pip install spacy
!python -m spacy download en

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import random

import re

import tensorflow as tf
import transformers

import spacy
from nltk.stem import PorterStemmer

In [None]:
pd.set_option('display.max_colwidth', 200)

## 2. Loading dataset

In [None]:
# Read the raw CSV into a DataFrame and preview its first rows.
dataset_original = pd.read_csv('./data/raw/data.csv')
dataset_original.head()

# Inspecting & Cleaning the dataset

In [None]:
dataset_cleaned_temp = dataset_original.copy()

In [None]:
dataset_cleaned_temp.info()

In [None]:
dataset_cleaned_temp.describe()

### Checking missing data

In [None]:
dataset_cleaned_temp.isnull().sum()

### Checking duplicates

In [None]:
dataset_cleaned_temp.duplicated().sum()

### Remove unnecessary words & Replace abbreviations

In [None]:
dataset_cleaned_temp['job_title'].value_counts()

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; clean_sentence uses it for
# stop-word detection and lemmatization.
spacy_nlp = spacy.load('en_core_web_sm')
# Names of the pipeline components (inspection only; the value is not assigned).
spacy_nlp.pipe_names

# NLTK Porter stemmer, applied word by word in clean_sentence.
stemmer = PorterStemmer()

In [None]:
# Maps each abbreviation that appears in job titles to its spelled-out form.
abbreviations_to_replace = {
    'GPHR': 'Global Professional in Human Resources',
    'CSR': 'Corporate Social Responsibility',
    'MES': 'Manufacturing Execution Systems',
    'SPHR': 'Senior Professional in Human Resources',
    'SVP': 'Senior Vice President',
    'GIS': 'Geographic Information System',
    'RRP': 'Reduced Risk Products',
    'CHRO': 'Chief Human Resources Officer',
    'HRIS': 'Human resources information system',
    'HR': 'Human resources',
}


def replace_abbreviations(sentence):
    """Return *sentence* with every known abbreviation spelled out.

    Each key of ``abbreviations_to_replace`` is matched case-insensitively and
    only as a whole word, then substituted with its value. The substitutions
    are applied one abbreviation at a time, in dictionary order.
    """
    expanded = sentence
    for short_form in abbreviations_to_replace:
        # Escape the key so it is matched literally, and anchor it on word
        # boundaries so that e.g. "hr" inside "three" is left untouched.
        whole_word = re.compile(r'\b' + re.escape(short_form) + r'\b', re.IGNORECASE)
        expanded = whole_word.sub(abbreviations_to_replace[short_form], expanded)
    return expanded

In [None]:
def clean_sentence(sentence):
    """Normalize a job title into a space-joined string of processed tokens.

    Steps, in order: delete the listed punctuation characters, expand known
    abbreviations with ``replace_abbreviations``, Porter-stem every
    whitespace-separated word, run the stems through the spaCy pipeline, and
    keep the lemma of each token that spaCy does not flag as a stop word.
    """
    # Delete special characters outright (nothing is inserted in their place)
    stripped = re.sub(r'[+*,.|(){}&\-\']', '', sentence)

    # Replace abbreviations
    expanded = replace_abbreviations(stripped)

    # Stemming
    stems = [stemmer.stem(word) for word in expanded.split()]

    # Lemmatization, skipping stop words
    parsed = spacy_nlp(" ".join(stems))
    lemmas = [token.lemma_ for token in parsed if not token.is_stop]

    return " ".join(lemmas)

In [None]:
# Run clean_sentence over every job title, store the result in a new column,
# and print the first cleaned values.
dataset_cleaned_temp['job_title_cleaned'] = dataset_cleaned_temp['job_title'].apply(clean_sentence)
print(dataset_cleaned_temp['job_title_cleaned'].head())

In [None]:
# Print the vocabulary of the cleaned titles: every distinct
# whitespace-separated token in the column. A set comprehension gathers the
# tokens in a single pass, instead of concatenating the per-row lists with
# `sum`, which re-copies the growing list for each row and breaks on an empty
# column.
print([*{token for title in dataset_cleaned_temp["job_title_cleaned"] for token in title.split()}])

In [None]:
dataset_cleaned = dataset_cleaned_temp.copy()

# Preprocessing

In [None]:
dataset_preprocessed = dataset_cleaned.copy()

## Setup BERT & Utils

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and TensorFlow BERT model
# that get_bert_embeddings relies on.
from transformers import BertTokenizer, TFBertModel
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

In [None]:
def get_bert_embeddings(sentences):
    """Return one mean-pooled BERT vector per sentence as a NumPy array.

    Each sentence is tokenized and run through ``bert_model`` on its own. The
    model's ``last_hidden_state`` is averaged over axis 1 (presumably the
    token axis -- TODO confirm) and flattened to 1-D.

    *sentences* is iterated directly, so pass a list of strings; a bare string
    would be processed character by character.
    """
    def _embed(text):
        # Tokenize a single sentence into TensorFlow tensors
        encoded = bert_tokenizer(text, padding=True, truncation=True, return_tensors='tf')
        token_states = bert_model(encoded).last_hidden_state
        # Pooling strategy: plain average, then flatten to a 1-D vector
        return tf.reduce_mean(token_states, axis=1).numpy().reshape(-1)

    return np.array([_embed(text) for text in sentences])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def encode_and_get_similarity(data, queries, search_columns, output_columns):
    """Score the rows of *data* against the mean embedding of *queries*.

    Args:
        data: DataFrame holding the text columns to search. It is copied, so
            the caller's frame is not modified.
        queries: Iterable of query strings. Each one is passed through
            ``replace_abbreviations`` and ``clean_sentence`` before being
            embedded; the embeddings are then averaged into one query vector.
        search_columns: Names of the columns in *data* whose text is embedded.
        output_columns: Names of the columns that receive the cosine
            similarities; ``output_columns[i]`` pairs with
            ``search_columns[i]``.

    Returns:
        A copy of *data* with one similarity column added (or overwritten)
        per search column.

    Raises:
        ValueError: If *queries* yields no query at all.
    """
    data = data.copy()

    # Without replacing the abbreviations with their full meaning, we will
    # get very bad results.
    queries_embeddings = [
        get_bert_embeddings([clean_sentence(replace_abbreviations(query))])
        for query in queries
    ]
    if not queries_embeddings:
        # Fail early with a clear message instead of averaging an empty list.
        raise ValueError("queries must contain at least one search phrase")

    # Each entry is a one-row array, so averaging over axis 0 collapses the
    # queries while keeping that leading row dimension.
    queries_embeddings_mean = np.mean(queries_embeddings, axis=0)

    for index, column in enumerate(search_columns):
        # Read the text from the frame that was passed in, not from the
        # notebook-level `dataset_preprocessed`, so the scores always line up
        # with the rows of `data`.
        sentences = data[column].tolist()

        # Encoding
        column_embeddings = get_bert_embeddings(sentences)

        # Cosine similarity; row 0 belongs to the single averaged query.
        cosine_similarities = cosine_similarity(
            queries_embeddings_mean,
            column_embeddings,
        )
        data[output_columns[index]] = cosine_similarities[0]

    return data

# Ranking

## Search Queries/Keywords

In [None]:
# Search phrases; encode_and_get_similarity averages their embeddings into a
# single query vector.
queries = [
    # 'Aspiring Human Resources Professional',
    'aspiring human resources',
    'seeking human resources'
]

## Get Embeddings & Similarities

In [None]:
# Score every cleaned job title against the queries and store the cosine
# similarity in the 'bert_similarity' column.
dataset_preprocessed = encode_and_get_similarity(dataset_preprocessed, queries, ['job_title_cleaned'], ['bert_similarity'])

## First Rank

In [None]:
dataset_preprocessed.sort_values(by='bert_similarity', ascending=False).head(20)

## Starred Candidates

Mark them as favorite/bookmark

In [None]:
# Ask for the ids to star, split the answer on whitespace, and convert each
# piece to an int.
raw_starred_ids = input("Enter the ids of the candidates you want to star (separate by spaces): ")
starred_ids = list(map(int, raw_starred_ids.split()))

## Second Rank (Re-Rank)

- similar to bookmark
- First way: Merging the keyphrase and the starred title
- Second way: one more column of scores (starred), use the starred job title as a keyword

In [None]:
# Flag each candidate: 1 when its id is among starred_ids, 0 otherwise.
# The membership mask is computed once and reused for both assignments.
starred_mask = dataset_preprocessed['id'].isin(starred_ids)
dataset_preprocessed.loc[starred_mask, 'is_starred'] = 1
dataset_preprocessed.loc[~starred_mask, 'is_starred'] = 0

In [None]:
def get_starred_score(data):
    """Average each row's similarity to the starred candidates' cleaned titles.

    Every row of *data* whose ``is_starred`` equals 1 contributes its
    ``job_title_cleaned`` as a one-element query to
    ``encode_and_get_similarity``. The resulting ``starred_similarity``
    columns are averaged element-wise with ``np.mean(..., axis=0)`` and the
    average is returned. Work happens on a copy of *data*.
    """
    frame = data.copy()

    # Cleaned titles of the starred rows serve as the queries
    starred_titles = frame.loc[frame['is_starred'] == 1, 'job_title_cleaned']

    per_title_scores = []
    for title in starred_titles:
        print('START: ' + title)
        frame = encode_and_get_similarity(frame, [title], ['job_title_cleaned'], ['starred_similarity'])
        per_title_scores.append(frame['starred_similarity'])

    return np.mean(per_title_scores, axis=0)

In [None]:
dataset_preprocessed['starred_similarity'] = get_starred_score(dataset_preprocessed)

In [None]:
# Combine both scores: row-wise mean of the query similarity and the starred similarity.
dataset_preprocessed['mean_similarity'] = dataset_preprocessed[['bert_similarity', 'starred_similarity']].mean(axis=1)

In [None]:
# Show the top 20 candidates, ordered by mean_similarity and then is_starred, both descending.
dataset_preprocessed[['job_title', 'is_starred', 'bert_similarity', 'starred_similarity', 'mean_similarity']].sort_values(by=['mean_similarity', 'is_starred'], ascending=False).head(20)