In [None]:
import re

import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, models, evaluation, losses, util
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from torch.utils.data import DataLoader

# Getting the data

In [None]:
# Reading and saving to CSV
def get_data_csv(path='data.csv'):
    """Load the dataset from a CSV file and write it straight back as UTF-8.

    The frame is re-saved to the same file without the index column, so the
    file on disk ends up UTF-8 encoded with the columns that were read.

    Parameters
    ----------
    path : str or path-like, default 'data.csv'
        Location of the CSV file that is read and then overwritten.

    Returns
    -------
    pandas.DataFrame
        The data as read from ``path``.
    """
    df_data = pd.read_csv(path)
    # Round-trip the file so the copy on disk is UTF-8 and has no index column
    df_data.to_csv(path, index=False, encoding='utf-8')
    return df_data

In [None]:
# Load the dataset (the helper also rewrites the CSV file it reads from)
df_data = get_data_csv()
# Preview only the first rows instead of rendering the whole frame
df_data.head()

# Removing digits

In [None]:
# Strip every run of digits from the CV and job texts.
# The pattern is a raw string so the backslash in \d is not treated as a
# (invalid) string escape. The vectorized .str accessor replaces the per-row
# lambda; missing values stay missing instead of raising inside re.sub.
df_data["cvs"] = df_data["curriculos"].str.replace(r'\d+', '', regex=True)
df_data["jobs"] = df_data["vagas"].str.replace(r'\d+', '', regex=True)

# Transforming relevance scores into similarity scores

In [None]:
# Scaler used to rescale the relevance grades. set_output returns the
# estimator itself, so requesting DataFrame output can be chained onto
# construction.
min_max_scaler = MinMaxScaler().set_output(transform='pandas')

In [None]:
# Rescale the grades in "notas" and store them in a new "scores" column.
# The values are reshaped into a single-column 2-D array before scaling.
notas_column = df_data["notas"].values.reshape(-1, 1)
df_data["scores"] = min_max_scaler.fit_transform(notas_column)
# Resulting mapping when the grades present in the data span 1 to 5:
#   1 -> 0.00
#   2 -> 0.25
#   3 -> 0.50
#   4 -> 0.75
#   5 -> 1.00
df_data

In [None]:
# Removing unnecessary columns.
# DataFrame.drop returns a new frame rather than modifying df_data, so the
# result is assigned back. errors='ignore' keeps the cell safe to re-run once
# the columns are already gone.
df_data = df_data.drop(columns=['curriculos', 'vagas', 'notas'], errors='ignore')
df_data.head()

# Getting the data ready for training

In [None]:
# Wrap every row as an InputExample: the (cv, job) text pair plus its score
data_examples = [
    InputExample(texts=[row['cvs'], row['jobs']], label=row['scores'])
    for _, row in df_data.iterrows()
]

# Shuffle with a fixed seed, then cut into 60% training, 20% validation and
# the remainder (about 20%) for tests.
data_examples = shuffle(data_examples, random_state=42)
n_examples = len(data_examples)
train_index = int(n_examples * 0.6)
val_index = int(n_examples * 0.2)
val_end = train_index + val_index

train_examples = data_examples[:train_index]
val_examples = data_examples[train_index:val_end]
test_examples = data_examples[val_end:]

# Only the training split is wrapped in a PyTorch DataLoader here
train_dataloader = DataLoader(train_examples, batch_size=4, shuffle=True)

# Training

In [None]:
checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Assemble the model by hand: a transformer encoder followed by a pooling layer
word_embedding_model = models.Transformer(checkpoint, cache_dir=f'model/{checkpoint}')
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
# NOTE(review): CLS pooling is selected explicitly here; the published
# checkpoint presumably ships with its own pooling configuration - confirm
# that overriding it is intended.
pooling_model = models.Pooling(embedding_dimension, pooling_mode='cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Loss for the training pairs and an evaluator built from the validation split
train_loss = losses.CosineSimilarityLoss(model)
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='sbert')

# Fine-tune for 5 epochs; output is written under model_FT/<checkpoint>
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=5,
    show_progress_bar=True,
    output_path=f'model_FT/{checkpoint}',
)

# Testing a job recommendation system for a CV

In [None]:
# Sample CV used to query the fine-tuned model
cv_test = 'Nome: Laura Costa - Objetivo: Busco uma posição como Analista Econômico, onde posso aplicar minha formação acadêmica em Economia e aprimorar minhas habilidades em análise econômica. Formação Acadêmica: Bacharelado em Economia - Universidade Federal de Estado Y (-) Experiência Profissional: Assistente de Análise Econômica - Empresa de Consultoria Econômica LTDA - Cidade Financeira, Estado Y (-Presente) Coleta de dados econômicos. Auxílio na elaboração de relatórios e análises. Habilidades: Conhecimentos intermediários em análise econômica. Familiaridade com ferramentas como Excel e SPSS. Idiomas: Inglês: Avançado Espanhol: Básico'

# Unique job texts from the test split. dict.fromkeys de-duplicates while
# keeping first-seen order, so the list (and every index into it) is the same
# on each run; a set of strings has no stable order across interpreter runs.
jobs_test = list(dict.fromkeys(test_example.texts[1] for test_example in test_examples))

In [None]:
# Embed the query CV and all candidate jobs. Passing the whole list lets
# encode() process the jobs in batches instead of one call per job.
cv_embedding = model.encode(cv_test)
jobs_embedding = model.encode(jobs_test)
# util is imported from sentence_transformers in the imports cell.
# Row 0 of the result holds the CV's cosine similarity to each job.
similarity_score = util.cos_sim(cv_embedding, jobs_embedding)

In [None]:
# Pair each job's position in the job list with its similarity to the CV
pairs = [
    {"index": job_position, "score": job_score}
    for job_position, job_score in enumerate(similarity_score[0])
]

# Order the pairs from the highest score to the lowest
pairs.sort(key=lambda entry: entry["score"], reverse=True)

In [None]:
# Show the CV, then the five best-ranked jobs with their scores
print(f' CV: {cv_test} \n\n')
top_pairs = pairs[:5]
for pair in top_pairs:
    job_text = jobs_test[pair["index"]]
    print(f' Job: {job_text} \n Predicted similarity score after fine-tuning: {pair["score"]} \n')