In [1]:
from google.colab import drive

drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=6dd74fa22eddcd98840ee06dc16785336a3beb91d74c89c52273ee0aafd1714b
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

# Imports

In [3]:
import numpy as np
import pandas as pd
import torch
from scipy import spatial
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

In [9]:
def cosine_similarity(emb1, emb2):
    return 1 - spatial.distance.cosine(emb1, emb2)


def calc_sim(vac_embeddings, res_embeddings):
    d = {}
    for i in range(res_embeddings.shape[0]):
        d[i] = []
        for j in range(vac_embeddings.shape[0]):
            d[i].append(cosine_similarity(res_embeddings[i], vac_embeddings[j]))
    return d


# Для каждого резюме топ-10 вакансий
def calc_top10_vac(df):
    d = {}
    for i in range(100):
        d[i] = df[i].sort_values(ascending=False).head(10).index.tolist()
    return d


def calc_accuracy(top_10, labeling):
    tp, tn, fp, fn = 0, 0, 0, 0
    for vac in labeling["vacancy_index"].unique().tolist():
        tmp_df = labeling[labeling["vacancy_index"] == vac]
        for res in tmp_df["resume_index"].tolist():
            if tmp_df[tmp_df["resume_index"] == res]["is_relevant"].values[0] == 0:
                if res not in top_10[vac].tolist():
                    tn += 1
                else:
                    fp += 1
            else:
                if res in top_10[vac].tolist():
                    tp += 1
                else:
                    fn += 1
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print(f"tp: {tp}")
    print(f"tn: {tn}")
    print(f"fp: {fp}")
    print(f"fn: {fn}")
    print(f"accuracy: {accuracy}")

    return tp, tn, fp, fn, accuracy


def calc_f1(top_10_df, labeling):
    precision_list = []
    recall_list = []
    f1_list = []
    for vac in labeling["vacancy_index"].unique().tolist():
        tmp_df = labeling[labeling["vacancy_index"] == vac]
        relevant = tmp_df["resume_index"].tolist()
        top_10 = top_10_df[vac].tolist()
        precision_at_k = len(set(top_10).intersection(set(relevant))) / len(top_10)
        recall_at_k = len(set(top_10).intersection(set(relevant))) / len(relevant)
        f1_at_k = (
            2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k)
            if (precision_at_k + recall_at_k) > 0
            else 0
        )
        precision_list.append(precision_at_k)
        recall_list.append(recall_at_k)
        f1_list.append(f1_at_k)
    final_precision = np.mean(precision_list)
    final_recall = np.mean(recall_list)
    final_f1 = np.mean(f1_list)

    print(f"precision@10: {final_precision}")
    print(f"recall@10: {final_recall}")
    print(f"f1@10: {final_f1}")

    return final_precision, final_recall, final_f1

# Get data

In [5]:
labeling = pd.read_csv("/content/drive/MyDrive/itmo_resume_matching/labeling_2.csv")
vacancy = pd.read_csv("/content/drive/MyDrive/itmo_resume_matching/random100vac.csv")
resume = pd.read_csv("/content/drive/MyDrive/itmo_resume_matching/random100resume.csv")

vacancy["text_e5"] = vacancy["text"].apply(lambda x: "query: " + x)
resume["text_e5"] = resume["text"].apply(lambda x: "query: " + x)

In [6]:
labeling.head()

Unnamed: 0,is_relevant,score,interpretation,resume_index,vacancy_index
0,0,2,"Вакансия не подходит, поскольку опыт работы в ...",83,4
1,0,0,"Вакансия не соответствует опыту кандидата, так...",83,63
2,0,0,"Вакансия не релевантна, поскольку требуется сп...",83,3
3,0,0,"Отсутствие в резюме упоминания о знании PHP, L...",83,36
4,0,2,Не подходит: опыт работы соискателя значительн...,83,44


# multilingual-e5-large

In [7]:
model = SentenceTransformer("intfloat/multilingual-e5-large")

vacancy_embeddings = model.encode(
    vacancy["text_e5"].tolist(), normalize_embeddings=True
)
resume_embeddings = model.encode(resume["text_e5"].tolist(), normalize_embeddings=True)

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/546k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

In [8]:
res_vac_similarity = calc_sim(vacancy_embeddings, resume_embeddings)
res_vac_similarity_df = pd.DataFrame(res_vac_similarity)

top_10_vac = calc_top10_vac(res_vac_similarity_df)
top_10_vac_df = pd.DataFrame(top_10_vac)
top_10_vac_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,84,48,54,61,47,95,84,61,12,44,...,78,51,54,61,12,19,50,61,47,95
1,61,61,12,84,78,78,61,49,61,28,...,64,2,4,97,54,62,61,54,51,64
2,31,88,21,64,58,64,38,54,97,12,...,51,78,12,54,8,39,82,84,78,78
3,85,12,61,88,12,47,76,50,95,54,...,58,47,97,12,61,54,43,43,95,51
4,40,40,97,40,64,12,80,84,25,85,...,47,58,9,68,35,61,7,58,58,61


In [10]:
tp_e5, tn_e5, fp_e5, fn_e5, accuracy_e5 = calc_accuracy(top_10_vac_df, labeling)

tp: 0
tn: 122
fp: 6
fn: 19
accuracy: 0.8299319727891157


In [12]:
precision_e5, recall_e5, f1_e5 = calc_f1(top_10_vac_df, labeling)

precision@10: 0.007407407407407407
recall@10: 0.027777777777777776
f1@10: 0.011576900465789354


# paraphrase-multilingual-mpnet-base-v2

In [13]:
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

vacancy_embeddings = model.encode(vacancy["text"].tolist(), normalize_embeddings=True)
resume_embeddings = model.encode(resume["text"].tolist(), normalize_embeddings=True)

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [14]:
res_vac_similarity = calc_sim(vacancy_embeddings, resume_embeddings)
res_vac_similarity_df = pd.DataFrame(res_vac_similarity)

top_10_vac = calc_top10_vac(res_vac_similarity_df)
top_10_vac_df = pd.DataFrame(top_10_vac)
top_10_vac_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,3,88,3,15,51,51,88,3,58,94,...,51,51,51,88,33,62,88,11,51,51
1,13,94,45,58,91,96,51,88,43,58,...,91,96,11,61,93,71,71,3,91,91
2,61,29,16,85,2,2,58,58,7,88,...,78,3,78,56,79,88,58,51,3,63
3,32,71,24,90,78,63,61,15,90,85,...,96,91,91,78,39,78,50,88,63,78
4,58,85,17,88,64,78,78,71,50,15,...,63,2,3,51,35,51,43,71,96,2


In [15]:
tp_mpnet, tn_mpnet, fp_mpnet, fn_mpnet, accuracy_mpnet = calc_accuracy(
    top_10_vac_df, labeling
)

tp: 2
tn: 114
fp: 14
fn: 17
accuracy: 0.7891156462585034


In [16]:
precision_mpnet, recall_mpnet, f1_mpnet = calc_f1(top_10_vac_df, labeling)

precision@10: 0.019753086419753086
recall@10: 0.09156378600823045
f1@10: 0.031735753957976176


# paraphrase-multilingual-MiniLM-L12-v2

In [17]:
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

vacancy_embeddings = model.encode(vacancy["text"].tolist(), normalize_embeddings=True)
resume_embeddings = model.encode(resume["text"].tolist(), normalize_embeddings=True)

.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [18]:
res_vac_similarity = calc_sim(vacancy_embeddings, resume_embeddings)
res_vac_similarity_df = pd.DataFrame(res_vac_similarity)

top_10_vac = calc_top10_vac(res_vac_similarity_df)
top_10_vac_df = pd.DataFrame(top_10_vac)
top_10_vac_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,13,45,7,2,51,63,63,71,14,3,...,23,51,81,63,33,62,50,11,51,51
1,96,94,13,3,63,23,78,3,95,21,...,13,3,96,3,71,71,71,63,21,63
2,3,63,78,64,23,96,51,77,22,2,...,63,78,91,13,93,51,63,97,39,78
3,97,29,21,13,2,78,2,97,82,78,...,51,13,7,51,51,3,51,51,96,2
4,63,51,45,95,58,39,3,78,21,71,...,81,2,23,78,79,63,78,10,23,91


In [19]:
tp_minilm, tn_minilm, fp_minilm, fn_minilm, accuracy_minilm = calc_accuracy(
    top_10_vac_df, labeling
)

tp: 2
tn: 110
fp: 18
fn: 17
accuracy: 0.7619047619047619


In [20]:
precision_minilm, recall_minilm, f1_minilm = calc_f1(top_10_vac_df, labeling)

precision@10: 0.024691358024691357
recall@10: 0.12242798353909465
f1@10: 0.0401532623754846


# distiluse-base-multilingual-cased-v1

In [21]:
model = SentenceTransformer(
    "sentence-transformers/distiluse-base-multilingual-cased-v1"
)

vacancy_embeddings = model.encode(vacancy["text"].tolist(), normalize_embeddings=True)
resume_embeddings = model.encode(resume["text"].tolist(), normalize_embeddings=True)

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.45k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [22]:
res_vac_similarity = calc_sim(vacancy_embeddings, resume_embeddings)
res_vac_similarity_df = pd.DataFrame(res_vac_similarity)

top_10_vac = calc_top10_vac(res_vac_similarity_df)
top_10_vac_df = pd.DataFrame(top_10_vac)
top_10_vac_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,32,60,97,23,51,2,63,6,95,97,...,91,63,54,63,54,62,50,11,51,63
1,13,29,21,54,58,51,55,55,34,73,...,63,54,11,57,93,54,6,63,96,55
2,97,36,23,55,69,91,57,23,29,85,...,23,6,38,97,69,60,96,97,91,91
3,38,97,55,6,96,55,96,73,23,40,...,78,21,19,38,85,68,55,55,21,96
4,60,55,6,97,91,63,38,63,19,54,...,96,55,55,96,79,29,63,2,2,78


In [23]:
(
    tp_distiluse,
    tn_distiluse,
    fp_distiluse,
    fn_distiluse,
    accuracy_distiluse,
) = calc_accuracy(top_10_vac_df, labeling)

tp: 0
tn: 118
fp: 10
fn: 19
accuracy: 0.8027210884353742


In [24]:
precision_distiluse, recall_distiluse, f1_distiluse = calc_f1(top_10_vac_df, labeling)

precision@10: 0.012345679012345678
recall@10: 0.0699588477366255
f1@10: 0.020345909234798124


# LaBSE-en-ru

In [25]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")


encoded_input = tokenizer(
    vacancy["text"].tolist(),
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt",
)
with torch.no_grad():
    model_output = model(**encoded_input)
vacancy_embeddings = model_output.pooler_output
vacancy_embeddings = torch.nn.functional.normalize(vacancy_embeddings)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/516M [00:00<?, ?B/s]

In [26]:
encoded_input = tokenizer(
    resume["text"].tolist(),
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt",
)
with torch.no_grad():
    model_output = model(**encoded_input)
resume_embeddings = model_output.pooler_output
resume_embeddings = torch.nn.functional.normalize(resume_embeddings)

In [27]:
res_vac_similarity = calc_sim(vacancy_embeddings, resume_embeddings)
res_vac_similarity_df = pd.DataFrame(res_vac_similarity)

top_10_vac = calc_top10_vac(res_vac_similarity_df)
top_10_vac_df = pd.DataFrame(top_10_vac)
top_10_vac_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,56,42,6,2,51,63,56,6,7,42,...,54,6,7,56,33,62,56,11,51,63
1,62,7,17,90,63,91,57,73,6,85,...,7,2,90,19,93,19,51,42,63,91
2,29,91,19,6,2,96,88,42,43,98,...,91,91,50,88,71,98,42,63,91,51
3,57,6,7,7,6,51,51,51,23,7,...,82,92,97,57,35,56,7,88,78,2
4,94,2,42,51,91,2,10,98,87,15,...,51,51,6,80,64,60,88,36,2,78


In [29]:
tp_labse, tn_labse, fp_labse, fn_labse, accuracy_labse = calc_accuracy(
    top_10_vac_df, labeling
)

tp: 4
tn: 104
fp: 24
fn: 15
accuracy: 0.7346938775510204


In [30]:
precision_labse, recall_labse, f1_labse = calc_f1(top_10_vac_df, labeling)

precision@10: 0.0345679012345679
recall@10: 0.18004115226337447
f1@10: 0.05667172333839001


# Compare merics

In [31]:
df_metrics = pd.DataFrame(
    {
        "model": ["e5", "mpnet", "minilm", "distiluse", "labse"],
        "accuracy": [
            accuracy_e5,
            accuracy_mpnet,
            accuracy_minilm,
            accuracy_distiluse,
            accuracy_labse,
        ],
        "precision@10": [
            precision_e5,
            precision_mpnet,
            precision_minilm,
            precision_distiluse,
            precision_labse,
        ],
        "recall@10": [
            recall_e5,
            recall_mpnet,
            recall_minilm,
            recall_distiluse,
            recall_labse,
        ],
        "f1@10": [f1_e5, f1_mpnet, f1_minilm, f1_distiluse, f1_labse],
    }
)

In [37]:
df_metrics.set_index("model").style.highlight_max(color="lightgreen", axis=0)

Unnamed: 0_level_0,accuracy,precision@10,recall@10,f1@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e5,0.829932,0.007407,0.027778,0.011577
mpnet,0.789116,0.019753,0.091564,0.031736
minilm,0.761905,0.024691,0.122428,0.040153
distiluse,0.802721,0.012346,0.069959,0.020346
labse,0.734694,0.034568,0.180041,0.056672
