# Подготовка данных

In [100]:
!git clone https://github.com/abdullinilgiz/LLMmatch

Cloning into 'LLMmatch'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 40 (delta 18), reused 37 (delta 18), pack-reused 0[K
Receiving objects: 100% (40/40), 585.05 KiB | 2.42 MiB/s, done.
Resolving deltas: 100% (18/18), done.


In [8]:
%cd LLMmatch
!git pull
%cd ..

/home/eugene/Documents/CV-enhancement/LLMmatch
Already up to date.
/home/eugene/Documents/CV-enhancement


In [9]:
import sys

# Make modules under `src` importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if "src" not in sys.path:
    sys.path.append("src")

In [14]:
from importlib import reload
import utils

# Re-import `utils` so that edits made to the module on disk are picked up
# without restarting the kernel. As the last expression of the cell, the
# reloaded module object is also displayed.
reload(utils)

<module 'utils' from '/home/eugene/Documents/CV-enhancement/src/utils.py'>

In [15]:
from utils import save_matches

In [16]:
import os  # imported here so the cell also works on a fresh kernel

# Every "<job>.txt" file in the matches directory names one job. Only the
# trailing extension is stripped (str.replace would remove *every* ".txt"
# occurrence), and the listing is sorted so the processing order is stable.
MATCHES_DIR = "LLMmatch/data/matches"
jobs = [
    os.path.splitext(fname)[0]
    for fname in sorted(os.listdir(MATCHES_DIR))
    if fname.endswith(".txt")
]
for job in jobs:
    save_matches(job)

# Загрузка модели

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint to load. Alternative checkpoint:
#   MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# Half-precision weights, loaded with reduced peak CPU memory.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Move the model to the GPU; the trailing ";" suppresses the module repr.
model.to("cuda");

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Пайплайн

In [3]:
import os
import re
import json
import logging
import random
import string


def make_id(length=6):
    """Return a random identifier made of lowercase ASCII letters and digits.

    Each character is drawn independently and uniformly from ``a-z0-9``.
    The previous implementation built its pool from ``range(0, 9)``, so the
    digit ``9`` could never appear, and sampled without replacement, so a
    letter could never repeat.

    Args:
        length: Number of characters in the identifier (default 6).

    Returns:
        A string of ``length`` characters.
    """
    alphabet = string.ascii_lowercase + string.digits
    return "".join(random.choices(alphabet, k=length))


class CVEnhancer:
    """Generate CV-improvement advice for a CV/vacancy pair with a causal LM.

    The prompt is assembled from a system prompt file, a user prompt template
    (formatted with ``cv`` and ``job``) and a fixed opening of the assistant's
    answer. The text generated after that opening is extracted and, optionally,
    appended to a JSON report file keyed by a random identifier.

    Args:
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        system_prompt_path: Text file with the system prompt.
        user_prompt_path: Text file with the user prompt template containing
            ``{cv}`` and ``{job}`` placeholders.
        report_path: JSON file that accumulates saved reports.
        max_len: Value passed to ``generate`` as ``max_new_tokens``.
        model_name: Name stored in saved reports. When ``None`` the global
            ``MODEL_NAME`` is used, as before.
    """

    def __init__(
        self,
        model,
        tokenizer,
        system_prompt_path="prompts/system_prompt.txt",
        user_prompt_path="prompts/user_prompt.txt",
        report_path="evaluation/report.json",
        max_len=100000,
        model_name=None,
    ):
        # NOTE(review): "assiatant" is a typo, but this string is model input;
        # it is kept byte-identical so that results stay comparable.
        self.ASSISTANT_MESSAGE_START = "\nAssistant:Hello! As a career assiatant, I am glad to help you. I will give you a detailed review of your CV and give some advice to help you improve it so that it will fit the job better."
        self.RESPONSE_REGEXP = self.__get_response_regexp()
        # NOTE(review): the template already starts with "<s>"; the tokenizer
        # may add its own BOS token as well - confirm.
        self.TEMPLATE = "<s>[INST] <<SYS>> {system_prompt} <</SYS>> {user_message} [/INST]{assistant_message}"
        self.model = model
        self.tokenizer = tokenizer
        # Explicit UTF-8 so the prompts are read the same way on every platform.
        with open(system_prompt_path, encoding="utf-8") as f:
            self.system_prompt = f.read()
        with open(user_prompt_path, encoding="utf-8") as f:
            self.user_prompt = f.read()
        self.report_path = report_path
        # Root logger, switched to INFO (same side effect as before).
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)
        self.max_len = max_len
        self.model_name = model_name

    def __get_response_regexp(self):
        """Compile a regexp matching everything after the assistant opening.

        The opening (without newlines) is used as a look-behind, so the match
        itself contains only the text that follows it. ``re.escape`` keeps the
        look-behind literal and fixed-width whatever characters the opening
        contains.
        """
        marker = self.ASSISTANT_MESSAGE_START.replace("\n", "")
        return re.compile(f"(?<={re.escape(marker)})" + r"[\S\s]*")

    def make_prompt(self, cv_path, job_path):
        """Read the CV and vacancy files and return the full prompt string."""
        with open(cv_path, encoding="utf-8") as f:
            cv = f.read()
        with open(job_path, encoding="utf-8") as f:
            job = f.read()
        user_prompt = self.user_prompt.format(cv=cv, job=job)
        return self.TEMPLATE.format(
            system_prompt=self.system_prompt,
            user_message=user_prompt,
            assistant_message=self.ASSISTANT_MESSAGE_START,
        )

    def generate_text(self, text, **generation_kwargs):
        """Tokenize ``text``, run ``model.generate`` and decode the first sequence.

        Extra keyword arguments are forwarded to ``generate``.
        """
        # Put the inputs on the model's own device; fall back to the previous
        # hard-coded "cuda" when the model does not expose one.
        device = getattr(self.model, "device", "cuda")
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(device)
        generated = self.model.generate(
            input_ids,
            pad_token_id=self.tokenizer.eos_token_id,
            max_new_tokens=self.max_len,
            **generation_kwargs,
        )
        return self.tokenizer.decode(generated[0])

    def process(self, llm_output):
        """Remove end-of-sequence markers ("</s>") from the decoded text."""
        return llm_output.replace("</s>", "")

    def extract(self, llm_output):
        """Return the text following the assistant opening, or "" if absent."""
        match = self.RESPONSE_REGEXP.search(self.process(llm_output))
        if match:
            return match.group(0).lstrip()
        return ""

    def _load_reports(self):
        """Return the reports stored at ``report_path``, or ``{}`` if missing.

        Self-contained replacement for the external ``load_or_create_json``
        helper, which no visible cell defines or imports.
        """
        if os.path.exists(self.report_path):
            with open(self.report_path, encoding="utf-8") as jf:
                return json.load(jf)
        return {}

    def save_report(self, llm_output, cv_path, job_path, **generation_kwargs):
        """Append one report (inputs, generation settings, output) to the JSON file."""
        reports = self._load_reports()
        new_report = {
            "cv": cv_path,
            "job": job_path,
            "system_prompt": self.system_prompt,
            **generation_kwargs,
            "model": self.model_name if self.model_name is not None else MODEL_NAME,
            "output": llm_output,
        }
        # Ids are random: retry on a collision instead of overwriting a report.
        report_id = make_id()
        while report_id in reports:
            report_id = make_id()
        reports[report_id] = new_report
        report_dir = os.path.dirname(self.report_path)
        if report_dir:
            os.makedirs(report_dir, exist_ok=True)
        with open(self.report_path, "w", encoding="utf-8") as jf:
            json.dump(reports, jf)

    def enhance(self, cv_path, job_path, save_report=True, **generation_kwargs):
        """Build the prompt, generate, extract the answer and optionally save it.

        Returns:
            The extracted answer text ("" when the assistant opening is not
            found in the decoded output).
        """
        prompt = self.make_prompt(cv_path, job_path)
        llm_output = self.generate_text(prompt, **generation_kwargs)
        llm_output = self.extract(llm_output)
        if save_report:
            self.save_report(llm_output, cv_path, job_path, **generation_kwargs)
        return llm_output

In [6]:
# NOTE(review): this rebinds `CVEnhancer` to the version in the `src.enhancer`
# module, shadowing the class defined in the cell above - confirm which one
# the following cells are meant to use.
from src.enhancer import CVEnhancer

In [9]:
# Single greedy-decoding run for one CV/vacancy pair.
# NOTE(review): MODEL_NAME is passed as the third positional argument; in the
# CVEnhancer signature defined in this notebook that slot is
# `system_prompt_path` - confirm that `src.enhancer.CVEnhancer` expects a
# model name there.
enhancer = CVEnhancer(model, tokenizer, MODEL_NAME)
generation_kwargs = dict(do_sample=False, temperature=1)
advice = enhancer.enhance(
    cv_path="data/CVs/sys_analitic.txt",
    job_path="data/Vacancies/sys_analitic/1.txt",
    save_report=True,
    **generation_kwargs,
)
print(advice)

First, let's take a look at how well your CV matches the job description.

The job description is looking for a System Analyst with 3-6 years of experience. However, your CV shows that you have only 4 years and 11 months of experience. This means that you may not be the most suitable candidate for this job.

In addition, the job description requires experience in IT-related development, but your CV does not mention any experience in this area. This means that you may need to acquire some relevant skills and experience in order to apply for this job.

Now, let's take a closer look at your CV and see what can be improved.

Work Experience:

Your CV shows that you have 4 years and 11 months of experience as a System Analyst. This is a good start, but it would be helpful to provide more specific details about your experience. For example, you could mention any specific projects you have worked on, any challenges you faced, and how you overcame them.

You could also highlight any skills or 

# Эксперименты

In [5]:
import gc
import torch
from tqdm.notebook import tqdm


def cleanup():
    """Collect garbage, then release cached CUDA memory.

    The collector runs first so that memory of tensors it frees is already
    unoccupied when the CUDA cache is emptied; in the opposite order that
    memory would stay in the cache until the next call. The CUDA call is
    skipped when no GPU is available.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def experiment(enhancer, jobs, temperatures=(1.0, 1.25, 1.5, 1.75, 2.0)):
    """Run the enhancer over every job's vacancies for each decoding setting.

    Settings are one greedy run (``do_sample=False``) followed by one sampling
    run per value in ``temperatures``. For each job, the CV is
    ``data/CVs/<job>.txt`` and the vacancies are the ``.txt`` files in
    ``data/Vacancies/<job>``. Every generation is saved to the report, and
    ``cleanup()`` is called after each one. The progress bar advances once
    per (setting, job) pair.
    """
    greedy = {"do_sample": False, "temperature": 1.0}
    sampled = [{"do_sample": True, "temperature": t} for t in temperatures]
    settings = [greedy] + sampled

    with tqdm(total=len(settings) * len(jobs)) as pbar:
        for generation_kwargs in settings:
            for job in jobs:
                cv_path = os.path.join("data", "CVs", f"{job}.txt")
                vacancy_dir = os.path.join("data", "Vacancies", job)
                vacancy_names = [
                    name for name in os.listdir(vacancy_dir) if name.endswith(".txt")
                ]
                for name in vacancy_names:
                    vacancy_path = os.path.join(vacancy_dir, name)
                    pbar.set_description(f"{vacancy_path} {generation_kwargs}")
                    enhancer.enhance(
                        cv_path, vacancy_path, save_report=True, **generation_kwargs
                    )
                    cleanup()
                # One tick per job, after all of its vacancies are processed.
                pbar.update(1)

In [6]:
# Run garbage collection and empty the CUDA cache before the experiment cell below.
cleanup()

In [7]:
# Cap generation at 1000 new tokens per answer for the experiment sweep.
enhancer = CVEnhancer(model, tokenizer, max_len=1000)
# Job identifiers are the names of the "<job>.txt" files in the matches directory.
jobs = [
    job.replace(".txt", "")
    for job in os.listdir("LLMmatch/data/matches")
    if job.endswith(".txt")
]
temperatures = (1.0, 1.25, 1.5, 1.75, 2.0)
# Pass the variable defined above instead of repeating the literal tuple, so
# the two cannot drift apart when the temperature grid is edited.
experiment(enhancer, jobs, temperatures=temperatures)

  0%|          | 0/36 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


# Оценка

In [11]:
import os
import json
import random
from IPython.display import clear_output


class Evaluator:
    """Interactive tool for manually scoring generated reports.

    Reports are read from ``report_path``. Each annotator's scores are stored
    in ``<eval_dir>/<annotator>_eval.json`` and written after every scored
    report, so an interrupted session can be resumed: already scored report
    ids are skipped by ``run``.

    Args:
        report_path: JSON file with the reports to evaluate.
        eval_dir: Directory holding per-annotator result files.
        guidelines_path: JSON file mapping "<cv> & <job>" to a guideline text.
        urls_path: JSON file mapping CV / vacancy paths to URLs.
        shuffle: Whether ``run`` presents the reports in random order.
        annotator: Annotator name; when ``None`` it is asked for interactively.
    """

    def __init__(
        self,
        report_path="evaluation/report.json",
        eval_dir="evaluation/results",
        guidelines_path="evaluation/guidelines.json",
        urls_path="evaluation/urls.json",
        shuffle=True,
        annotator=None,
    ):
        if annotator is None:
            annotator = input("Enter your name")
        self.annotator = annotator
        self.report_path = report_path
        self.eval_path = os.path.join(eval_dir, f"{self.annotator}_eval.json")
        self.reports = self.load(self.report_path)
        self.evaluated = self.load(self.eval_path)
        # criterion name -> (minimum score, maximum score), both inclusive
        self.CRITERIA = {
            "adequacy": (0, 5),
            "reliability": (0, 5),
            "usefulness": (0, 5),
            "honesty": (0, 5),
            "linguistic correctness": (0, 5),
            "coherence": (0, 5),
        }
        with open(guidelines_path, encoding="utf-8") as jf:
            self.guidelines = json.load(jf)
        with open(urls_path, encoding="utf-8") as jf:
            self.urls = json.load(jf)
        self.shuffle = shuffle

    def load(self, path):
        """Return the JSON content of ``path``, or ``{}`` if it does not exist."""
        if os.path.exists(path):
            with open(path, encoding="utf-8") as jf:
                return json.load(jf)
        return {}

    def _ask_score(self, criterion, min_score, max_score, ask=None):
        """Prompt until an integer within ``[min_score, max_score]`` is entered.

        A typo no longer aborts the session with ``ValueError``, and
        out-of-range scores are rejected instead of being stored.

        Args:
            ask: Callable taking the prompt and returning the answer string;
                defaults to the builtin ``input`` (resolved at call time).
        """
        if ask is None:
            ask = input
        prompt = f"{criterion} ({min_score}-{max_score})"
        while True:
            answer = ask(prompt)
            try:
                score = int(answer)
            except ValueError:
                print(f"Please enter an integer between {min_score} and {max_score}.")
                continue
            if min_score <= score <= max_score:
                return score
            print(f"Please enter an integer between {min_score} and {max_score}.")

    def evaluate_report(self, report_id):
        """Show one report, collect a score per criterion and save the result."""
        report = self.reports[report_id]
        clear_output()

        print(f"CV: {report['cv']}\nVacancy:{report['job']}\n")

        guideline = self.guidelines.get(f"{report['cv']} & {report['job']}", "")
        cv_url = self.urls.get(report["cv"])
        vacancy_url = self.urls.get(report["job"])
        if cv_url:
            print(f"CV URL: {cv_url}")
        if vacancy_url:
            print(f"VACANCY URL: {vacancy_url}")

        if guideline:
            print(f"guidelines: {guideline}\n")

        print(report["output"])

        scores = {}
        for criterion, (min_score, max_score) in self.CRITERIA.items():
            scores[criterion] = self._ask_score(criterion, min_score, max_score)
        self.evaluated[report_id] = {**report, **scores}

        # Create the results directory on first use so the write cannot fail
        # after the annotator has already entered all scores.
        eval_dir = os.path.dirname(self.eval_path)
        if eval_dir:
            os.makedirs(eval_dir, exist_ok=True)
        with open(self.eval_path, "w", encoding="utf-8") as jf:
            json.dump(self.evaluated, jf)

    def run(self):
        """Evaluate every report that this annotator has not scored yet."""
        report_ids = list(self.reports.keys())
        if self.shuffle:
            random.shuffle(report_ids)
        for report_id in report_ids:
            if report_id not in self.evaluated:
                self.evaluate_report(report_id)
        print("All texts were evaluated. Thank you!")

In [55]:
# Start an interactive annotation session; the constructor asks for the
# annotator's name, then each not-yet-scored report is shown for scoring.
Evaluator().run()

CV: data/CVs/electro.txt
Vacancy:data/Vacancies/electro/1.txt

VACANCY URL: (https://novosibirsk.hh.ru/vacancy/89244253?from=vacancy_search_list&hhtmFrom=vacancy_search_list&query=%D0%BA%D0%B0%D0%B1%D0%B5%D0%BB%D1%8C%D1%89%D0%B8%D0%BA)
guidelines: Оценка 8/10: Нет группы допуска V

CV Analysis:
1. Experience: This section of your CV shows your background in the field of cable connections, electronics and maintenance of telecommunication systems. Based on the information provided, it seems that you have more than 4 years of relevant experience in your field, specifically working as a "Кабельщик-спайщик" for at least two different employers, "МГТС"  Moscow Global Information Technologies & Systematics  and "Спецстрой России" which indicates a strong potential background that will match well with the job description.
2. Job Description: The job you have your sights set on, that of an electronically skilled network technician, primarily concerns with installations, masonry works and electr

KeyboardInterrupt: Interrupted by user

In [6]:
import os
import json
import pandas as pd


def aggregate_evaluations(eval_dir="evaluation/results"):
    """Average the annotators' scores per (model, do_sample, temperature).

    Every ``*.json`` file in ``eval_dir`` is read as a mapping from report id
    to a scored report. All scored reports are pooled and grouped by
    ``model``, ``do_sample`` and ``temperature``.

    Args:
        eval_dir: Directory with the per-annotator result files.

    Returns:
        A DataFrame indexed by the three grouping keys with an ``id`` column
        (number of scored reports in the group) and the mean of each
        criterion, rounded to two decimals. When no scored reports are found,
        an empty DataFrame with those columns is returned instead of raising.
    """
    criteria = [
        "adequacy",
        "reliability",
        "usefulness",
        "honesty",
        "linguistic correctness",
        "coherence",
    ]
    group_keys = ["model", "do_sample", "temperature"]

    records = []
    # Sorted so the annotator index assigned to each file is stable across runs.
    fnames = sorted(f for f in os.listdir(eval_dir) if f.endswith(".json"))
    for annotator_idx, fname in enumerate(fnames):
        with open(os.path.join(eval_dir, fname), encoding="utf-8") as jf:
            evals = json.load(jf)
        for report_id, report in evals.items():
            records.append({"id": report_id, "annotator": annotator_idx, **report})

    if not records:
        # Without rows there are no grouping columns and groupby would raise.
        return pd.DataFrame(columns=["id", *criteria])

    aggregations = {"id": "count", **{criterion: "mean" for criterion in criteria}}
    return pd.DataFrame(records).groupby(group_keys).agg(aggregations).round(2)

In [7]:
# Aggregate all annotators' scores and highlight the column-wise maximum.
# NOTE(review): the `id` column (a count, not a score) is highlighted as well.
results = aggregate_evaluations()
results.style.highlight_max(axis=0, color="green")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,adequacy,reliability,usefulness,honesty,linguistic correctness,coherence
model,do_sample,temperature,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
meta-llama/Llama-2-7b-chat-hf,False,1.0,4,4.75,4.25,4.5,5.0,5.0,4.0
meta-llama/Llama-2-7b-chat-hf,True,1.0,4,3.75,3.5,4.0,4.25,5.0,4.5
meta-llama/Llama-2-7b-chat-hf,True,1.25,6,2.83,4.33,2.5,5.0,5.0,4.83
meta-llama/Llama-2-7b-chat-hf,True,1.5,4,2.75,2.75,3.25,3.75,3.0,3.0
meta-llama/Llama-2-7b-chat-hf,True,1.75,4,3.25,3.5,3.75,4.0,3.25,2.25
meta-llama/Llama-2-7b-chat-hf,True,2.0,6,2.5,2.33,2.5,3.17,2.17,1.83
mistralai/Mistral-7B-Instruct-v0.1,False,1.0,3,3.67,4.33,3.33,4.67,5.0,4.0
mistralai/Mistral-7B-Instruct-v0.1,True,1.0,2,3.5,4.0,4.0,5.0,5.0,5.0
mistralai/Mistral-7B-Instruct-v0.1,True,1.25,1,0.0,0.0,0.0,0.0,0.0,0.0
mistralai/Mistral-7B-Instruct-v0.1,True,1.5,3,1.67,1.33,1.67,2.0,3.0,2.33
