In [162]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
import re
import ast
#import fitz 
import psycopg2
import psycopg2.extras


import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
from tqdm import tqdm


import warnings
warnings.filterwarnings('ignore')


In [163]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hugging Face checkpoint name; used below for both the encoder and the tokenizer.
MODEL_NAME = "princeton-nlp/sup-simcse-roberta-base"
# NOTE(review): BATCH_SIZE and EPOCHS are not referenced anywhere in the visible
# notebook — presumably leftovers from the training notebook; confirm before removing.
BATCH_SIZE = 32
EPOCHS = 15
# Token limit handed to the tokenizer (padding and truncation) in get_embedding.
MAX_LENGTH = 256

## Model load

In [164]:
class SiameseSimCSE(nn.Module):
    """Pretrained transformer encoder that emits one L2-normalised vector per input.

    On construction every encoder parameter is frozen; afterwards the
    transformer layers whose index is >= int(total_layers * freeze_percentage)
    are made trainable again.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        freeze_percentage: Fraction of the transformer layers (counted from the
            bottom) that stay frozen.
    """

    def __init__(self, model_name, freeze_percentage=0.8):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

        transformer_layers = self.encoder.encoder.layer
        num_layers = len(transformer_layers)
        first_trainable = int(num_layers * freeze_percentage)

        # Start from a fully frozen encoder (embeddings included) ...
        for weight in self.encoder.parameters():
            weight.requires_grad = False

        # ... then switch gradients back on for the top layers only.
        for layer_idx in range(first_trainable, num_layers):
            for weight in transformer_layers[layer_idx].parameters():
                weight.requires_grad = True

    def forward(self, input_ids, attention_mask):
        """Return the unit-length hidden state of the first token of each sequence."""
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token = hidden_states[:, 0]
        return F.normalize(first_token, p=2, dim=1)

In [165]:
# Build the architecture, restore the fine-tuned weights on the CPU, then move
# the model to the target device exactly once.
model_load = SiameseSimCSE(MODEL_NAME)
# weights_only=True restricts unpickling to tensors/containers, so a tampered
# checkpoint file cannot execute arbitrary code while being loaded.
model_load.load_state_dict(
    torch.load("mse_base.pth", map_location=torch.device("cpu"), weights_only=True)
)
model_load.to(device)
# Inference mode: disables dropout. Left as the last expression so the cell
# still displays the module summary.
model_load.eval()

SiameseSimCSE(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

## Data preprocessing for the model

In [166]:
# Tokenizer loaded from the same checkpoint name (MODEL_NAME) as the encoder.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [167]:
from bs4 import BeautifulSoup

# All patterns are compiled once, when the cell runs, instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')

# Characters removed as "emoji/symbol noise". The ranges are kept exactly as they
# were, but note how broad they are: U+24C2-U+1F251 together with
# U+10000-U+10FFFF removes every code point from U+24C2 upwards, which includes
# CJK, Hangul and other non-Latin scripts, not only emoji.
_EMOJI_RE = re.compile("["
                       "\U0001F600-\U0001F64F"   # Emoticons
                       "\U0001F300-\U0001F5FF"   # Miscellaneous Symbols and Pictographs
                       "\U0001F680-\U0001F6FF"   # Transport and Map Symbols
                       "\U0001F1E0-\U0001F1FF"   # Regional Indicator Symbols (flags)
                       "\U00002500-\U00002BEF"   # Box Drawing .. Miscellaneous Symbols and Arrows
                       "\U00002702-\U000027B0"   # Dingbats
                       "\U000024C2-\U0001F251"   # Very broad: most of the BMP above U+24C2
                       "\U0001F926-\U0001F937"   # Gesture/people pictographs
                       "\U00010000-\U0010FFFF"   # All supplementary planes
                       "\u200d"                   # Zero Width Joiner
                       "\u2640-\u2642"            # Gender symbols
                       "\u2600-\u2B55"            # Miscellaneous symbols
                       "\u23cf\u23e9\u231a"      # Eject, fast-forward, watch
                       "\u3030"                   # Wavy dash
                       "\ufe0f\u2069\u2066"      # Variation selector-16, directional isolates
                       "\u200c\u2068\u2067"      # Zero Width Non-Joiner, directional isolates
                       "]+", flags=re.UNICODE)
_INVISIBLE_RE = re.compile(r'[\xa0\u200d\t\r\n]+')
_MULTISPACE_RE = re.compile(r'\s{2,}')
_DISALLOWED_RE = re.compile(r"[^\w\s\.,'!?]")


def preprocess_text(text):
    """Normalise raw vacancy/resume text before tokenisation.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup and keep only the text.
      2. Replace URLs with the placeholder ``<url>``.
      3. Remove emoji, symbols and every code point covered by ``_EMOJI_RE``.
      4. Turn non-breaking spaces, tabs and line breaks into single spaces.
      5. Collapse runs of whitespace and trim both ends.
      6. Drop every character that is not a word character, whitespace or one
         of ``. , ' ! ?``.

    NOTE(review): step 6 also removes the angle brackets of the ``<url>``
    placeholder, so URLs end up as the bare word ``url``. This is kept as is
    because the saved model was presumably trained on text cleaned this way —
    confirm before changing.

    Args:
        text: Raw text or HTML as a string.

    Returns:
        The cleaned string.
    """
    text = BeautifulSoup(text, "html.parser").get_text()
    text = _URL_RE.sub('<url>', text)
    text = _EMOJI_RE.sub('', text)
    text = _INVISIBLE_RE.sub(' ', text)
    text = _MULTISPACE_RE.sub(' ', text).strip()
    text = _DISALLOWED_RE.sub('', text)

    return text


### Database connection

In [168]:
class VectorDB:
    """Small wrapper around one psycopg2 connection to the matching database.

    Every public method runs inside ``with self.connection`` so the transaction
    is committed on success and rolled back when a statement fails, and inside
    ``with ...cursor()`` so the cursor is always closed. The connection itself
    stays open until ``close()`` is called (or the instance is used as a
    context manager: ``with VectorDB() as db: ...``).
    """

    def __init__(self, dbname="vectordb", user="admin", password="admin", host="localhost", port=5432):
        # NOTE(review): the defaults are hard-coded development credentials;
        # pass real values explicitly outside a local setup.
        self.connection = psycopg2.connect(
            dbname=dbname, user=user, password=password, host=host, port=port
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def load_candidates(self, df):
        """Insert one ``candidates`` row per DataFrame row.

        Args:
            df: Frame with the columns ``resume_text`` and ``resume_embeddings``.

        Returns:
            A status message with the number of rows in ``df``.
        """
        with self.connection, self.connection.cursor() as cur:
            for _, row in df.iterrows():
                cur.execute(
                    "INSERT INTO candidates (resume_text, embedding_vector) VALUES (%s, %s)",
                    (row['resume_text'], row['resume_embeddings'])
                )
        return f"{len(df)} records saved to candidates table."

    def save_feedback(self, df):
        """Insert one ``feedback`` row per DataFrame row.

        Args:
            df: Frame with the columns ``vacancy``, ``candidate`` and ``label``.

        Returns:
            A status message with the number of rows in ``df``.
        """
        with self.connection, self.connection.cursor() as cur:
            for _, row in df.iterrows():
                cur.execute(
                    """
                    INSERT INTO feedback (job_description_text, resume_text, label)
                    VALUES (%s, %s, %s)
                    """,
                    (row["vacancy"], row["candidate"], row["label"])
                )
        return f"{len(df)} records saved to feedback table."

    def insert_vacancy(self, vacancy_text, vacancy_embedding, vacancy_link=None):
        """Store a vacancy with its embedding and return the generated ``vacancy_id``."""
        with self.connection, self.connection.cursor() as cur:
            cur.execute(
                """
                INSERT INTO vacancies (vacancy_text, vacancy_link, embedding_vector)
                VALUES (%s, %s, %s)
                RETURNING vacancy_id
                """,
                (vacancy_text, vacancy_link, vacancy_embedding)
            )
            return cur.fetchone()[0]

    def find_similar_candidates(self, k, vacancy_id):
        """Return the ``k`` candidates most similar to a stored vacancy.

        Similarity is computed in SQL as ``1 - (embedding_vector <=> vacancy)``
        (presumably pgvector cosine distance — confirm against the schema).

        Returns:
            A list of ``(similarity, resume_text)`` rows ordered by descending
            similarity, or the string ``"Vacancy not found"`` when no vacancy
            has the given id.
        """
        with self.connection, self.connection.cursor() as cur:
            cur.execute(
                "SELECT embedding_vector FROM vacancies WHERE vacancy_id = %s",
                (vacancy_id,)
            )
            result = cur.fetchone()
            if not result:
                return "Vacancy not found"

            vacancy_embedding = result[0]
            cur.execute(
                """
                SELECT (1 - (embedding_vector <=> %s)) AS similarity, resume_text
                FROM candidates
                ORDER BY similarity DESC
                LIMIT %s
                """,
                (vacancy_embedding, k)
            )
            return cur.fetchall()

    def close(self):
        """Close the underlying database connection."""
        self.connection.close()


## Text extraction and preprocessing

In [169]:
def extract_text_from_url(url):
    """Download a web page and return the text of its main content area.

    Boilerplate elements (scripts, styles, header, footer, navigation, asides,
    forms, noscript) are removed first. The text is then taken from the
    ``<main>`` element; if there is none, from the ``<div>`` holding the most
    text; if the page has no ``<div>`` either, from the whole document.

    Args:
        url: Address of the page to fetch (10 second timeout).

    Returns:
        The extracted text, or a string starting with
        ``"Error during loading web-page: "`` when the request fails.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except Exception as exc:
        return f"Error during loading web-page: {str(exc)}"

    soup = BeautifulSoup(page.content, "html.parser")

    boilerplate_tags = ["script", "style", "header", "footer", "nav", "aside", "form", "noscript"]
    for node in soup(boilerplate_tags):
        node.decompose()

    def visible_length(div):
        # Amount of stripped text inside the element, used to rank the divs.
        return len(div.get_text(strip=True))

    container = soup.find("main")
    if not container:
        container = max(soup.find_all("div"), key=visible_length, default=None)

    source = container if container else soup
    return source.get_text(separator=' ', strip=True)


In [171]:
def extract_text(input_type, text_input=None, url=None, pdf_file=None):
    """Return the raw vacancy text for the chosen input source.

    Args:
        input_type: ``"from text"`` or ``"from url"``.
        text_input: Text returned unchanged when ``input_type`` is ``"from text"``.
        url: Page to download when ``input_type`` is ``"from url"``.
        pdf_file: Currently unused; the PDF branch (based on ``fitz``) is disabled.

    Returns:
        The extracted text. Failures are reported as strings too:
        ``"Invalid input type"`` for an unknown ``input_type`` and
        ``"Error fetching URL: ..."`` when the URL extraction raises.
    """
    if input_type == "from text":
        return text_input

    if input_type != "from url":
        return "Invalid input type"

    try:
        return extract_text_from_url(url)
    except Exception as exc:
        return f"Error fetching URL: {exc}"


In [177]:
def get_embedding(input_type, text_input=None, url=None, pdf_file=None):
    """Extract, clean and embed a vacancy, then store it in the database.

    Args:
        input_type: ``"from text"`` or ``"from url"``; forwarded to extract_text.
        text_input: Vacancy text for ``"from text"``.
        url: Vacancy page for ``"from url"``; also stored as the vacancy link
            (an empty string is stored when no URL is given).
        pdf_file: Forwarded to extract_text.

    Returns:
        ``(cleaned_text, vacancy_id)`` on success.
        ``(error_message, None)`` when text extraction reported a failure; in
        that case nothing is embedded or written to the database.
        The raw extraction result itself when it is not a string (e.g. ``None``).
    """
    raw_text = extract_text(input_type, text_input, url, pdf_file)

    if not isinstance(raw_text, str):
        return raw_text

    # extract_text signals failures with plain strings. Without this check the
    # error message itself would be embedded and saved as a vacancy. Text typed
    # by the user ("from text") is never treated as an error.
    extraction_errors = (
        "Error fetching URL:",
        "Error during loading web-page:",
        "Invalid input type",
    )
    if input_type != "from text" and raw_text.startswith(extraction_errors):
        return raw_text, None

    cleaned_text = preprocess_text(raw_text)

    encoded = tokenizer(
        cleaned_text,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        embedding = model_load(input_ids, attention_mask)

    # Single input, so take row 0 and convert it to a plain Python list.
    embedding = embedding[0].cpu().numpy().tolist()
    url = url if url else ''

    # Open the connection only for the insert and always release it again;
    # previously every call left one connection open.
    db = VectorDB()
    try:
        vacancy_id = db.insert_vacancy(cleaned_text, embedding, url)
    finally:
        db.close()

    return cleaned_text, vacancy_id


In [58]:
# NOTE(review): example vacancy URL; `link` is not referenced anywhere else in the visible notebook.
link = 'https://jobs.develops.today/jobs/manual-qa-engineer'

## API

In [178]:
# NOTE(review): `stored_candidates` is never read or modified elsewhere in the
# visible notebook — looks like a leftover; confirm before removing.
stored_candidates = []

def interface_submit_vacancy(input_type, text_input, url_input):
    """UI callback: embed and store a vacancy, and report what happened.

    Only the input matching ``input_type`` is forwarded to get_embedding.

    Returns:
        ``(cleaned_text, vacancy_id, status_message)``. When get_embedding does
        not return a ``(text, id)`` pair, or the id is missing, the status says
        so instead of claiming success (the old code crashed while unpacking,
        or always reported success).
    """
    result = get_embedding(
        input_type=input_type,
        text_input=text_input if input_type == "from text" else None,
        url=url_input if input_type == "from url" else None,
    )

    # get_embedding hands back the raw extraction result (e.g. None) when no
    # text could be obtained; that cannot be unpacked into two values.
    if not (isinstance(result, tuple) and len(result) == 2):
        return "", "", "No vacancy text could be extracted from the input"

    cleaned_text, vacancy_id = result
    if vacancy_id is None:
        # Without an id nothing was stored, so do not report success.
        return cleaned_text, "", "Vacancy was not saved"

    return cleaned_text, vacancy_id, "Data has been successfully encoded"

def interface_find_candidates(k, vacancy_id):
    """UI helper: fetch the top-``k`` candidates for a stored vacancy.

    Args:
        k: Number of candidates requested (converted with ``int``).
        vacancy_id: Id of a stored vacancy; may arrive as text from the UI.

    Returns:
        A list of ``(similarity rounded to 4 decimals, resume_text)`` tuples.
        The list is empty when ``k`` is not positive, when no vacancy id was
        supplied, or when the vacancy does not exist.
    """
    k = int(k)

    # Nothing to look up: avoid sending a blank id or a non-positive LIMIT to SQL.
    if k <= 0 or vacancy_id is None or not str(vacancy_id).strip():
        return []

    db = VectorDB()
    try:
        candidates = db.find_similar_candidates(k, vacancy_id)
    finally:
        # Release the connection; previously every lookup leaked one.
        db.close()

    # find_similar_candidates reports a missing vacancy as a plain string;
    # iterating that string as rows used to raise ValueError.
    if isinstance(candidates, str):
        return []

    return [(round(sim, 4), text) for sim, text in candidates]

def save_labels(cleaned_text, *values):
    """UI callback: store the Fit/No Fit assessments for the shown candidates.

    Args:
        cleaned_text: Vacancy text the candidates were matched against.
        *values: All labels followed by all candidate texts, i.e. the first
            half are labels and the second half the matching candidates (the
            UI passes 10 of each).

    Returns:
        The status message from ``VectorDB.save_feedback``; when no candidate
        text is present the same "0 records" message is returned without
        opening a database connection.
    """
    # Derive the slot count from the input instead of assuming exactly 10,
    # which raised IndexError whenever fewer values were passed.
    half = len(values) // 2
    labels, candidates = values[:half], values[half:half * 2]

    data = [
        {"vacancy": cleaned_text, "candidate": candidate, "label": label}
        for label, candidate in zip(labels, candidates)
        # Empty slots arrive as "" (or None) and are skipped.
        if candidate and candidate.strip()
    ]

    if not data:
        return "0 records saved to feedback table."

    db = VectorDB()
    try:
        return db.save_feedback(pd.DataFrame(data))
    finally:
        # Release the connection; previously every save leaked one.
        db.close()

In [182]:
import gradio as gr

# Number of candidate slots rendered in the UI. Every slot (column, similarity
# slider, text box, Fit/No Fit radio) is wired into the find and save handlers,
# so this single constant replaces the literal 10 that was repeated below.
MAX_CANDIDATES = 10

with gr.Blocks() as demo:
    gr.Markdown("## Candidate Matcher")

    # --- Step 1: submit a vacancy (as text or as a URL) -----------------------
    input_type = gr.Radio(choices=["from url", "from text"], label="Input type", value="from text")

    with gr.Row():
        text_input = gr.Textbox(label="Type/insert vacancy text here", lines=8, visible=True)
        url_input = gr.Textbox(label="Insert vacancy URL here", visible=False)

    generate_btn = gr.Button("Get embedding")
    cleaned_text_output = gr.Textbox(label='Cleaned text', lines=8, visible=False)
    # NOTE(review): embedding_output is created but never used as an input or output.
    embedding_output = gr.Textbox(visible=False)
    status_output = gr.Textbox(visible=True, interactive=False, label="Embedding status")
    vacancy_id_output = gr.Textbox(label="Vacancy ID")

    def toggle_inputs(choice):
        """Show the text box or the URL box; the cleaned text is shown only for URLs."""
        return (
            gr.update(visible=(choice == "from text")),
            gr.update(visible=(choice == "from url")),
            gr.update(visible=(choice == "from url"))
        )

    input_type.change(toggle_inputs, inputs=[input_type], outputs=[text_input, url_input, cleaned_text_output])

    generate_btn.click(
        interface_submit_vacancy,
        inputs=[input_type, text_input, url_input],
        outputs=[cleaned_text_output, vacancy_id_output, status_output]
    )

    # --- Step 2: retrieve the most similar candidates -------------------------
    gr.Markdown("## Find similar candidates")
    k_input = gr.Number(label="Number of top-k candidates", value=5, precision=0)
    find_btn = gr.Button("Find Candidates")

    candidate_rows = []
    similarity_bars = []
    candidate_boxes = []
    label_radios = []

    # Pre-build all slots hidden; they are revealed as results come in.
    for i in range(MAX_CANDIDATES):
        with gr.Column(visible=False) as column:
            similarity = gr.Slider(minimum=0, maximum=1, step=0.0001, label=f"Similarity {i+1}", interactive=False)
            text = gr.Textbox(label=f"Candidate {i+1}", lines=5, interactive=False)
            radio = gr.Radio(choices=["Fit", "No Fit"], label="Assessment", value="Fit", interactive=True)
            similarity_bars.append(similarity)
            candidate_boxes.append(text)
            label_radios.append(radio)
            candidate_rows.append(column)

    def update_candidate_outputs(k, vacancy_id):
        """Fill the first len(results) slots and hide/reset the remaining ones.

        Returns four updates per slot, in the order column, slider, text, radio,
        matching the `outputs` list of the click handler below.
        """
        results = interface_find_candidates(k, vacancy_id)
        updates = []

        for slot in range(MAX_CANDIDATES):
            if slot < len(results):
                sim, text = results[slot]
                updates += [
                    gr.update(visible=True),  # column
                    gr.update(value=sim),     # similarity slider
                    gr.update(value=text),    # resume text
                    gr.update(value="Fit")    # Fit/No Fit
                ]
            else:
                updates += [
                    gr.update(visible=False),
                    gr.update(value=0.0),
                    gr.update(value=""),
                    gr.update(value="Fit")
                ]
        return updates

    find_btn.click(
        update_candidate_outputs,
        inputs=[k_input, vacancy_id_output],
        # Flattened per slot: column, slider, text box, radio.
        outputs=[
            component
            for slot_components in zip(candidate_rows, similarity_bars, candidate_boxes, label_radios)
            for component in slot_components
        ]
    )

    # --- Step 3: persist the Fit/No Fit assessments ----------------------------
    gr.Markdown("## Save Fit/No Fit results")
    save_btn = gr.Button("Save Feedback")
    saved_results_output = gr.Textbox(label="Saved Results Status", lines=1)


    save_btn.click(
        save_labels,
        # Order matters: the vacancy text, then all labels, then all candidate texts.
        inputs=[cleaned_text_output] + label_radios + candidate_boxes,
        outputs=saved_results_output
    )

# debug=True keeps the cell running and prints server errors into the notebook.
demo.launch(debug=True)


* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


