In [None]:
!pip install beautifulsoup4
!pip install requests
!pip install tqdm
!pip install transformers
!pip install nltk
!pip install torch
!pip install transformers


In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import re
from tqdm import tqdm
import json
from transformers import BertTokenizer, BertModel
import torch

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus used by remove_stopwords().
nltk.download('stopwords')

# File the scraped vacancies (including their embeddings) are written to.
output_filename = 'vacancies.pt'

# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [None]:


# Root URL of the vacancy catalogue; listing page URLs are built from it.
base_url = "https://careers.itmo.ru/catalog/"

# Accumulator for scraped vacancy records, and the number of the listing
# page to request next (pagination starts at 1).
vacancies, current_page = [], 1

def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-separated text.

    Words are compared case-insensitively (via ``word.lower()``) against the
    stop-word collection; surviving words keep their original casing and are
    re-joined with single spaces.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Optional collection of lower-case words to remove. When
            omitted, the NLTK Russian stop-word list is used.

    Returns:
        The text with stop words removed, words separated by one space.
    """
    if stop_words is None:
        # Load the NLTK list once and reuse it on later calls; this function
        # is invoked for every scraped vacancy.
        cached = getattr(remove_stopwords, "_russian_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('russian'))
            remove_stopwords._russian_stop_words = cached
        stop_words = cached

    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

def get_bert_embeddings(text):
    """Embed a text with the module-level BERT tokenizer and model.

    The text is tokenized into PyTorch tensors (truncated to at most 512
    tokens), passed through the model without gradient tracking, and the
    last hidden state is averaged over dimension 1.

    Args:
        text: The string to embed.

    Returns:
        A tensor holding the mean of ``last_hidden_state`` along ``dim=1``
        (presumably shape ``(1, hidden_size)`` for a single string — confirm).
    """
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states.mean(dim=1)


def _element_text(parent, tag, css_class):
    """Return the stripped text of the first matching child, or '' if there is none."""
    element = parent.find(tag, class_=css_class)
    return element.text.strip() if element is not None else ""


# Seconds to wait for the server before giving up on a listing page;
# without a timeout a stalled connection would block the notebook forever.
REQUEST_TIMEOUT = 30

# Running identifier assigned to each collected vacancy.
index = 0

# Pages 1..48 are requested; the bound is hard-coded
# (presumably the catalogue size when this was written — verify before re-running).
while current_page<49:
    url = f"{base_url}page-{current_page}.html"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # Treat network failures like a bad status code: report and stop,
        # keeping whatever has been collected so far.
        print("Ошибка при получении страницы:", error)
        break

    if response.status_code != 200:
        print("Ошибка при получении страницы:", response.status_code)
        break

    soup = BeautifulSoup(response.content, "html.parser")

    for job_item in soup.find_all("div", class_="jobs-item"):
        # A card lacking one of these elements yields an empty string for
        # that field instead of aborting the whole scrape.
        title = _element_text(job_item, "h6", "title")
        company = _element_text(job_item, "span", "meta")
        description = _element_text(job_item, "p", "description")

        # Flatten the description onto one line and collapse runs of spaces.
        description = description.replace('\n', ' ')
        description = re.sub(' +', ' ', description)

        vacancy_info = {
            "Index": index,
            "Название вакансии": [title],
            "Компания": [company],
            "Описание": [description],
            # Key spelling is kept unchanged so anything reading the saved
            # records by this key keeps working.
            "Embeded": get_bert_embeddings(remove_stopwords(description))
        }
        vacancies.append(vacancy_info)
        index+=1

    current_page += 1



In [None]:
# Serialize the collected vacancy records (with their embedding tensors) to disk.
torch.save(obj=vacancies, f=output_filename)