## Imports

In [1]:
import os
import sys
import json
import gc
import re
import json
from tqdm import notebook as tqdm
import pickle
import itertools
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import httpx
import logging

from llm_input import top_tokens_lvl0, top_tokens_lvl1, gpt_output_lvl0, gpt_output_lvl1

OLLAMA_CONFIG = {
    "keep_alive": "5m",
    "stream": False,
}

SEP_TOKEN = '<sep>'

gpt_output_lvl0 = np.array(list(gpt_output_lvl0.items()))
gpt_output_lvl1 = np.array(list(gpt_output_lvl1.items()))

In [2]:
sns.set(font_scale=1.2, palette='Set2')

plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['lines.markersize'] = 12
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 24
plt.rcParams['legend.fontsize'] = 24
plt.rcParams['axes.titlesize'] = 30
plt.rcParams['axes.labelsize'] = 24
plt.rcParams["figure.figsize"] = (12, 7)

SEED = 123

## Text preprocessing

–°–º–æ—Ç—Ä–∏–º –Ω–∞ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —Å–µ–∫—Ü–∏–π –≤–Ω—É—Ç—Ä–∏ —Ç–µ–∫—Å—Ç–∞.

–°–µ–∫—Ü–∏—è -- –≤—Å—ë, —á—Ç–æ –æ–±—ë—Ä–Ω—É—Ç–æ –≤ `==`

–ò–∑ –ø—Ä–∏–º–µ—Ä–æ–≤ –≤—ã—à–µ: `== –ú–∞–π–Ω–¥–º–∞–ø ==`, `== –ê–Ω–Ω–æ—Ç–∞—Ü–∏—è ==`, `== –í–∏–¥–µ–æ ==`

–î–∞–ª–µ–µ —Ñ–∏–ª—å—Ç—Ä—É–µ–º –ø–æ —á–∏—Å–ª—É –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –≤ —Å–µ–∫—Ü–∏–∏: –∏—Ö –¥–æ–ª–∂–Ω–æ –±—ã—Ç—å –Ω–µ –º–µ–Ω–µ–µ 20.

–í –∫–æ–Ω—Ü–µ —Å–æ—Ä—Ç–∏—Ä—É–µ–º –ø–æ —á–∏—Å–ª—É –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –≤ —Å–µ–∫—Ü–∏–∏.

In [3]:
with open('../data/in/0x1tv-dataset.pickle', 'rb') as fd:
    data = pickle.load(fd)

–û—Å—Ç–∞–≤–ª—è–µ–º 3 –¥–æ–º–µ–Ω–∞: –∑–∞–≥–æ–ª–æ–≤–æ–∫, –∞–Ω–Ω–æ—Ç–∞—Ü–∏—è –∏ —Ç–µ–∑–∏—Å—ã

In [4]:
def get_min_position(*positions):
    return min([pos for pos in positions if pos >= 0], default=len(data))

def extract_section_text(data, section_name):
    if section_name == '–ê–Ω–Ω–æ—Ç–∞—Ü–∏—è':
        blockquote_pattern = r'<blockquote>(.*?)</blockquote>'
        match = re.search(blockquote_pattern, data, re.DOTALL)
        return match.group(1).strip() if match else None
    
    section_pattern = fr'==\s*{re.escape(section_name)}\s*=='
    section_start = re.search(section_pattern, data)
    
    if not section_start:
        return None
   
    position = section_start.end()

    # Look for stop tokens and the start of next sections
    stop_tokens = ['{{----}}', '{{LinksSection}}', '== –ü—Ä–∏–º–µ—á–∞–Ω–∏—è –∏ –æ—Ç–∑—ã–≤—ã ==']
    ends = [data.find(token, position) for token in stop_tokens]
    end_position = get_min_position(*ends)
    
    if end_position != len(data):
        return data[position:end_position].strip()
    
    return data[position:].strip()

def extract_sections(data):
    sections = ['–ê–Ω–Ω–æ—Ç–∞—Ü–∏—è', 'Thesis', '–¢–µ–∑–∏—Å—ã', '–†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–µ —Ç–µ–∑–∏—Å—ã']
    extracted_texts = {section: extract_section_text(data, section) for section in sections}
    return extracted_texts


def coalesce(dct, keys):
    for key in keys:
        if dct[key] is not None or len(dct[key]) > 0:
            return dct[key]
    return None


def parse_document(doc):
    parsed_doc = dict()
    parsed_doc['title'] = doc['title']
    
    sections_data = extract_sections(doc['text'])
    parsed_doc['annotation'] = sections_data['–ê–Ω–Ω–æ—Ç–∞—Ü–∏—è']
    parsed_doc['thesis'] = coalesce(sections_data, ['Thesis', '–¢–µ–∑–∏—Å—ã', '–†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–µ —Ç–µ–∑–∏—Å—ã'])
    
    return parsed_doc

–ü–∞—Ä—Å–∏–º –§–ò–û –¥–æ–∫–ª–∞–¥—á–∏–∫–æ–≤

In [20]:
def speaker_detect(text):
    speakers = re.findall(r'(?<={{Speaker\|).*(?=}})', text)
    preproc_speakers = []
    for speaker in speakers:
        preproc_speakers.extend(speaker.split('|'))
    return preproc_speakers


all_speakers = [speaker_detect(text['text']) for text in data['articles']]

print('–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ –±–µ–∑ —Å–ø–∏–∫–µ—Ä–æ–≤:', len(list(filter(lambda x: len(x) == 0, all_speakers))))
print('–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ —Å –æ–¥–Ω–∏–º —Å–ø–∏–∫–µ—Ä–æ–º:', len(list(filter(lambda x: len(x) == 1, all_speakers))))
print('–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ —Å –¥–≤—É–º—è —Å–ø–∏–∫–µ—Ä–∞–º–∏:', len(list(filter(lambda x: len(x) == 2, all_speakers))))
print('–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ —Å –Ω–µ –º–µ–Ω–µ–µ —Ç—Ä–µ–º—è —Å–ø–∏–∫–µ—Ä–∞–º–∏:', len(list(filter(lambda x: len(x) >= 3, all_speakers))))

–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ –±–µ–∑ —Å–ø–∏–∫–µ—Ä–æ–≤: 74
–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ —Å –æ–¥–Ω–∏–º —Å–ø–∏–∫–µ—Ä–æ–º: 2313
–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ —Å –¥–≤—É–º—è —Å–ø–∏–∫–µ—Ä–∞–º–∏: 119
–ß–∏—Å–ª–æ –¥–æ–∫–ª–∞–¥–æ–≤ —Å –Ω–µ –º–µ–Ω–µ–µ —Ç—Ä–µ–º—è —Å–ø–∏–∫–µ—Ä–∞–º–∏: 35


–§–∏–ª—å—Ç—Ä—É–µ–º –∫–∞—Ç–µ–≥–æ—Ä–∏–∏

In [21]:
def clear_categories(sample):
    return [re.sub('–ö–∞—Ç–µ–≥–æ—Ä–∏—è:', '', el) for el in sample]


text_categories = [clear_categories(item['categories']) for item in data['articles']]
child_categories = [item['title'] for item in data['categories']]
parent_categories = [clear_categories(item['categories']) for item in data['categories']]

ALL_CATEGORIES = list(set(itertools.chain(*text_categories)) | set(child_categories) | set(itertools.chain(*parent_categories)))

child_to_parent_categories = {item['title']: clear_categories(item['categories']) for item in data['categories']}

In [23]:
ALL_CATEGORIES

['–ì–µ–π–º–∏—Ñ–∏–∫–∞—Ü–∏—è –≤ UX',
 'SECON-2017',
 '–ú–∏—Ö–∞–∏–ª –ö–∞—Ä–ø–æ–≤',
 '–ù–∏–∫–∏—Ç–∞ –ï—Ä–º–∞–∫–æ–≤',
 '–ò–ª—å—è –ë–æ–≥—É–Ω–æ–≤',
 '–ò–≥–æ—Ä—å –ë—É—Ä–µ–Ω–∫–æ–≤',
 '–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—Ä–æ–∑–æ—Ä–æ–≤',
 '–î–º–∏—Ç—Ä–∏–π –°–∞—Ç–∏–Ω',
 '–û–ª–µ–≥ –ì—Ä–æ–º–æ–≤',
 '–í–∏—Ç–∞–ª–∏–π –ö–∞–º—è–Ω—Å–∫–∏–π',
 '–ï–≤–≥–µ–Ω–∏–π –ö–∞–±–∞–Ω–æ–≤',
 '–Æ—Ä–∏–π –û—Ä–ª–æ–≤',
 '–ù–∞—Ç–∞–ª–∏—è –°–º–∏—Ä–Ω–æ–≤–∞',
 '–í–ª–∞–¥–∏–º–∏—Ä –í–∞—Ö–ª–æ–≤',
 '–ú–∏—Ö–∞–∏–ª –ó–∞–±–æ—Ä–æ–≤',
 '–ù–∞–∏–º –®–∞—Ñ–∏–µ–≤',
 'UI –±–∏–∑–Ω–µ—Å-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π',
 '–û—Ü–µ–Ω–∫–∞ —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–æ–≤',
 '–í–∏–∫—Ç–æ—Ä–∏—è –ü—Ä–∏–¥–∞—Ç–∫–æ',
 'Zuzi Sochova',
 '–ö–æ–Ω—Å—Ç–∞–Ω—Ç–∏–Ω –†—ã–±–∞—Å',
 '–ï–∫–∞—Ç–µ—Ä–∏–Ω–∞ –ü–æ—Ç–∞–ø–æ–≤–∞',
 '–î–º–∏—Ç—Ä–∏–π –ü–æ–¥–ª—É–∂–Ω—ã–π',
 '–§–µ–¥–æ—Ä –õ—è—Ö–æ–≤',
 '–ê–ª–µ–∫—Å–µ–π –õ–µ—Å–æ–≤—Å–∫–∏–π',
 '–ö—Ä–∏—Å—Ç–∏–Ω–∞ –ü–∏–≤–æ–≤–∞—Ä–æ–≤–∞',
 '–ü–ª–∞—Ç–æ–Ω –î–Ω–µ–ø—Ä–æ–≤—Å–∫–∏–π',
 '–ì–æ–ª–æ—Å–æ–≤–æ–π –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å',
 '–ù–∏–∫–∏—Ç–∞ –§—Ä–æ–ª–æ–≤',
 '–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –õ–µ–≥–∞–ª–æ–≤',
 '

–£–¥–∞–ª–∏–º –≤—Ä—É—á–Ω—É—é –≤—Å–µ –∏–º–µ–Ω–∞ –∏–∑ —Å–ø–∏—Å–∫–∞ –∫–∞—Ç–µ–≥–æ—Ä–∏–π + `HasSpeaker`

üßî: –û—Ö, –≤—Ä—É—á–Ω—É—é –Ω–∏—á–µ–≥–æ –¥–µ–ª–∞—Ç—å –Ω–µ–ª—å–∑—è, –Ω–µ–≤–æ—Å–ø—Ä–æ–∏–∑–≤–æ–¥–∏–º–æ –∏ –Ω–µ–º–æ–¥–∏—Ñ–∏—Ü–∏—Ä—É–µ–º–æ. –î–∞ –∏ –≤—Ä—É—á–Ω—É—é –Ω–µ –≤—ã–π–¥–µ—Ç (¬´Pascale Xelot-Dugat¬ª ‚Äî —ç—Ç–æ —Å–ø–∏–∫–µ—Ä)

In [22]:
ALL_CATEGORIES_WITHOUT_NAMES = set([
    '.NET',
    '1C',
    'ALT Linux',
    'ALTLinux –Ω–∞ –≠–ª—å–±—Ä—É—Å–µ',
    'AR',
    'AWS',
    'Accessibility',
    'Agile',
    'Agile Introduction',
    'Agile process',
    'Agile –≤ –∫–æ—Ä–ø–æ—Ä–∞—Ü–∏—è—Ö',
    'Agile ‚Äî —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–µ –ø—Ä–∞–∫—Ç–∏–∫–∏',
    'Agile&Lean Mindset',
    'Agile-–∫—É–ª—å—Ç—É—Ä–∞',
    'Agile-–º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ',
    'Agile-–ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è',
    'Alfresco',
    'Ansible',
    'Arduino',
    'AstraLinux',
    'Atlassian',
    'Azure',
    'B2B –ø—Ä–æ–¥—É–∫—Ç—ã',
    'BDD',
    'BigData',
    'Blockchain',
    'Bluemix',
    'Business rules engine',
    'C++',
    'CMMI',
    'CQRS',
    'CRIU',
    'Clouds',
    'Clsync',
    'Code Review',
    'Collaboration tools',
    'Configuration Management',
    'ContactOK',
    'Continuous Integration',
    'CouchDB',
    'Csharp',
    'CustisWikiToLib',
    'Customer Journey Map',
    'DDD',
    'DSL-—è–∑—ã–∫–∏',
    'Data Analysis',
    'Deployment',
    'Design Thinking',
    'DevOps',
    'Draft',
    'E-commerce',
    'Embox',
    'Erlang',
    'Extreme Programming',
    'Feature Branches',
    'Firefox',
    'Foresight management',
    'FreeIPA',
    'Front end development',
    'Fsharp',
    'GWT',
    'Glibc',
    'Go',
    'Groovy',
    'Growth Hacking',
    'HR',
    'Hardware',
    'Health',
    'High Performace Computing',
    'Highload-–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—ã',
    'IP-—Ç–µ–ª–µ—Ñ–æ–Ω–∏—è',
    'IT-–∑–∞–∫–æ–Ω—ã',
    'IT-–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ',
    'Impact Map',
    'Information Security',
    'Internet of Thing',
    'Java',
    'Java EE',
    'Javascript',
    'Jenkins',
    'Kanban',
    'Knowledge Management',
    'Kotlin',
    'Kubernetes',
    'LAMP',
    'LSM',
    'LeSS',
    'Lean',
    'Lean Startup',
    'Legacy',
    'Libreoffice',
    'Linux',
    'Linux –¥–ª—è –≠–ª—å–±—Ä—É—Å–∞',
    'Linux-–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã',
    'Linux-–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã –¥–ª—è Enterprise',
    'Linux-–æ–±—É—á–µ–Ω–∏–µ',
    'Lua',
    'MIPS',
    'Machine Learning',
    'Mechanics',
    'Microsoft',
    'Misc',
    'Mkimage-profiles',
    'MongoDB',
    'Morpheus',
    'MySQL',
    'NLP',
    'NOSQL',
    'NUI',
    'Natural Language Processing',
    'NeedContacts',
    'Nemerle',
    'Node.js',
    'Object Oriented Programming',
    'Open-source',
    'Open-source CAD',
    'Open-source CMS',
    'Open-source CRM',
    'Open-source ERP',
    'Open-source PAAS',
    'Open-source TCO',
    'Open-source and Community',
    'Open-source and hardware',
    'Open-source communications',
    'Open-source for Enterprise',
    'Open-source operating systems',
    'Open-source projects',
    'Open-source –°–£–ë–î',
    'Open-source –∏ –∑–∞–∫–æ–Ω—ã',
    'OpenShift',
    'OpenStack',
    'OpenVZ',
    'PAAS',
    'PHP',
    'Pascale Xelot-Dugat',
    'People Management',
    'Pivot',
    'PostgreSQL',
    'ProductMeetup',
    'Python',
    'RISC-V',
    'ROSA Linux',
    'ROSALab',
    'RTOS',
    'Reviewed',
    'Riak',
    'Ruby',
    'RunaWFE',
    'SAP',
    'SELinux',
    'SOLID',
    'SVM',
    'Samba',
    'Scala',
    'Scrum',
    'Serverless',
    'Sharepoint',
    'SkillsWiki',
    'Skype',
    'Software Defined Networks',
    'Strace',
    'Support',
    'TAU-–ø–ª–∞—Ç—Ñ–æ—Ä–º–∞',
    'TDD',
    'Talks in English',
    'Tarantool',
    'Taucraft',
    'Team Communication',
    'Te–º—ã',
    'ToPublish',
    'Tuukka Ahoniemi',
    'UI',
    'UI SmartTV',
    'UI –±–∏–∑–Ω–µ—Å-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π',
    'UX',
    'UX + Agile',
    'UX –ø—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ',
    'User Story',
    'VCS',
    'Visual Studio',
    'Waterfall',
    'WebRTC',
    'Windows',
    'World Usability Day',
    'ZFS',
    'Zabbix',
    '–ê–≤—Ç–æ–º–∞—Ç–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–æ–µ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ê–ª–≥–æ—Ä–∏—Ç–º—ã',
    '–ê–Ω–∞–ª–∏–∑ –ø—Ä–æ–≥—Ä–∞–º–º –∏ —Å–∏—Å—Ç–µ–º',
    '–ê–Ω–∞–ª–∏—Ç–∏–∫–∞',
    '–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞',
    '–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω—ã—Ö —Å–∏—Å—Ç–µ–º',
    '–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ —Å–µ—Ä–≤–µ—Ä–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π',
    '–ê—É—Ç–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ü–∏—è –∏ –∞–≤—Ç–æ—Ä–∏–∑–∞—Ü–∏—è',
    '–ë–ü–õ–ê',
    '–ë–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö',
    '–ë–∞–π–∫–∞–ª',
    '–ë–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç—å',
    '–ë–∏–∑–Ω–µ—Å –≤ IT',
    '–ë–∏–∑–Ω–µ—Å –∏ –°–ü–û',
    '–ë–∏–∑–Ω–µ—Å-–∞–Ω–∞–ª–∏–∑',
    '–ë–ª–∏—Ü-–¥–æ–∫–ª–∞–¥—ã',
    '–í–µ–±-–¥–∏–∑–∞–π–Ω',
    '–í–µ–±-—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
    '–í–µ–±–∏–Ω–∞—Ä—ã PingWin',
    '–í–µ—Ä–∏—Ñ–∏–∫–∞—Ü–∏—è',
    '–í–∏–¥–µ–æ—Å–≤—è–∑—å',
    '–í–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è',
    '–í–∏—Ä—Ç—É–∞–ª–∏–∑–∞—Ü–∏—è',
    '–í–∏—Ä—Ç—É–∞–ª—å–Ω—ã–π –∞—Å—Å–∏—Å—Ç–µ–Ω—Ç',
    '–í—Å—Ç—Ä–∞–∏–≤–∞–µ–º—ã–µ —Å–∏—Å—Ç–µ–º—ã',
    '–í—ã—Ö–æ–¥ –Ω–∞ –∑–∞—Ä—É–±–µ–∂–Ω—ã–µ —Ä—ã–Ω–∫–∏',
    '–ì–µ–π–º–∏—Ñ–∏–∫–∞—Ü–∏—è',
    '–ì–µ–π–º–∏—Ñ–∏–∫–∞—Ü–∏—è –≤ UX',
    '–ì–µ–æ–ª–æ–∫–∞—Ü–∏—è',
    '–ì–æ–ª–æ—Å–æ–≤–æ–π –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å',
    '–ì–æ—Å—Å–µ–∫—Ç–æ—Ä',
    '–ì–æ—Å—É–¥–∞—Ä—Å—Ç–≤–æ –∏ —Å–æ—Ñ—Ç',
    '–ì—Ä–∞—Ñ–æ–≤—ã–µ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö',
    '–î–∏–∞–≥—Ä–∞–º–º—ã –ö–∞–Ω–æ',
    '–î–∏–∑–∞–π–Ω',
    '–î–∏–Ω–∞–º–∏—á–µ—Å–∫–∏–π –∞–Ω–∞–ª–∏–∑',
    '–î–∏—Å–∫—É—Å—Å–∏–∏',
    '–î–∏—Å–∫—É—Å—Å–∏–∏ –æ —é–∑–∞–±–∏–ª–∏—Ç–∏',
    '–î–æ–≤–µ—Ä–µ–Ω–Ω–∞—è –∑–∞–≥—Ä—É–∑–∫–∞',
    '–î–æ–∫–ª–∞–¥ —Å–æ —Å—Ç–µ–Ω–æ–≥—Ä–∞–º–º–æ–π',
    '–î–æ–∫–ª–∞–¥—á–∏–∫–∏',
    '–î–æ–∫–ª–∞–¥—ã –Ω–∞ –∞–Ω–≥–ª–∏–π—Å–∫–æ–º',
    '–î–æ–∫–ª–∞–¥—ã –Ω–∞ –±–µ–ª–æ—Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ',
    '–î–æ–∫–ª–∞–¥—ã –Ω–∞ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö —è–∑—ã–∫–∞—Ö',
    '–î–æ–∫–ª–∞–¥—ã –Ω–∞ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–º',
    '–î–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è –∏ Agile',
    '–î–æ–∫—É–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ó–∞–ø—É—Å–∫ –ø—Ä–æ–¥—É–∫—Ç–∞',
    '–ó–∞–ø—É—Å–∫ –ø—Ä–æ–¥—É–∫—Ç–∞ –≤ Retail',
    '–ò–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã –º–∞–π–Ω—Ç–µ–π–Ω–µ—Ä–∞',
    '–ò–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã –º–∞–π–Ω—Ç–µ–π–Ω–µ—Ä–æ–≤',
    '–ò–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏',
    '–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω–∞—è –±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç—å',
    '–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω–∞—è –±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç—å –∏ –°–ü–û',
    '–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã –í–£–ó–æ–≤',
    '–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ open-source',
    '–ò—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å—Å–∫–æ–µ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ò—Å—Ç–æ—Ä–∏—è –∏–∑ –ø—Ä–∞–∫—Ç–∏–∫–∏',
    '–ö–∞—Ä—Ç–æ–≥—Ä–∞—Ñ–∏—è',
    '–ö–∞—Ä—å–µ—Ä–∞ –≤ IT',
    '–ö–æ–º–∞–Ω–¥–æ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ',
    '–ö–æ–º–ø–∏–ª—è—Ç–æ—Ä—ã',
    '–ö–æ–º–ø–∏–ª—è—Ü–∏—è –ø–æ–¥ Linux',
    '–ö–æ–º–ø—å—é—Ç–µ—Ä–Ω–∞—è –≥—Ä–∞—Ñ–∏–∫–∞',
    '–ö–æ–º–ø—å—é—Ç–µ—Ä–Ω–æ–µ –∑—Ä–µ–Ω–∏–µ',
    '–ö–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏',
    '–ö–æ—Ä–ø–æ—Ä–∞—Ç–∏–≤–Ω—ã–µ —Ä–µ—à–µ–Ω–∏—è',
    '–ö—Ä–∏–ø—Ç–æ–≥—Ä–∞—Ñ–∏—è',
    '–ö—Ä–æ—Å—Å–ø–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω–Ω–∞—è —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
    '–ö—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª',
    '–ö—É–º–∏—Ä',
    '–õ–∏–¥–µ—Ä—Å—Ç–≤–æ',
    '–õ–∏–Ω–µ–π–∫–∏ –∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–π',
    '–õ–æ–∫–∞–ª–∏–∑–∞—Ü–∏—è',
    '–ú–∞—Ä–∫–µ—Ç–∏–Ω–≥',
    '–ú–∞—Å—Ç–µ—Ä-–∫–ª–∞—Å—Å—ã',
    '–ú–µ–Ω–µ–¥–∂–º–µ–Ω—Ç',
    '–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞',
    '–ú–∏–∫—Ä–æ–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ú–∏–∫—Ä–æ—Å–µ—Ä–≤–∏—Å—ã',
    '–ú–æ–±–∏–ª—å–Ω–∞—è —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
    '–ú–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–µ –±–∏–∑–Ω–µ—Å-–ø—Ä–æ—Ü–µ—Å—Å–æ–≤',
    '–ú–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–µ —Ñ–∏–∑–∏—á–µ—Å–∫–∏—Ö —Å–∏—Å—Ç–µ–º',
    '–ú–æ–Ω–µ—Ç–∏–∑–∞—Ü–∏—è',
    '–ú–æ–Ω–∏—Ç–æ—Ä–∏–Ω–≥',
    '–ú–æ—Ç–∏–≤–∞—Ü–∏—è',
    '–ù–∞—É–∫–∞',
    '–û–±–ª–∞—á–Ω—ã–µ —Å–µ—Ä–≤–∏—Å—ã',
    '–û–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ',
    '–û–±—É—á–µ–Ω–∏–µ',
    '–û–±—É—á–µ–Ω–∏–µ –±–∏–∑–Ω–µ—Å-–∞–Ω–∞–ª–∏–∑—É',
    '–û–±—É—á–µ–Ω–∏–µ –±–∏–∑–Ω–µ—Å-–ø—Ä–æ—Ü–µ—Å—Å–∞–º',
    '–û–±—É—á–µ–Ω–∏–µ –ø—Ä–æ–µ–∫—Ç–Ω–æ–º—É –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç—É',
    '–û–±—É—á–µ–Ω–∏–µ —Å–∏—Å—Ç–µ–º–Ω–æ–º—É –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—é',
    '–û–Ω–ª–∞–π–Ω-–æ–±—É—á–µ–Ω–∏–µ',
    '–û–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã',
    '–û–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è',
    '–û–ø—ã—Ç –≤–Ω–µ–¥—Ä–µ–Ω–∏—è –°–ü–û',
    '–û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–æ–Ω–Ω—ã–µ –∏–∑–º–µ–Ω–µ–Ω–∏—è',
    '–û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–æ–Ω–Ω—ã–π –∞–Ω–∞–ª–∏–∑',
    '–û—Ç–∫—Ä—ã—Ç—ã–µ –¥–∞–Ω–Ω—ã–µ',
    '–û—Ç–ª–∞–¥–∫–∞',
    '–û—Ü–µ–Ω–∫–∞ —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–æ–≤',
    '–û—á–µ—Ä–µ–¥–∏',
    '–ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ü–∏–∫—Ç–æ–º–∏—Ä',
    '–ü–ª–∞–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ü–ª–∞–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ Agile',
    '–ü–ª–∞–Ω–∏—Ä–æ–≤–∫–∞ –∑–∞–¥–∞—á',
    '–ü—Ä–∏–≤–µ—Å—Ç–≤–µ–Ω–Ω—ã–µ —Ä–µ—á–∏',
    '–ü—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–ü—Ä–æ–≥—Ä–∞–º–º–Ω–∞—è –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞',
    '–ü—Ä–æ–¥—É–∫—Ç–æ–≤–∞—è –∞–Ω–∞–ª–∏—Ç–∏–∫–∞',
    '–ü—Ä–æ—Ç–æ—Ç–∏–ø–∏—Ä–æ–≤–∞–Ω–∏–µ UI',
    '–ü—Ä–æ—Ü–µ—Å—Å —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏',
    '–ü—Ä–æ—Ü–µ—Å—Å —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ UX –∏ UI',
    '–ü—Ä–æ—Ü–µ—Å—Å —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è',
    '–ü—Å–∏—Ö–æ–ª–æ–≥–∏—è –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è',
    '–ü—Å–∏—Ö–æ–ª–æ–≥–∏—è —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏',
    '–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ open-source',
    '–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –¥–µ—Å–∫—Ç–æ–ø-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π',
    '–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –¥–µ—Å–∫—Ç–æ–ø–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π –ø–æ–¥ Windows',
    '–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –∏–≥—Ä',
    '–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã—Ö —Å–∏—Å—Ç–µ–º',
    '–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã',
    '–†–µ–¥–∫–∏–µ —è–∑—ã–∫–∏ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è',
    '–†–µ–∫–ª–∞–º–∞',
    '–†–µ–∫–ª–∞–º–∞ –∫–æ–º–ø–∞–Ω–∏–∏',
    '–†–µ–∫–æ–º–µ–Ω–¥–∞—Ç–µ–ª—å–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã',
    '–†–µ—Ç—Ä–æ—Å–ø–µ–∫—Ç–∏–≤–∞',
    '–†–µ—Ñ–∞–∫—Ç–æ—Ä–∏–Ω–≥',
    '–†–æ–±–æ—Ç–æ—Ç–µ—Ö–Ω–∏–∫–∞',
    '–°++',
    '–°–ê–ü–†',
    '–°–ü–û –≤ –ì–æ—Å—É–ø—Ä–∞–≤–ª–µ–Ω–∏–∏',
    '–°–ü–û –≤ –†–æ—Å—Å–∏–∏',
    '–°–ü–û –≤ –Ω–∞—É–∫–µ',
    '–°–ü–û –≤ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–∏',
    '–°–ü–û –≤ —Ç–≤–æ—Ä—á–µ—Å—Ç–≤–µ',
    '–°–ü–û –¥–ª—è —Å–∏—Å—Ç–µ–º–Ω–æ–≥–æ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏—è',
    '–°–£–ë–î',
    '–°–≤–æ–±–æ–¥–Ω—ã–µ –±–∏–±–ª–∏–æ—Ç–µ–∫–∏ –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—è –≥—Ä–∞—Ñ–∏–∫–æ–≤',
    '–°–≤–æ–±–æ–¥–Ω—ã–µ –ª–∏—Ü–µ–Ω–∑–∏–∏',
    '–°–µ—Ä–≤–∏—Å-–æ—Ä–∏–µ–Ω—Ç–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞',
    '–°–µ—Ç–∏',
    '–°–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–°–∏—Å—Ç–µ–º–Ω–æ–µ –º—ã—à–ª–µ–Ω–∏–µ',
    '–°–∏—Å—Ç–µ–º–Ω—ã–π –∞–Ω–∞–ª–∏–∑',
    '–°–∏—Å—Ç–µ–º–Ω—ã–π –ø–æ–¥—Ö–æ–¥',
    '–°–∏—Å—Ç–µ–º—ã —É–ø—Ä–∞–≤–ª–µ–Ω–∏—è –≤–µ—Ä—Å–∏—è–º–∏',
    '–°–∫—Ä—ã—Ç—ã–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏',
    '–°–æ–±—Ä–∞–Ω–∏—è ALT.NET',
    '–°–æ–≤–µ—â–∞–Ω–∏—è',
    '–°—Ç–∞–∂–∏—Ä–æ–≤–∫–∞',
    '–°—Ç–∞—Ç–∏—á–µ—Å–∫–∏–π –∞–Ω–∞–ª–∏–∑ –∫–æ–¥–∞',
    '–°—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–æ–µ –ø–ª–∞–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–¢–†–ò–ó',
    '–¢–µ–º—ã',
    '–¢–µ–æ—Ä–∏—è –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–π',
    '–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ UI',
    '–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –∏–≥—Ä',
    '–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–±–∏–ª—å–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π',
    '–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç–∏',
    '–¢–µ—Ö–Ω–æ–ª–æ–≥–∏–∏',
    '–¢–µ—Ö–Ω–æ–ª–æ–≥–∏–∏ –±—É–¥—É—â–µ–≥–æ',
    '–¢–µ—Ö–Ω–æ–ª–æ–≥–∏–∏ –∫—Ä—É–ø–Ω—ã—Ö –≤–µ–Ω–¥–æ—Ä–æ–≤',
    '–¢—Ä–µ–Ω–¥—ã Open-source',
    '–£–º–Ω—ã–µ –≤–µ—â–∏',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –∑–∞–∏–Ω—Ç–µ—Ä–µ—Å–æ–≤–∞–Ω–Ω—ã–º–∏ —Å—Ç–æ—Ä–æ–Ω–∞–º–∏',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –∫–∞—á–µ—Å—Ç–≤–æ–º',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –ø—Ä–æ–¥—É–∫—Ç–∞–º–∏',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ —Ä–∏—Å–∫–∞–º–∏',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ —Å–æ–±–æ–π',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ —Ç–µ—Ö–ø–æ–¥–¥–µ—Ä–∂–∫–æ–π',
    '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è–º–∏',
    '–§–∞–π–ª–æ–≤—ã–µ —Å–∏—Å—Ç–µ–º—ã',
    '–§–∏–ª–æ—Å–æ—Ñ–∏—è –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è',
    '–§–∏–ª–æ—Å–æ—Ñ–∏—è —é–∑–∞–±–∏–ª–∏—Ç–∏',
    '–§–∏–Ω–∞–Ω—Å–æ–≤—ã–µ —Å–∏—Å—Ç–µ–º—ã',
    '–§—Ä–µ–π–º–≤–æ—Ä–∫–∏',
    '–§—Ä–∏–ª–∞–Ω—Å',
    '–§—É–Ω–∫—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
    '–•—Ä–∞–Ω–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö',
    '–•—ç—à–∏',
    '–≠–∫—Å–ø–ª—É–∞—Ç–∞—Ü–∏—è',
    '–≠–ª—å–±—Ä—É—Å',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –≤ 1C',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –≤ –∏–≥—Ä–∞—Ö',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –∏–Ω—Ç–µ—Ä–Ω–µ—Ç-–º–∞–≥–∞–∑–∏–Ω–æ–≤',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏—è',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –º–æ–±–∏–ª—å–Ω—ã—Ö —É—Å—Ç—Ä–æ–π—Å—Ç–≤',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ –ø–æ–∏—Å–∫–∞',
    '–Æ–∑–∞–±–∏–ª–∏—Ç–∏ —Ç–µ–∫—Å—Ç–∞',
    '–Ø–∑—ã–∫–∏ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è',
])

In [8]:
child_to_parent_categories

{'.NET': ['–ü—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ'],
 '1C': [],
 'ADD-2010': ['Application Developers Days', '–ö–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏'],
 'ADD-2011': ['Application Developers Days', '–ö–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏'],
 'ADD-2012': ['Application Developers Days', '–ö–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏'],
 'ALTLinux –Ω–∞ –≠–ª—å–±—Ä—É—Å–µ': ['ALT Linux', 'Linux –¥–ª—è –≠–ª—å–±—Ä—É—Å–∞'],
 'ALT Linux': ['Linux-–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã –¥–ª—è Enterprise'],
 'AR': ['–ü—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ'],
 'AWS': ['PAAS', '–û–±–ª–∞—á–Ω—ã–µ —Å–µ—Ä–≤–∏—Å—ã'],
 'Accessibility': ['–Æ–∑–∞–±–∏–ª–∏—Ç–∏'],
 'Adrian Reed': ['–î–æ–∫–ª–∞–¥—á–∏–∫–∏'],
 'Agile': ['–ú–µ–Ω–µ–¥–∂–º–µ–Ω—Ç'],
 'Agile&Lean Mindset': ['Agile-–∫—É–ª—å—Ç—É—Ä–∞'],
 'Agile-–∫—É–ª—å—Ç—É—Ä–∞': ['Agile'],
 'Agile-–º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ': ['Agile'],
 'Agile-–ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è': [],
 'AgileDays': ['AgileDays-2011', 'AgileDays-2014', '–õ–∏–Ω–µ–π–∫–∏ –∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–π'],
 'AgileDays-2011': ['AgileDays', '–ö–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏'],
 'AgileDays-2013': [

In [9]:
def coalesce(dct, keys):
    for key in keys:
        if dct[key] is not None:
            return dct[key]
    return None


def parse_document_with_categories(doc):
    parsed_doc = dict()
    parsed_doc["title"] = doc["title"]
    parsed_doc['speakers'] = speaker_detect(doc['text'])
    
    sections_data = extract_sections(doc['text'])
    parsed_doc["annotation"] = sections_data["–ê–Ω–Ω–æ—Ç–∞—Ü–∏—è"]
    parsed_doc["thesis"] = coalesce(
        sections_data, ["Thesis", "–¢–µ–∑–∏—Å—ã", "–†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–µ —Ç–µ–∑–∏—Å—ã"]
    )
    
    parsed_doc['raw_categories'] = clear_categories(doc["categories"])
    parsed_doc["categories"] = list(
        filter(
            lambda x: x in ALL_CATEGORIES_WITHOUT_NAMES,
            parsed_doc['raw_categories'],
        )
    )
    return parsed_doc


data_raw = [parse_document_with_categories(doc) for doc in data['articles']]

–¢–∞–∫ –∫–∞–∫ –±–æ–ª—å—à–∏–µ —è–∑—ã–∫–æ–≤—ã–µ –º–æ–¥–µ–ª–∏ –¥–æ–≤–æ–ª—å–Ω–æ "—É–º–Ω—ã–µ", —Ç–æ –º–æ–∂–Ω–æ —Å–ø–æ–∫–æ–π–Ω–æ –æ–ø—É—Å—Ç–∏—Ç—å –±–æ–ª—å—à—É—é —á–∞—Å—Ç—å —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏ —Ç–µ–∫—Å—Ç–∞.

–ü—Ä–∏—á—ë–º, –µ—Å–ª–∏ —ç—Ç—É —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏—é –æ—Å—Ç–∞–≤–∏—Ç—å, —Ç–æ –ø–æ–ª—É—á–µ–Ω–Ω—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –æ–∫–∞–∂—É—Ç—Å—è –Ω–µ —Ç–∞–∫–∏–º–∏ —Ö–æ—Ä–æ—à–∏–º–∏, –ø–æ—Å–∫–æ–ª—å–∫—É –æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç –º–µ–Ω–µ–µ –ª–æ–≥–∏—á–µ–Ω –∏ —Å–≤—è–∑–µ–Ω, –≤ –æ—Ç–ª–∏—á–∏–∏ –æ—Ç –µ—Å—Ç–µ—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ —è–∑—ã–∫–∞.

–¢–µ–º –Ω–µ –º–µ–Ω–µ–µ, –º—ã –æ—Å—Ç–∞–≤–∏–º –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ —Ñ–∏–ª—å—Ç—Ä—ã: —ç—Ç–æ —É–¥–∞–ª–µ–Ω–∏–µ —Å—Å—ã–ª–æ–∫ –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ç–æ–∫–µ–Ω–∞ <sep> –º–µ–∂–¥—É —Ä–∞–∑–Ω—ã–º–∏ –º–æ–¥–∞–ª—å–Ω–æ—Å—Ç—è–º–∏

–û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞:
- —É–¥–∞–ª–µ–Ω–∏–µ —Å—Å—ã–ª–æ–∫
- —É–¥–∞–ª–µ–Ω–∏–µ –≤—Å–µ—Ö HTML —Ç–µ–≥–æ–≤

In [10]:
def clean_text_for_llm(text):
    if text is None or len(text) == 0:
        return ""

    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Remove LaTeX commands and symbols
    text = re.sub(r'\\[a-zA-Z]+\s*({.*?})?', '', text)
    text = re.sub(r'\{.*?\}', '', text)

    # Remove special characters and excessive whitespace
    text = re.sub(r'[^a-zA-Z–∞-—è–ê-–Ø0-9\s.,<>:;\'\"()\[\]\-‚Äî]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text


def clean_and_merge_document_for_llm(doc, sep_token=SEP_TOKEN):
    cleaned_texts = []

    # Clean each field and collect non-empty results
    for field in ["title", "annotation", "thesis"]:
        field_text = doc.get(field, "")
        cleaned_tokens = clean_text_for_llm(field_text)

        if cleaned_tokens:
            cleaned_texts.append(cleaned_tokens)

    # Join with <sep> token
    result_text = f" {sep_token} ".join(cleaned_texts).strip()

    result = {
        "text": result_text,
        "categories": doc.get("categories", []),
        "raw_categories": doc.get("raw_categories", []),
        "speakers": doc.get("speakers", []),
    }

    return result


# Example usage
text = """
<html>–ü—Ä–∏–º–µ—Ä —Ç–µ–∫—Å—Ç–∞. –≠—Ç–æ —Ç–µ—Å—Ç–æ–≤—ã–π —Ç–µ–∫—Å—Ç —Å –ø—Ä–∏–º–µ—Ä–æ–º <a href="http://example.com">—Å—Å—ã–ª–∫–∏</a>.
–í–æ—Ç –µ—â–µ —Å—Å—ã–ª–∫–∞: https://www.test.ru.
–ó–¥–µ—Å—å –µ—Å—Ç—å —á–∞—Å—Ç–æ-–≤—Å—Ç—Ä–µ—á–∞—é—â–∏–µ—Å—è —Å–ª–æ–≤–∞, –Ω–∞–ø—Ä–∏–º–µ—Ä, "—Ç–µ—Å—Ç" –º–Ω–æ–≥–æ —Ä–∞–∑.
...
–¢–∞–∫–∂–µ –Ω—É–∂–Ω–æ —É–¥–∞–ª–∏—Ç—å .—ç—Ç–∏ —Å—Ç—Ä–æ–∫–∏.
"""

print(clean_text_for_llm(text))
print(clean_and_merge_document_for_llm(data_raw[1298]))

 –ü—Ä–∏–º–µ—Ä —Ç–µ–∫—Å—Ç–∞. –≠—Ç–æ —Ç–µ—Å—Ç–æ–≤—ã–π —Ç–µ–∫—Å—Ç —Å –ø—Ä–∏–º–µ—Ä–æ–º —Å—Å—ã–ª–∫–∏. –í–æ—Ç –µ—â–µ —Å—Å—ã–ª–∫–∞: –ó–¥–µ—Å—å –µ—Å—Ç—å —á–∞—Å—Ç–æ-–≤—Å—Ç—Ä–µ—á–∞—é—â–∏–µ—Å—è —Å–ª–æ–≤–∞, –Ω–∞–ø—Ä–∏–º–µ—Ä, "—Ç–µ—Å—Ç" –º–Ω–æ–≥–æ —Ä–∞–∑. ... –¢–∞–∫–∂–µ –Ω—É–∂–Ω–æ —É–¥–∞–ª–∏—Ç—å .—ç—Ç–∏ —Å—Ç—Ä–æ–∫–∏. 
{'text': '–ú–∞—Ä–∫–µ—Ç–∏–Ω–≥ –º–æ–±–∏–ª—å–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π (–Æ—Ä–∏–π –ú–µ–ª—å–Ω–∏—á–µ–∫, SECR-2015) <sep> –Ø —Ä—É–∫–æ–≤–æ–∂—É MAPS.ME —Å —Å–∞–º–æ–≥–æ –Ω–∞—á–∞–ª–∞ –∏ –¥–æ 25 –º–∏–ª–ª–∏–æ–Ω–æ–≤ –∏–Ω—Å—Ç–∞–ª–ª—è—Ü–∏–π –Ω–∞ —Ç–µ–∫—É—â–∏–π –º–æ–º–µ–Ω—Ç. –ó–∞ —ç—Ç–æ –≤—Ä–µ–º—è —É –º–µ–Ω—è —Å—Ñ–æ—Ä–º–∏—Ä–æ–≤–∞–ª—Å—è —Å–∏—Å—Ç–µ–º–Ω—ã–π –≤–∑–≥–ª—è–¥ –Ω–∞ –º–∞—Ä–∫–µ—Ç–∏–Ω–≥ –º–æ–±–∏–ª—å–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π, –∫–æ—Ç–æ—Ä—ã–º —è –∏ –ø–æ–¥–µ–ª—é—Å—å —Å–æ —Å–ª—É—à–∞—Ç–µ–ª—è–º–∏: –ö–∞–∫–∏–µ —Å—É—â–µ—Å—Ç–≤—É—é—Ç –º–∞—Ä–∫–µ—Ç–∏–Ω–≥–æ–≤—ã–µ –∫–∞–Ω–∞–ª—ã –¥–ª—è —Ä–∞—Å–∫—Ä—É—Ç–∫–∏ –º–æ–±–∏–ª—å–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π (PR, ASO, —Ä–µ–∫–ª–∞–º–∞, —Ñ–∏—á–µ—Ä–∏–Ω–≥–∏ –∏ —Ç.–¥). –ö–∞–∫ —Ä–∞–±

Process text

In [11]:
processed_data = [clean_and_merge_document_for_llm(doc) for doc in data_raw]

In [12]:
text_lengths = list(map(len, [elem['text'] for elem in processed_data]))
pd.Series(text_lengths).describe()

count     2541.000000
mean      1327.888233
std       2120.183248
min          2.000000
25%        282.000000
50%        623.000000
75%       1153.000000
max      20089.000000
dtype: float64

## Prompt engineering for topic summary

### Utils

In [13]:
OLLAMA_CONFIG = {
    "keep_alive": "2m",
    "stream": False,
}

SYSTEM_PROMPT = (
    "You are an AI assistant tasked with generating precise and concise topic titles. "
    "Do not include explanations or reasoning‚Äîonly provide the final title for each topic. "
    "Write answer in Russian language, but if there are foreign terms leave them as is."
)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [14]:
def run_prompt(endpoint, prompt, model='gemma2', temperature=0, seed=123, **kwargs):
    options_config = {"temperature": temperature, "seed": seed, **kwargs}

    combined_prompt = f"{SYSTEM_PROMPT}\n\n{prompt}"

    payload = {
        **OLLAMA_CONFIG,
        "model": model,
        "prompt": combined_prompt,
        "options": options_config
    }

    response = httpx.post(
        endpoint,
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=300,
    )
    
    if response.status_code != 200:
        print("Error", response.status_code)
        assert(False)

    return response


def debug_prompt(prompt, endpoint='http://127.0.0.1:11434/api/generate', model='gemma2', **kwargs):
    text = re.compile(r'#[^\n]*').sub('', prompt).strip()
    response = run_prompt(endpoint, text, model, **kwargs)
    response_content = response.json().get('response', 'No response found.')
    
    print(response_content)

### Examples

In [15]:
!ollama list

NAME                               ID              SIZE      MODIFIED     
rscr/ruadapt_qwen2.5_32b:Q8_0      9907b3df7370    34 GB     7 days ago      
mistral:latest                     f974a74358d6    4.1 GB    2 weeks ago     
llama3:latest                      365c0bd3c000    4.7 GB    2 weeks ago     
aya-expanse:latest                 65f986688a01    5.1 GB    7 weeks ago     
gemma2:latest                      ff02c3702f32    5.4 GB    7 weeks ago     
gemma2:27b                         53261bc9c192    15 GB     2 months ago    
mistral-large:latest               0ca7dfa0bf06    69 GB     2 months ago    
deepseek-v2.5:latest               409b2dd8a3c4    132 GB    2 months ago    
dolphin-mixtral:latest             cfada4ba31c7    26 GB     8 months ago    
codellama:latest                   8fdf8f752f6e    3.8 GB    8 months ago    
llama2:latest                      78e26419b446    3.8 GB    8 months ago    
mistral:7b-instruct-v0.2-q4_K_S    ba00d3a5239e    4.1 GB    9 mont

–ë–æ–ª—å—à–∏–µ –º–æ–¥–µ–ª–∏ (–±–æ–ª—å—à–µ 15 GB) –¥–æ–ª–≥–æ –∏–Ω—Ñ–µ—Ä–µ–Ω—Å—è—Ç—Å—è: —ç—Ç–æ `mistral-large`, `deepseek-v2.5`, `dolphin-mixtral`. –ü–æ—ç—Ç–æ–º—É –∏—Ö –º—ã –æ—Ç–±—Ä–æ—Å–∏–º

–¢–∞–∫–∂–µ –∏–∑ —Ä–∞—Å—Å–º–æ—Ç—Ä–µ–Ω–∏—è –∏—Å–∫–ª—é—á–µ–Ω—ã `codellama` –∏–∑-–∑–∞ –æ—à–∏–±–∫–∏ 404 –≤ API

In [24]:
models_list = (
    'llama3',
    'mistral',
    'aya-expanse',
    'gemma2',
    'gemma2:27b',
    'llama2',
    'mistral:7b-instruct-v0.2-q4_K_S',
    'codellama:13b'
)

for model in models_list:
    print(model)
    debug_prompt("""
    –¢—ã —Ä–µ–¥–∞–∫—Ç–æ—Ä –≤–∏–∫–∏–ø–µ–¥–∏–∏. 
    –ü—Ä–æ—á—Ç–∏ —Ç–µ–∫—Å—Ç ''' 
    ¬´–ö–æ–≤–∏–¥–Ω—ã–π –ø–µ—Ä–∏–æ–¥¬ª –∏ –Ω–µ–¥–∞–≤–Ω–æ –Ω–∞—Å—Ç—É–ø–∏–≤—à–∞—è ¬´—ç—Ä–∞ –ø–µ—Ä–µ–º–µ–Ω¬ª —Å–∏–ª—å–Ω–æ –ø–æ–¥—Å—Ç–µ–≥–Ω—É–ª–∞ –í–£–ó–æ–≤—Å–∫–æ–µ –¥–∏—Å—Ç–∞–Ω—Ü–∏–æ–Ω–Ω–æ–µ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ. –†–∞–Ω—å—à–µ –Ω–æ—Ä–º–æ–π –±—ã–ª–∏ ¬´–æ—Ñ—Ñ–ª–∞–π–Ω¬ª –∑–∞–Ω—è—Ç–∏—è, –µ—Å–ª–∏ –ø–æ–≤–µ–∑–ª–æ, –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–µ –∫–Ω–∏–∂–∫–æ–π –ª–µ–∫—Ç–æ—Ä–∞, –≤–∏–¥–µ–æ–∑–∞–ø–∏—Å—è–º–∏ –ª–µ–∫—Ü–∏–π-—Å–µ–º–∏–Ω–∞—Ä–æ–≤, –∞ ¬´–¥–∏—Å—Ç–∞–Ω—Ü–∏–æ–Ω–∫–∞¬ª –±—ã–ª–∞ —Å–∫–æ—Ä–µ–µ —Ä–µ–¥–∫–æ —Ç–µ—Ä–ø–∏–º—ã–º –∏—Å–∫–ª—é—á–µ–Ω–∏–µ–º. –°–µ–π—á–∞—Å –∂–µ –Ω–æ—Ä–º–æ–π —Å—Ç–∞–Ω–æ–≤–∏—Ç—Å—è —Å–∏—Ç—É–∞—Ü–∏—è, –∫–æ–≥–¥–∞ –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—å –∏ –¥–∞–∂–µ —Å—Ç—É–¥–µ–Ω—Ç—ã —Ä–∞–∑–±—Ä–æ—Å–∞–Ω—ã –ø–æ –≤—Å–µ–º—É –º–∏—Ä—É. –ù–æ –º–∞–ª–æ, —Å–æ–∑–≤–æ–Ω–∏–≤—à–∏—Å—å –≤ –∫–∞–∫–æ–º-–Ω–∏–±—É–¥—å zoom-–µ, –Ω–∞—á–∞—Ç—å —á–∏—Ç–∞—Ç—å –ª–µ–∫—Ü–∏—é, –∏, –∑–∞–≥–ª—è–¥—ã–≤–∞—è –≤ —á—ë—Ä–Ω—ã–µ –∫–≤–∞–¥—Ä–∞—Ç–∏–∫–∏ –æ—Ç–∫–ª—é—á—ë–Ω–Ω—ã—Ö –≤–∏–¥–µ–æ–∫–∞–º–µ—Ä, –ø—ã—Ç–∞—Ç—å—Å—è –ø–æ–Ω—è—Ç—å, —Å–ª—ã—à–∞—Ç –ª–∏ –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—è —Å—Ç—É–¥–µ–Ω—Ç—ã –∏ –≤–æ–æ–±—â–µ, —á—Ç–æ –æ–Ω–∏ –¥–µ–ª–∞—é—Ç, –º—É—á–∏—Ç–µ–ª—å–Ω–æ –Ω–∞–¥–µ—è—Å—å –≤–æ–≤–ª–µ—á—å –∏—Ö –≤ –ø—Ä–æ—Ü–µ—Å—Å. –£–≤—ã, —ç—Ç–æ –ø—Ä–æ—Å—Ç–æ –Ω–µ —Ä–∞–±–æ—Ç–∞–µ—Ç. –ï—â—ë —Ö—É–∂–µ —Å –¥–æ–º–∞—à–Ω–µ–π —Ä–∞–±–æ—Ç–æ–π ‚Äî –º–∞–ª–æ –¥–∞–≤–∞—Ç—å —Å—Ç—É–¥–µ–Ω—Ç–∞–º ¬´–∫–Ω–∏–≥—É¬ª –∏ –Ω–∞–¥–µ—è—Ç—Å—è, —á—Ç–æ –æ–Ω–∏ –µ—ë –ø—Ä–æ—á—Ç—É—Ç ‚Äî –≤—Å—ë –≤ —ç—Ç–æ–º —É—Å—Ç–∞—Ä–µ–≤—à–µ–º —Ñ–æ—Ä–º–∞—Ç–µ (–æ–±—ä—ë–º, —Å—Ç–∞—Ç–∏—á–Ω–æ—Å—Ç—å, –Ω–µ—É–¥–æ–±—Å—Ç–≤–æ ¬´–ª–∏—Å—Ç–æ–≤¬ª, –Ω–µ–∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω–æ—Å—Ç—å, –Ω–µ–≤–µ—Ä–∏—Ñ–∏—Ü–∏—Ä—É–µ–º–æ—Å—Ç—å) –Ω–µ—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ –ø–æ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–º –º–µ—Ä–∫–∞–º.

    –î–∞, –µ—Å—Ç—å –º–∞—Å—Å–∞ –æ–Ω–ª–∞–π–Ω —Å–µ—Ä–≤–∏—Å–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ —É–∂–µ –¥–æ–ª–≥–æ–µ –≤—Ä–µ–º—è –ø–æ–º–æ–≥–∞–ª–∏ –∞–π—Ç–∏—à–Ω–∏–∫–∞–º –∏ —Å—Ç—É–¥–µ–Ω—Ç–∞–º –≤–æ –≤—Ä–µ–º—è —Å–æ–∑–≤–æ–Ω–æ–≤, –∏ –∫–æ—Ç–æ—Ä—ã–µ —Å—Ç–∞–ª–∏ –µ—â—ë –ø–æ–ø—É–ª—è—Ä–Ω–µ–µ –≤ ¬´–∫–æ–≤–∏–¥–Ω—É—é —ç—Ä—É¬ª ‚Äî —Ä–∞–∑–¥–µ–ª—è–µ–º—ã–µ –æ–Ω–ª–∞–π–Ω –¥–æ—Å–∫–∏ –∏ –¥–æ–∫—É–º–µ–Ω—Ç—ã, –≥—É–≥–ª-–¥–æ–∫–∏ –∏ –≥—É–≥–ª-–∫–æ–ª–∞–±-–Ω–æ—É—Ç–±—É–∫–∏, ‚Ä¶ –Ω–æ –ø–æ—Å–ª–µ ¬´–≤–æ–π–Ω—ã –≤–∑–∞–∏–º–Ω—ã—Ö –∏–Ω—Ç–µ—Ä–Ω–µ—Ç-–±–ª–æ–∫–∏—Ä–æ–≤–æ–∫ –∏ —Å–∞–Ω–∫—Ü–∏–π¬ª –Ω–∞ –Ω–∏—Ö –Ω–µ–ª—å–∑—è —Ä–∞—Å—á–∏—Ç—ã–≤–∞—Ç—å. –ù–∞–ø—Ä–∏–º–µ—Ä, –≤ –†–§ –ø—Ä–æ–≥—Ä–∞–º–º–∞ ¬´Google Suite for Education¬ª –∑–∞–∫—Ä—ã–ª–∞—Å—å –∏–∑-–∑–∞ –∞–Ω—Ç–∏-–†–§ —Å–∞–Ω–∫—Ü–∏–π, —Å –¥—Ä—É–≥–æ–π —Å—Ç–æ—Ä–æ–Ω—ã, –¥–µ—Å—è—Ç–∫–∏ –∏ —Å–æ—Ç–Ω–∏ —Å–µ—Ä–≤–∏—Å–æ–≤ –ø–µ—Ä–µ—Å—Ç–∞–ª–∏ —Ä–∞–±–æ—Ç–∞—Ç—å —Ç–æ–ª—å–∫–æ –∏–∑-–∑–∞ —Ç–æ–≥–æ, —á—Ç–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–ª–∏ –∫–µ—à–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫—É —á–µ—Ä–µ–∑ —Å–µ—Ä–≤–∏—Å –°loudflare (–∑–∞–ø—Ä–µ—â–∞–µ—Ç—Å—è –†–ö–ù). –°–∏—Ç—É–∞—Ü–∏—è –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —É—Ö—É–¥—à–∞—Ç—å—Å—è, –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤—ã –Ω–µ—É—Ç–µ—à–∏—Ç–µ–ª—å–Ω—ã.

    –¢–æ–ª—å–∫–æ —Å–∏—Å—Ç–µ–º—ã —Å –æ—Ç–∫—Ä—ã—Ç—ã–º –∫–æ–¥–æ–º, –ø–æ–¥–Ω—è—Ç—ã–µ –Ω–∞ —Å–≤–æ–µ–π –∏–Ω—Ñ—Ä–∞—Å—Ç—Ä—É–∫—Ç—É—Ä–µ, –ø—Ä–µ–¥—Å–∫–∞–∑—É–µ–º—ã –∏ –∑–∞—â–∏—â–µ–Ω—ã –æ—Ç —ç—Ç–∏—Ö ¬´–±–ª–æ–∫–∞–¥¬ª ‚Äî –±–ª–æ–∫–∏—Ä–æ–≤–æ–∫ –∏ —Å–∞–Ω–∫—Ü–∏–π. –ú—ã —Ä–∞—Å—Å–∫–∞–∂–µ–º –∏–º–µ–Ω–Ω–æ –æ —Ç–∞–∫–∏—Ö —Å–∏—Å—Ç–µ–º–∞—Ö, —É—Å–ø–µ—à–Ω–æ –ø—Ä–∏–º–µ–Ω—è–µ–º—ã—Ö –∏ —Ä–∞–∑–≤–∏–≤–∞–µ–º—ã—Ö –∞–≤—Ç–æ—Ä–æ–º –ø—Ä–∏ –ø—Ä–µ–ø–æ–¥–∞–≤–∞–Ω–∏–∏ –º–∞—Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏—Ö –∏ –∞–ª–≥–æ—Ä–∏—Ç–º–∏—á–µ—Å–∫–∏—Ö –∫—É—Ä—Å–æ–≤ –≤ –ò–°–ü–†–ê–ù –∏ –ú–§–¢–ò –∏ –≤–µ–¥–µ–Ω–∏–∏ –∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å—Å–∫–æ–π –∏ –Ω–∞—É—á–Ω–æ–π —Ä–∞–±–æ—Ç—ã –≤ –æ—Ç–¥–µ–ª–µ –º–∞—Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏—Ö –º–µ—Ç–æ–¥–æ–≤ –∏ –∞–ª–≥–æ—Ä–∏—Ç–º–æ–≤ –ò–°–ü–†–ê–ù.

    –°–∏—Å—Ç–µ–º—ã –¥–ª—è –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏ —Å–æ–∑–≤–æ–Ω–æ–≤, ¬´–∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω—ã–µ –¥–æ—Å–∫–∏¬ª, —Å–∏—Å—Ç–µ–º—ã –¥–ª—è –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Å–æ–≤–º–µ—Å—Ç–Ω–æ–≥–æ —Ä–µ–¥–∞–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏—è –∫–æ–¥–∞ –∏ —Ç–µ–∫—Å—Ç–æ–≤, ¬´–∫–æ–ª–ª–∞–±–æ—Ä–∞—Ç–∏–≤–Ω—ã–µ –±–ª–æ–∫–Ω–æ—Ç—ã¬ª‚Ä¶ –æ—Ä–≥–∞–Ω–∏–∑—É—é—Ç –∏ ¬´–∫–ª–∞—Å—Å–Ω—É—é¬ª –∏ ¬´–≤–Ω–µ–∫–ª–∞—Å—Å–Ω—É—é¬ª —Ä–∞–±–æ—Ç—ã, –≥—Ä–∞–Ω–∏—Ü–∞ –º–µ–∂–¥—É –∫–æ—Ç–æ—Ä—ã–º–∏ —É–∂–µ —Ä–∞–∑–º—ã—Ç–∞ –ø–æ –≤—Å–µ–º—É, —á—Ç–æ –∫–∞–∫-—Ç–æ —Å–≤—è–∑–∞–Ω–æ —Å –º–∞—Ç–µ–º–∞—Ç–∏–∫–æ–π –∏ –∞–ª–≥–æ—Ä–∏—Ç–º–∞–º–∏. –ê –∏—Ö –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –ø–æ–∑–≤–æ–ª—è–µ—Ç –¥–µ–ª–∞—Ç—å –Ω–æ–≤—ã–µ —Ç–∏–ø—ã –æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å–Ω–æ–≥–æ –∫–æ–Ω—Ç–µ–Ω—Ç–∞ ‚Äî –Ω–∞ –∑–∞–º–µ–Ω—É —É–Ω—ã–ª—ã–º —Ç–æ–ª—Å—Ç—ã–º –∫–Ω–∏–∂–∫–∞–º –∏ —É—Ä–æ–¥–ª–∏–≤—ã–º —Å–ª–∞–π–¥–∞–º –ø—Ä–∏—Ö–æ–¥—è—Ç ¬´–∂–∏–≤—ã–µ –ª–∞–±–æ—Ä–∞—Ç–æ—Ä–∏–∏¬ª —Å –∫–æ–º–ø–∞–∫—Ç–Ω—ã–º–∏ –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω—ã–º–∏ –º–∞—Ç–µ—Ä–∏–∞–ª–∞–º–∏, —Å –∫–æ—Ç–æ—Ä—ã–º–∏ –º–æ–∂–Ω–æ –∏ ¬´–ø–æ–∏–≥—Ä–∞—Ç—å¬ª, –∏ –ø—Ä–∏–≤–ª–µ—á—å —Å—Ç—É–¥–µ–Ω—Ç–æ–≤ –∫ —Å–æ–≤–º–µ—Å—Ç–Ω–æ–º—É —Ç–≤–æ—Ä—á–µ—Å—Ç–≤—É –Ω–æ–≤–æ–≥–æ.
    '''
    –í—ã–≤–µ–¥–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—é, –∫ –∫–æ—Ç–æ—Ä–æ–π —ç—Ç–∞ —Å—Ç–∞—Ç—å—è –æ—Ç–Ω–æ—Å–∏—Ç—Å—è.
    """, model=model)
    print('='*80)

llama3


2024-12-30 05:25:39,271 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


¬´–û–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ —ç—Ä—É —Ü–∏—Ñ—Ä–æ–≤—ã—Ö —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π¬ª
mistral


2024-12-30 05:25:41,563 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 –ö–∞—Ç–µ–≥–æ—Ä–∏—è: "–û–Ω–ª–∞–π–Ω-–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ" –∏–ª–∏ "–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –≤ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–∏"
aya-expanse


2024-12-30 05:25:45,993 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


# –î–∏—Å—Ç–∞–Ω—Ü–∏–æ–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ –≤ —ç–ø–æ—Ö—É COVID-19: –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç–∏ –∏ –≤—ã–∑–æ–≤—ã —Å–∏—Å—Ç–µ–º —Å –æ—Ç–∫—Ä—ã—Ç—ã–º –∫–æ–¥–æ–º

–ö–∞—Ç–µ–≥–æ—Ä–∏—è: –û–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ, –¢–µ—Ö–Ω–æ–ª–æ–≥–∏–∏, –û—Ç–∫—Ä—ã—Ç—ã–π –∏—Å—Ö–æ–¥–Ω—ã–π –∫–æ–¥
gemma2


2024-12-30 05:25:49,753 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


–û–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ  

gemma2:27b


2024-12-30 05:26:02,666 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


–î–∏—Å—Ç–∞–Ω—Ü–∏–æ–Ω–Ω–æ–µ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤ –í–£–ó–∞—Ö —Å –æ—Ç–∫—Ä—ã—Ç—ã–º –∏—Å—Ö–æ–¥–Ω—ã–º –∫–æ–¥–æ–º 

llama2


2024-12-30 05:26:05,464 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"



–ö–∞—Ç–µ–≥–æ—Ä–∏—è: –û–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ
mistral:7b-instruct-v0.2-q4_K_S


2024-12-30 05:26:08,536 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 –≠—Ç–∞ —Å—Ç–∞—Ç—å—è –æ—Ç–Ω–æ—Å–∏—Ç—Å—è –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ "–û–Ω–ª–∞–π–Ω-–æ–±—É—á–µ–Ω–∏–µ" –∏–ª–∏ "–ò–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã –¥–ª—è –æ–±—É—á–µ–Ω–∏—è".
codellama:13b


2024-12-30 05:26:12,206 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"



–ö–æ–º–ø—å—é—Ç–µ—Ä–Ω—ã–µ –Ω–∞—É–∫–∏


–õ—É—á—à–µ —Å–µ–±—è –ø–æ–∫–∞–∑–∞–ª–∏ `llama3`, `mistral`, `gemma2:27b`, `mistral:7b-instruct-v0.2-q4_K_S`

–£ `codellama:13b` —Å–æ–≤—Å–µ–º –¥—É—Ä–Ω–æ–π –≤—ã—Ö–æ–¥, –ø–æ—ç—Ç–æ–º—É –Ω–µ –±—É–¥–µ–º –µ—ë –¥–∞–ª—å—à–µ —Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞—Ç—å

### –ú–æ–¥–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –ø—Ä–æ–º–ø—Ç

In [None]:
SUMMARIZING_PROMPT = """
    You are an AI language model designed to analyze topics defined by keywords, bigrams, and categories, and then summarize each topic into a concise, descriptive name. Use the provided information (keywords, bigrams, and categories) to infer the central theme of each topic and generate a short, meaningful title. Follow these guidelines:

    1. Focus on the most relevant and representative keywords and bigrams to identify the primary subject of the topic.
    2. Use the categories to refine and contextualize the theme.
    3. Avoid overly generic titles; aim for specificity and clarity.
    4. Ensure the title reflects the core idea that unites the keywords, bigrams, and categories.

    Here is the input format:
    ```
    Topic: [topic_name]
    Words: [list of words]
    Bigrams: [list of bigrams]
    Categories: [list of categories]
    ```

    And here is an example of the desired output:
    ```
    Input:
    Topic: topic_15
    Words: ['–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞', '–∫–æ–¥', '–ø–ª–∞—Ç—Ñ–æ—Ä–º–∞', '—Å–µ—Ä–≤–∏—Å', '—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è']
    Bigrams: ['—Ö—Ä–∞–Ω–∏–ª–∏—â–µ_–¥–∞–Ω–Ω—ã–µ', '–æ–±–ª–∞—á–Ω—ã–π_—Å–µ—Ä–≤–∏—Å', '—Ä–∞—Å–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π_—Å–∏—Å—Ç–µ–º–∞', '–º–æ–¥—É–ª—å_—è–¥—Ä–æ', '–æ–±—Ä–∞–±–æ—Ç–∫–∞_–±–æ–ª—å—à–∏–π']
    Categories: ['–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ', '–ø—Ä–æ–≥—Ä–∞–º–º–Ω–∞—è_–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞', '—Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã–µ_—Å–∏—Å—Ç–µ–º—ã', '–≤–∏—Ä—Ç—É–∞–ª–∏–∑–∞—Ü–∏—è', 'paas']

    Output:
    "topic_15": "–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ –ø—Ä–æ–≥—Ä–∞–º–º –∏ –æ–±–ª–∞—á–Ω—ã–µ —Å–µ—Ä–≤–∏—Å—ã"

    Input:
    Topic: topic_16
    Words: ['–º–æ–¥–µ–ª—å', '—Ç–µ–∫—Å—Ç', '–∞–Ω–∞–ª–∏–∑', '–º–µ—Ç–æ–¥', '–∞–ª–≥–æ—Ä–∏—Ç–º']
    Bigrams: ['—Ä–µ–¥_–±–∞–∑–∞', '—É—á–µ–±–Ω—ã–π_–º–∞—Ç–µ—Ä–∏–∞–ª', '—Å—É–±–¥_—Ä–µ–¥', '–≤—ã—á–∏—Å–ª–∏—Ç–µ–ª—å–Ω—ã–π_–∫–æ–º–ø–ª–µ–∫—Å', '–∞–Ω–∞–ª–∏–∑_—Ç–µ–∫—Å—Ç']
    Categories: ['–∞–ª–≥–æ—Ä–∏—Ç–º—ã', '–Ω–∞—É–∫–∞', 'machine_learning', 'natural_language_processing', 'data_analysis']

    Output:
    "topic_16": "–ú–æ–¥–µ–ª–∏ –∏ –∞–ª–≥–æ—Ä–∏—Ç–º—ã –∞–Ω–∞–ª–∏–∑–∞ —Ç–µ–∫—Å—Ç–∞"

    Input:
    Topic: topic_17
    Words: ['–ø—Ä–æ–±–ª–µ–º–∞', '—Ç—Ä–µ–±–æ–≤–∞–Ω–∏–µ', '–æ—Ü–µ–Ω–∫–∞', '–∫–∞—á–µ—Å—Ç–≤–æ', '–∑–∞–∫–∞–∑—á–∏–∫']
    Bigrams: ['—É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ_—Ç—Ä–µ–±–æ–≤–∞–Ω–∏–µ', '–±–æ–ª—å—à–∏–π_–∫–æ–ª–∏—á–µ—Å—Ç–≤–æ', '—Å–æ–≤–º–µ—Å—Ç–Ω—ã–π_—Ä–∞–±–æ—Ç–∞', '–æ—Ü–µ–Ω–∫–∞_–∫–∞—á–µ—Å—Ç–≤–æ', '—Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è_–ø—Ä–æ–µ–∫—Ç']
    Categories: ['–±–∏–∑–Ω–µ—Å_–∞–Ω–∞–ª–∏–∑', '–ø—Ä–æ—Ü–µ—Å—Å_—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è', '—É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ_—Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è–º–∏', '–¥–∏–∑–∞–π–Ω', 'team_communication']

    Output:
    "topic_17": "–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è–º–∏ –∏ –¥–∏–∑–∞–π–Ω"

    Input:
    Topic: topic_18
    Words: ['—è–≤–ª—è—Ç—å—Å—è', '–ª—é–±–æ–π', '–æ–±—Ä–∞–∑', '–ø–æ–∑–≤–æ–ª—è—Ç—å', '—á–∞—Å—Ç—å']
    Bigrams: ['–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å_–∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ', '–≤—ã—Å–æ–∫–∏–π_—É—Ä–æ–≤–µ–Ω—å', '–¥–∞–Ω–Ω—ã–π_–ø–æ–¥—Ö–æ–¥', '–∫–æ–¥_–ø—Ä–æ–µ–∫—Ç', '–∞–¥—Ä–µ—Å–Ω—ã–π_–ø—Ä–æ—Å—Ç—Ä–∞–Ω—Å—Ç–≤–æ']
    Categories: ['–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ', 'java', '—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞_–æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã—Ö_—Å–∏—Å—Ç–µ–º', '–º–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–µ_–±–∏–∑–Ω–µ—Å_–ø—Ä–æ—Ü–µ—Å—Å–æ–≤', '–¥–æ–∫–ª–∞–¥_—Å–æ_—Å—Ç–µ–Ω–æ–≥—Ä–∞–º–º–æ–π']

    Output:
    "topic_18": "–ü–æ–¥—Ö–æ–¥—ã –≤ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–∏ –∏ –º–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–∏"

    Input:
    Topic: topic_19
    Words: ['–∫–æ–º–ø–∞–Ω–∏—è', '—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π', '—Ä–æ—Å—Å–∏–π—Å–∫–∏–π', '—Ä–∞–∑–≤–∏—Ç–∏–µ', '—Ä–æ—Å—Å–∏—è']
    Bigrams: ['–∫–æ–º–∞–Ω–¥–∞_—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫', '–∏—Ç_–∫–æ–º–ø–∞–Ω–∏—è', '—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π_–ø–∏—Å–∞—Ç–µ–ª—å', '—Å–≤–æ–π_–ø—Ä–æ–µ–∫—Ç', '—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π_–¥–æ–ª–≥']
    Categories: ['agile_process', '–≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–æ_–∏_—Å–æ—Ñ—Ç', '–¥–æ–∫—É–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ', '—Å–ø–æ_–≤_—Ä–æ—Å—Å–∏–∏', '–ø–ª–∞–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ']

    Output:
    "topic_19": "–¢–µ—Ö–Ω–∏—á–µ—Å–∫–∏–µ –ø—Ä–æ–µ–∫—Ç—ã –≤ –†–æ—Å—Å–∏–∏"
    ```

    Now, process the following topics and generate titles for each:
"""

In [None]:
top_tokens_lvl0_arr = list(map(lambda x: x.strip(), top_tokens_lvl0.split('='*80)))
top_tokens_lvl1_arr = list(map(lambda x: x.strip(), top_tokens_lvl1.split('='*80)))

top_tokens_lvl0_arr = np.array(list(filter(None, top_tokens_lvl0_arr)))
top_tokens_lvl1_arr = np.array(list(filter(None, top_tokens_lvl1_arr)))

np.random.seed(123)
lvl0_rand_ids = np.random.choice(len(top_tokens_lvl0_arr), 2, replace=False)
lvl1_rand_ids = np.random.choice(len(top_tokens_lvl1_arr), 2, replace=False)

rand_samples = list(
    itertools.chain(
        *[
            top_tokens_lvl0_arr[lvl0_rand_ids],
            top_tokens_lvl1_arr[lvl1_rand_ids]
        ]
    )
)
rand_samples_gpt_target = list(
    itertools.chain(
        *[
            gpt_output_lvl0[lvl0_rand_ids],
            gpt_output_lvl1[lvl1_rand_ids]
        ]
    )
)

In [None]:
models_list_v2 = (
    'llama3',
    'mistral',
    'aya-expanse',
    'gemma2',
    'gemma2:27b',
    'llama2',
    'mistral:7b-instruct-v0.2-q4_K_S',
)

print(*rand_samples, sep='\n')
print('\n')
print(*rand_samples_gpt_target, sep='\n')
print('\n\n')

for model in models_list_v2:
    print(model)
    
    for rand_sample in rand_samples:
        debug_prompt(SUMMARIZING_PROMPT + f"```\n{rand_sample}\n```", model=model)
    
    print('='*80)

Topic: topic_14
    Words: ['—Å–∏—Å—Ç–µ–º–Ω—ã–π', '—è–¥—Ä–æ', '–≤—ã–∑–æ–≤', '–≤–µ—Ä—Å–∏—è', '—Å–±–æ—Ä–∫–∞']
    Bigrams: ['–Ω–∞—á–∏–Ω–∞—Ç—å_–≤–µ—Ä—Å–∏—è', '–≤–µ—Ä—Å–∏—è_–≤—ã–ø—É—Å—Ç–∏—Ç—å', 'fault_injection', '–Ω–∞–∫–ª–∞–¥–Ω—ã–π_—Ä–∞—Å—Ö–æ–¥', '–≤–µ—Ä—Å–∏—è_—è–¥—Ä–æ']
    Categories: ['—Å–∏—Å—Ç–µ–º–Ω–æ–µ_–∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ', 'devops', 'agile__—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–µ_–ø—Ä–∞–∫—Ç–∏–∫–∏', '–∞–Ω–∞–ª–∏–∑_–ø—Ä–æ–≥—Ä–∞–º–º_–∏_—Å–∏—Å—Ç–µ–º', '—Å—Ç–∞—Ç–∏—á–µ—Å–∫–∏–π_–∞–Ω–∞–ª–∏–∑_–∫–æ–¥–∞']
Topic: topic_5
    Words: ['–ø–∞–∫–µ—Ç', '–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤', '–æ–±—ä–µ–∫—Ç', '—Å–æ—Å—Ç–æ—è–Ω–∏–µ', '—Ä–∞–º–∫–∞']
    Bigrams: ['–∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ_—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è', '—Ä–∞–º–∫–∞_–¥–æ–∫–ª–∞–¥', '–ø–æ–¥—Ö–æ–¥_—Ä–µ—à–µ–Ω–∏–µ', '—Å–ª–æ–∂–Ω—ã–π_–∑–∞–¥–∞—á–∞', '–ø—Ä–æ–±–ª–µ–º–∞_—è–≤–ª—è—Ç—å—Å—è']
    Categories: ['–ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ_–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ', 'linux_–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã', 'alt_linux', 'linux_–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã_–¥–ª—è_enterprise', 'ui']

2024-12-20 13:52:57,734 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_14": "–°–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –≤–µ—Ä—Å–∏–∏ —è–¥—Ä–∞"


2024-12-20 13:52:58,421 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_5": "–ü–∞–∫–µ—Ç—ã –∏ –¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux"


2024-12-20 13:52:58,959 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –∏ –∞–Ω–∞–ª–∏–∑ –¥–∞–Ω–Ω—ã—Ö –≤ –±–∞–∑–µ"


2024-12-20 13:52:59,586 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_90": "–†–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –∏ –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π —Å–∞–π—Ç"
mistral


2024-12-20 13:53:01,980 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_14": "–°–±–æ—Ä–∫–∞ —è–¥—Ä–∞ –∏ –≤–µ—Ä—Å–∏–æ–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ DevOps"


2024-12-20 13:53:02,657 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_5": "–ü–∞–∫–µ—Ç—ã –∏ —Ä–∞–º–∫–∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–≥–æ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è –≤ Linux"


2024-12-20 13:53:03,334 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –±–∞–∑—ã –≤ front-end —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ"


2024-12-20 13:53:03,954 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_90": "–ö–ª—é—á–µ–≤—ã–µ –ø—Ä–∞–∫—Ç–∏–∫–∏ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ –∫–æ–¥–∞"
aya-expanse


2024-12-20 13:53:08,179 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_14": "–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –≤–µ—Ä—Å–∏—è–º–∏ –∏ —Å–∏—Å—Ç–µ–º–Ω—ã–π –∞–Ω–∞–ª–∏–∑ —Å –∞–∫—Ü–µ–Ω—Ç–æ–º –Ω–∞ DevOps"


2024-12-20 13:53:08,992 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_5": "–ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã –≤ Linux-–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤–∞—Ö"


2024-12-20 13:53:09,683 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_4": "–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã–º–∏ –∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö –≤ –≤–µ–±-—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ"


2024-12-20 13:53:10,434 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_90": "–í–∞–∂–Ω–æ—Å—Ç—å –ø–æ–ª–Ω–æ–≥–æ –∫–æ–¥–∞ –∏ —Ä–µ—Ñ–∞–∫—Ç–æ—Ä–∏–Ω–≥–∞ –≤ Agile-–ø—Ä–∞–∫—Ç–∏–∫–∞—Ö"
gemma2


2024-12-20 13:53:14,530 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_14": "–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –∏ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —è–¥—Ä–∞ —Å–∏—Å—Ç–µ–º" 





2024-12-20 13:53:15,284 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_5": "–î–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux –∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ" 





2024-12-20 13:53:15,962 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö" 





2024-12-20 13:53:16,681 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_90": "–†–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –∏ BDD –ø—Ä–∞–∫—Ç–∏–∫–∏" 



gemma2:27b


2024-12-20 13:53:31,958 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_14": "–í–µ—Ä—Å–∏–∏ —è–¥—Ä–∞ –∏ —Å–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ" 



2024-12-20 13:53:40,811 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_5": "–î–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux –∏ –ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ü—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ" 



2024-12-20 13:53:47,920 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö" 



2024-12-20 13:53:55,574 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


"topic_90": "–†–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –∏ –ø—Ä–∞–∫—Ç–∏–∫–∏ BDD" 

llama2


2024-12-20 13:53:58,845 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Topic: topic_14
Title: "–°–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –≤–µ—Ä—Å–∏—è control"


2024-12-20 13:54:01,965 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Input:
Topic: topic_5
Words: ['–ø–∞–∫–µ—Ç', '–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤', '–æ–±—ä–µ–∫—Ç', '—Å–æ—Å—Ç–æ—è–Ω–∏–µ', '—Ä–∞–º–∫–∞']
Bigrams: ['–∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ_—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è', '—Ä–∞–º–∫–∞_–¥–æ–∫–ª–∞–¥', '–ø–æ–¥—Ö–æ–¥_—Ä–µ—à–µ–Ω–∏–µ', '—Å–ª–æ–∂–Ω—ã–π_–∑–∞–¥–∞—á–∞', '–ø—Ä–æ–±–ª–µ–º–∞_—è–≤–ª—è—Ç—å—Å—è']
Categories: ['–ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ_–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ', 'linux_–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã', 'alt_linux', 'linux_–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã_–¥–ª—è_enterprise', 'ui']

Output:
"topic_5": "–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –≤ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–º –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–∏"


2024-12-20 13:54:02,598 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Topic: topic_4
Title: "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ –±–∞–∑–µ –∏ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è—Ö"


2024-12-20 13:54:03,307 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


Topic: topic_90
Title: "–û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π —Å–∞–π—Ç –∏ —Ä–µ–∞–ª—å–Ω–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞"
mistral:7b-instruct-v0.2-q4_K_S


2024-12-20 13:54:06,339 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_14": "–°–±–æ—Ä–∫–∞ —è–¥–µ—Ä–Ω—ã—Ö –≤–µ—Ä—Å–∏–π: —Å–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ"


2024-12-20 13:54:25,060 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_5": "–ü–∞–∫–µ—Ç—ã –∏ —Ä–∞–º—ã –¥–ª—è –¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤–æ–≤ Linux"

```russian
Topic: topic_6
    Words: ['–ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π', '–∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å', '–æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è', '–ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å', '–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ']
    Bigrams: ['—Å–æ–∑–¥–∞–Ω–∏–µ_–ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è', '–≤—ã—Å–æ–∫–∞—è_–ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å', '–æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è_–ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π_–∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å', '—Ä–∞–±–æ—Ç–∞_–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ–º', '–∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω—ã–π_–∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å']
    Categories: ['ui_design', 'web_development', 'mobile_app_development', '–ø—Ä–æ–≥—Ä–∞–º–º–Ω–æ–µ_–æ–±–µ—Å–ø–µ—á–µ–Ω–∏–µ', 'performance']

Output:
"topic_6": "–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π —Å –æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–º –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–æ–º"
```

```russian
Topic: topic_7
    Words: ['–ø—Ä–æ—Ü–µ—Å—Å', '—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è', '–∏–Ω–Ω–æ–≤–∞—Ü–∏—è', '—Ä–∞–∑–≤–∏—Ç–∏–µ', '—Å–∏—Å—Ç–µ–º–∞']
    Bigrams: ['–∏–

2024-12-20 13:54:26,439 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ –±–∞–∑–∞—Ö –∏ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è—Ö"


2024-12-20 13:54:27,187 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


 "topic_90": "–í–∞–∂–Ω–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –≤ –ø—Ä–∞–∫—Ç–∏–∫–∞—Ö Agile –∏ BDD"


–§–æ—Ä–º–∞—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –≤—ã–≤–æ–¥ + –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–∏

#### **gpt-4o**
- `"topic_14": "–°–∏—Å—Ç–µ–º–Ω—ã–µ –≤—ã–∑–æ–≤—ã –∏ —è–¥—Ä–æ Linux"`
- `"topic_5": "–î–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux –∏ —Å–ª–æ–∂–Ω—ã–µ –∑–∞–¥–∞—á–∏"`
- `"topic_4": "–ë–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö –∏ –æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö"`
- `"topic_90": "–†–µ—Ñ–∞–∫—Ç–æ—Ä–∏–Ω–≥ –∏ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –∫–æ–¥–∞"`

–ë–µ–π–∑–ª–∞–π–Ω

#### **llama3**
- `"topic_14": "–°–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –≤–µ—Ä—Å–∏–∏ —è–¥—Ä–∞"`
- `"topic_5": "–ü–∞–∫–µ—Ç—ã –∏ –¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux"`
- `"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –∏ –∞–Ω–∞–ª–∏–∑ –¥–∞–Ω–Ω—ã—Ö –≤ –±–∞–∑–µ"`
- `"topic_90": "–†–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –∏ –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π —Å–∞–π—Ç"`

–ù–∞–∑–≤–∞–Ω–∏—è 3 –∏ 4 –∫—Ä–∏–≤—ã–µ -- –Ω–µ –ø–æ–¥—Ö–æ–¥–∏—Ç

#### **mistral**
- `"topic_14": "–°–±–æ—Ä–∫–∞ —è–¥—Ä–∞ –∏ –≤–µ—Ä—Å–∏–æ–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ DevOps"`
- `"topic_5": "–ü–∞–∫–µ—Ç—ã –∏ —Ä–∞–º–∫–∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–≥–æ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è –≤ Linux"`
- `"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –±–∞–∑—ã –≤ front-end —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ"`
- `"topic_90": "–ö–ª—é—á–µ–≤—ã–µ –ø—Ä–∞–∫—Ç–∏–∫–∏ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ –∫–æ–¥–∞"`

–ù–∞–∑–≤–∞–Ω–∏—è 2-4 –∫—Ä–∏–≤—ã–µ -- –Ω–µ –ø–æ–¥—Ö–æ–¥–∏—Ç

#### **aya-expanse**
- `"topic_14": "–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –≤–µ—Ä—Å–∏—è–º–∏ –∏ —Å–∏—Å—Ç–µ–º–Ω—ã–π –∞–Ω–∞–ª–∏–∑ —Å –∞–∫—Ü–µ–Ω—Ç–æ–º –Ω–∞ DevOps"`
- `"topic_5": "–ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã –≤ Linux-–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤–∞—Ö"`
- `"topic_4": "–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã–º–∏ –∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö –≤ –≤–µ–±-—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ"`
- `"topic_90": "–í–∞–∂–Ω–æ—Å—Ç—å –ø–æ–ª–Ω–æ–≥–æ –∫–æ–¥–∞ –∏ —Ä–µ—Ñ–∞–∫—Ç–æ—Ä–∏–Ω–≥–∞ –≤ Agile-–ø—Ä–∞–∫—Ç–∏–∫–∞—Ö"`

–û—á–µ–Ω—å —Ä–∞–∑–≤—ë—Ä–Ω—É—Ç–æ, –∫–ª–∞—Å—Å

#### **gemma2**
- `"topic_14": "–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –∏ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —è–¥—Ä–∞ —Å–∏—Å—Ç–µ–º"`
- `"topic_5": "–î–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux –∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ"`
- `"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö"`
- `"topic_90": "–†–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –∏ BDD –ø—Ä–∞–∫—Ç–∏–∫–∏"`

–ù–æ—Ä–º

#### **gemma2:27b**
- `"topic_14": "–í–µ—Ä—Å–∏–∏ —è–¥—Ä–∞ –∏ —Å–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ"`
- `"topic_5": "–î–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux –∏ –ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ü—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ"`
- `"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö"`
- `"topic_90": "–†–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –∏ –ø—Ä–∞–∫—Ç–∏–∫–∏ BDD"`

–ù–æ—Ä–º

#### **llama2**
- `"topic_14": "–°–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –≤–µ—Ä—Å–∏—è control"`
- `"topic_5": "–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –≤ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–º –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–∏"`
- `"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ –±–∞–∑–µ –∏ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è—Ö"`
- `"topic_90": "–û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π —Å–∞–π—Ç –∏ —Ä–µ–∞–ª—å–Ω–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞"`

–í—Å–µ –Ω–∞–∑–≤–∞–Ω–∏—è, –∫—Ä–æ–º–µ 2-–≥–æ, –∑–≤—É—á–∞—Ç –∫—Ä–∏–≤–æ -- –Ω–µ –ø–æ–¥—Ö–æ–¥–∏—Ç

#### **mistral:7b-instruct-v0.2-q4_K_S**
- `"topic_14": "–°–±–æ—Ä–∫–∞ —è–¥–µ—Ä–Ω—ã—Ö –≤–µ—Ä—Å–∏–π: —Å–∏—Å—Ç–µ–º–Ω–æ–µ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏–µ"`
- `"topic_5": "–ü–∞–∫–µ—Ç—ã –∏ —Ä–∞–º—ã –¥–ª—è –¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤–æ–≤ Linux"`
- `"topic_4": "–û–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ –±–∞–∑–∞—Ö –∏ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è—Ö"`
- `"topic_90": "–í–∞–∂–Ω–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–¥–∞ –≤ –ø—Ä–∞–∫—Ç–∏–∫–∞—Ö Agile –∏ BDD"`

–í—ã–≤–µ–ª–∞ –∫–∞–∫—É—é-—Ç–æ —Ö—Ä–µ–Ω—å –≤ –ø—Ä–æ–º–µ–∂—É—Ç–∫–µ –º–µ–∂–¥—É –æ—Ç–≤–µ—Ç–∞–º–∏, –Ω–∞–∑–≤–∞–Ω–∏—è –∑–≤—É—á–∞—Ç –∫—Ä–∏–≤–æ -- –Ω–µ –ø–æ–¥—Ö–æ–¥–∏—Ç

–ü–æ –∏—Ç–æ–≥—É, –æ—Å—Ç–∞–ª–∏—Å—å 3 –º–æ–¥–µ–ª–∏ -- `aya-expanse`, `gemma2`, `gemma2:27b`

–†–µ–∫–æ–º–µ–Ω–¥—É—é –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å `gemma2`, —Ç–∞–∫ –∫–∞–∫ –æ–Ω–∞ –ª–µ–≥–∫–æ–≤–µ—Å–Ω–∞—è + –µ—ë –Ω–∞–∑–≤–∞–Ω–∏—è –ª–∞–∫–æ–Ω–∏—á–Ω—ã –ø–æ —Å—Ä–∞–≤–Ω–µ–Ω–∏—é —Å `gemma2:27b`

### –ò—Ç–æ–≥–æ–≤—ã–π –ø–∞–π–ø–ª–∞–π–Ω

In [None]:
OLLAMA_CONFIG = {
    "keep_alive": "2m",
    "stream": False,
}

SYSTEM_PROMPT = (
    "You are an AI assistant tasked with generating precise and concise topic titles. "
    "Do not include explanations or reasoning‚Äîonly provide the final title for each topic. "
    "Write answer in Russian language, but if there are foreign terms leave them as is."
)

SUMMARIZING_PROMPT = """
    You are an AI language model designed to analyze topics defined by keywords, bigrams, and categories, and then summarize each topic into a concise, descriptive name. Use the provided information (keywords, bigrams, and categories) to infer the central theme of each topic and generate a short, meaningful title. Follow these guidelines:

    1. Focus on the most relevant and representative keywords and bigrams to identify the primary subject of the topic.
    2. Use the categories to refine and contextualize the theme.
    3. Avoid overly generic titles; aim for specificity and clarity.
    4. Ensure the title reflects the core idea that unites the keywords, bigrams, and categories.

    Here is the input format:
    ```
    Topic: [topic_name]
    Words: [list of words]
    Bigrams: [list of bigrams]
    Categories: [list of categories]
    ```

    And here is an example of the desired output:
    ```
    Input:
    Topic: topic_15
    Words: ['–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞', '–∫–æ–¥', '–ø–ª–∞—Ç—Ñ–æ—Ä–º–∞', '—Å–µ—Ä–≤–∏—Å', '—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è']
    Bigrams: ['—Ö—Ä–∞–Ω–∏–ª–∏—â–µ_–¥–∞–Ω–Ω—ã–µ', '–æ–±–ª–∞—á–Ω—ã–π_—Å–µ—Ä–≤–∏—Å', '—Ä–∞—Å–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω—ã–π_—Å–∏—Å—Ç–µ–º–∞', '–º–æ–¥—É–ª—å_—è–¥—Ä–æ', '–æ–±—Ä–∞–±–æ—Ç–∫–∞_–±–æ–ª—å—à–∏–π']
    Categories: ['–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ', '–ø—Ä–æ–≥—Ä–∞–º–º–Ω–∞—è_–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞', '—Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã–µ_—Å–∏—Å—Ç–µ–º—ã', '–≤–∏—Ä—Ç—É–∞–ª–∏–∑–∞—Ü–∏—è', 'paas']

    Output:
    "topic_15": "–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ –ø—Ä–æ–≥—Ä–∞–º–º –∏ –æ–±–ª–∞—á–Ω—ã–µ —Å–µ—Ä–≤–∏—Å—ã"

    Input:
    Topic: topic_16
    Words: ['–º–æ–¥–µ–ª—å', '—Ç–µ–∫—Å—Ç', '–∞–Ω–∞–ª–∏–∑', '–º–µ—Ç–æ–¥', '–∞–ª–≥–æ—Ä–∏—Ç–º']
    Bigrams: ['—Ä–µ–¥_–±–∞–∑–∞', '—É—á–µ–±–Ω—ã–π_–º–∞—Ç–µ—Ä–∏–∞–ª', '—Å—É–±–¥_—Ä–µ–¥', '–≤—ã—á–∏—Å–ª–∏—Ç–µ–ª—å–Ω—ã–π_–∫–æ–º–ø–ª–µ–∫—Å', '–∞–Ω–∞–ª–∏–∑_—Ç–µ–∫—Å—Ç']
    Categories: ['–∞–ª–≥–æ—Ä–∏—Ç–º—ã', '–Ω–∞—É–∫–∞', 'machine_learning', 'natural_language_processing', 'data_analysis']

    Output:
    "topic_16": "–ú–æ–¥–µ–ª–∏ –∏ –∞–ª–≥–æ—Ä–∏—Ç–º—ã –∞–Ω–∞–ª–∏–∑–∞ —Ç–µ–∫—Å—Ç–∞"

    Input:
    Topic: topic_17
    Words: ['–ø—Ä–æ–±–ª–µ–º–∞', '—Ç—Ä–µ–±–æ–≤–∞–Ω–∏–µ', '–æ—Ü–µ–Ω–∫–∞', '–∫–∞—á–µ—Å—Ç–≤–æ', '–∑–∞–∫–∞–∑—á–∏–∫']
    Bigrams: ['—É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ_—Ç—Ä–µ–±–æ–≤–∞–Ω–∏–µ', '–±–æ–ª—å—à–∏–π_–∫–æ–ª–∏—á–µ—Å—Ç–≤–æ', '—Å–æ–≤–º–µ—Å—Ç–Ω—ã–π_—Ä–∞–±–æ—Ç–∞', '–æ—Ü–µ–Ω–∫–∞_–∫–∞—á–µ—Å—Ç–≤–æ', '—Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è_–ø—Ä–æ–µ–∫—Ç']
    Categories: ['–±–∏–∑–Ω–µ—Å_–∞–Ω–∞–ª–∏–∑', '–ø—Ä–æ—Ü–µ—Å—Å_—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è', '—É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ_—Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è–º–∏', '–¥–∏–∑–∞–π–Ω', 'team_communication']

    Output:
    "topic_17": "–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è–º–∏ –∏ –¥–∏–∑–∞–π–Ω"

    Input:
    Topic: topic_18
    Words: ['—è–≤–ª—è—Ç—å—Å—è', '–ª—é–±–æ–π', '–æ–±—Ä–∞–∑', '–ø–æ–∑–≤–æ–ª—è—Ç—å', '—á–∞—Å—Ç—å']
    Bigrams: ['–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å_–∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ', '–≤—ã—Å–æ–∫–∏–π_—É—Ä–æ–≤–µ–Ω—å', '–¥–∞–Ω–Ω—ã–π_–ø–æ–¥—Ö–æ–¥', '–∫–æ–¥_–ø—Ä–æ–µ–∫—Ç', '–∞–¥—Ä–µ—Å–Ω—ã–π_–ø—Ä–æ—Å—Ç—Ä–∞–Ω—Å—Ç–≤–æ']
    Categories: ['–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ', 'java', '—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞_–æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã—Ö_—Å–∏—Å—Ç–µ–º', '–º–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–µ_–±–∏–∑–Ω–µ—Å_–ø—Ä–æ—Ü–µ—Å—Å–æ–≤', '–¥–æ–∫–ª–∞–¥_—Å–æ_—Å—Ç–µ–Ω–æ–≥—Ä–∞–º–º–æ–π']

    Output:
    "topic_18": "–ü–æ–¥—Ö–æ–¥—ã –≤ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–∏ –∏ –º–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–∏"

    Input:
    Topic: topic_19
    Words: ['–∫–æ–º–ø–∞–Ω–∏—è', '—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π', '—Ä–æ—Å—Å–∏–π—Å–∫–∏–π', '—Ä–∞–∑–≤–∏—Ç–∏–µ', '—Ä–æ—Å—Å–∏—è']
    Bigrams: ['–∫–æ–º–∞–Ω–¥–∞_—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫', '–∏—Ç_–∫–æ–º–ø–∞–Ω–∏—è', '—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π_–ø–∏—Å–∞—Ç–µ–ª—å', '—Å–≤–æ–π_–ø—Ä–æ–µ–∫—Ç', '—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π_–¥–æ–ª–≥']
    Categories: ['agile_process', '–≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–æ_–∏_—Å–æ—Ñ—Ç', '–¥–æ–∫—É–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ', '—Å–ø–æ_–≤_—Ä–æ—Å—Å–∏–∏', '–ø–ª–∞–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ']

    Output:
    "topic_19": "–¢–µ—Ö–Ω–∏—á–µ—Å–∫–∏–µ –ø—Ä–æ–µ–∫—Ç—ã –≤ –†–æ—Å—Å–∏–∏"
    ```

    Now, process the following topics and generate titles for each:
"""


logging.disable(logging.INFO)

def execute_prompt(prompt, endpoint='http://127.0.0.1:11434/api/generate', model='gemma2', **kwargs):
    text = re.compile(r'#[^\n]*').sub('', prompt).strip()
    response = run_prompt(endpoint, text, model, **kwargs)
    response_content = response.json().get('response', 'No response found.')
    return response_content

In [None]:
lvl0_titles = []
lvl1_titles = []

for sample in tqdm.tqdm(top_tokens_lvl0_arr):
    outp = execute_prompt(SUMMARIZING_PROMPT + f"```\n{sample}\n```", model="gemma2")
    lvl0_titles.append(outp)

for sample in tqdm.tqdm(top_tokens_lvl1_arr):
    outp = execute_prompt(SUMMARIZING_PROMPT + f"```\n{sample}\n```", model="gemma2")
    lvl1_titles.append(outp)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Postprocessing

In [None]:
def postprocess_topic_name(topic_name):
    re_pattern = re.compile(r'"topic_\d+":\s*"([^"]+)"')
    return re.match(re_pattern, topic_name).groups()[0]


lvl0_titles_processed = list(map(postprocess_topic_name, lvl0_titles))
lvl1_titles_processed = list(map(postprocess_topic_name, lvl1_titles))

In [None]:
with open("llm_output/gemma2/levels.json", "w") as fd:
    default_topic_names_lvl0 = [f"topic_{n}" for n in range(len(lvl0_titles_processed))]
    default_topic_names_lvl1 = [f"topic_{n}" for n in range(len(lvl1_titles_processed))]

    json.dump(
        {
            'level0': dict(zip(default_topic_names_lvl0, lvl0_titles_processed)),
            'level1': dict(zip(default_topic_names_lvl1, lvl1_titles_processed)),
        },
        fd
    )

with open("llm_output/gemma2/levels.json", "r") as fd:
    generated_topics = json.load(fd)

generated_topics

{'level0': {'topic_0': 'Agile —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –≤ IT –∫–æ–º–ø–∞–Ω–∏—è—Ö',
  'topic_1': '–í–æ–∑–º–æ–∂–Ω–æ—Å—Ç–∏ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è Linux –¥–ª—è –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è',
  'topic_2': '–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –æ—Ç–∫—Ä—ã—Ç—ã—Ö –ª–∏—Ü–µ–Ω–∑–∏–π –≤ IT',
  'topic_3': '–ê–≤—Ç–æ–º–∞—Ç–∏–∑–∞—Ü–∏—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è –∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω–æ–π –±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç–∏',
  'topic_4': '–°–≤–æ–±–æ–¥–Ω–æ–µ –ü–û –≤ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–∏ –∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω—ã—Ö —Å–∏—Å—Ç–µ–º–∞—Ö',
  'topic_5': '–î–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã Linux –∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
  'topic_6': '–°–ø–µ—Ü–∏–∞–ª–∏—Å—Ç—ã –∏ –¥–∏—Å–∫—É—Å—Å–∏–∏ –≤ —Å—Ñ–µ—Ä–µ IT',
  'topic_7': '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –ø—Ä–æ–µ–∫—Ç–∞–º–∏ –∏ –ø—Ä–æ–¥—É–∫—Ç–∞–º–∏ –≤ –∫–æ–º–ø–∞–Ω–∏–∏',
  'topic_8': '–ü—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–±–∏–ª—å–Ω—ã—Ö –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–æ–≤',
  'topic_9': '–£–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –±–∏–∑–Ω–µ—Å-–ø—Ä–æ—Ü–µ—Å—Å–∞–º–∏ –∏ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–

## Few-shot classification with gemma2

In [None]:
GEMMA_CLASSIFICATION_PROMPT_PREFIX = f"""<system_prompt>
YOU ARE AN EXPERT IN MULTICLASS MULTILABEL HIERARCHICAL TEXT CLASSIFICATION FOR IT-RELATED CONTENT. YOUR TASK IS TO ANALYZE TEXTS FROM IT CONFERENCES (IN RUSSIAN AND ENGLISH) OF VARYING LENGTH (FROM SHORT TITLES TO DETAILED ARTICLES) AND ASSIGN THE MOST RELEVANT HIERARCHICAL CATEGORIES BASED ON PROVIDED LABELS AND THEIR RELATIONSHIPS.

### INSTRUCTIONS ###

1. **READ AND UNDERSTAND THE TEXT**: 
   - IDENTIFY the key topics, themes, and context of the text.
   - DISTINGUISH between the main focus and supplementary details.

2. **USE THE PROVIDED HIERARCHICAL CATEGORY STRUCTURE**:
   - ALWAYS ASSIGN CATEGORIES from the provided list.
   - IF THE TEXT IS SHORT, prioritize HIGH-LEVEL and GENERAL categories.
   - IF THE TEXT IS LONG, assign DETAILED, LOW-LEVEL, and SPECIFIC categories.

3. **MAINTAIN CONSISTENCY**:
   - SELECT categories based on the text content and not assumptions.
   - ENSURE the categories are logically consistent with the hierarchy.

4. **OUTPUT FORMAT**:
   - RETURN ONLY a Python-style list of categories (e.g., `['Category1', 'Category2']`).
   - DO NOT include explanations, reasoning, or logs in the response.

5. **CONSIDER CONTEXTUAL RELEVANCE**:
   - ACCOUNT for both linguistic (Russian/English) and domain-specific terminology.
   - INCLUDE categories related to both technical and thematic aspects of the text.

6. **PRIORITIZATION RULES**:
   - PRIORITIZE categories that capture the technical or thematic core of the text.
   - AVOID unrelated or overly broad categories unless the text explicitly requires them.
   - DO NOT ASSIGN categories without clear support in the text.
   - DO NOT INCLUDE reasoning, analysis, or commentary in the output.

### CHAIN OF THOUGHTS ###

1. **UNDERSTAND THE TEXT**:
   - READ the text carefully to grasp the main idea.
   - IDENTIFY specific keywords and phrases pointing to IT-related topics.

2. **DETERMINE CATEGORY SCOPE**:
   - CLASSIFY short texts with a focus on overarching themes or event types.
   - CLASSIFY long texts with a focus on detailed, specific technical topics.

3. **FILTER CATEGORIES**:
   - USE a subset of categories based on the keywords and context in the text.
   - PRIORITIZE specific and detailed categories for technical texts.

4. **ASSIGN RELEVANT CATEGORIES**:
   - CHOOSE categories with direct support in the text.
   - IF THE TEXT IS AMBIGUOUS OR GENERAL, assign higher-level categories.

5. **VALIDATE SELECTIONS**:
   - CHECK for logical consistency.
   - REMOVE any categories that are redundant or insufficiently supported.

6. **FORMAT THE OUTPUT**:
   - STRUCTURE the list clearly in Python list format.
   - INCLUDE all directly relevant categories.

7. **REVIEW AND FINALIZE**:
   - ENSURE all chosen categories align with the hierarchy and text content.
   - VERIFY the list is complete and free of errors.

### CATEGORIES ###
{list(ALL_CATEGORIES_WITHOUT_NAMES)}

### EXAMPLES ###

#### Input Sample:
3 –ø—Ä–∞–∫—Ç–∏–∫–∏ –∫—Ä–µ–∞—Ç–∏–≤–Ω–æ–≥–æ –º—ã—à–ª–µ–Ω–∏—è –≤ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏—Ö –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–æ–≤ (–í–ª–∞–¥–∏–º–∏—Ä –ó–∞–≤–µ—Ä—Ç–∞–π–ª–æ–≤, SECR-2013) <sep> –í –¥–æ–∫–ª–∞–¥–µ —Ä–∞—Å—Å–º–æ—Ç—Ä–µ–Ω—ã –∫—Ä–µ–∞—Ç–∏–≤–Ω—ã–µ –º–µ—Ç–æ–¥–∏–∫–∏ ¬´–í–∏–∑—É–∞–ª—å–Ω—ã–π –±—Ä–∏—Ñ¬ª, ¬´–ú–µ—Ç–æ–¥ –£–æ–ª—Ç–∞ –î–∏—Å–Ω–µ—è¬ª –∏ ¬´SCAMPER¬ª, –∏ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —ç—Ç–∏—Ö –º–µ—Ç–æ–¥–∏–∫ –¥–ª—è —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–æ–≤ –∏ –¥–∏–∑–∞–π–Ω–∞ –ø—Ä–æ–≥—Ä–∞–º–º–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ (web-—Å–∏—Å—Ç–µ–º –∏ –º–æ–±–∏–ª—å–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π).

#### Output:
['UX –ø—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ', '–ü—Ä–æ—Ç–æ—Ç–∏–ø–∏—Ä–æ–≤–∞–Ω–∏–µ UI']

---

#### Input Sample:
Design of PyTest-based Framework for Modules that Use RabbitMQ and REST (–ù–∏–∫–∏—Ç–∞ –ö–æ–≤—Ä–∏–≥–∏–Ω, SECR-2017) <sep> How to achieve fast test development process and make it easy for QA engineers to write tests and don‚Äôt think about difficulties? How to save our time and resources if test objects are ‚Äúbig‚Äù and ‚Äúcomplex‚Äù using pytest? We have our own experience and want to show you how we solved it.

#### Output:
['Talks in English', '–ê–≤—Ç–æ–º–∞—Ç–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–æ–µ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ']

---

#### Input Sample:
–ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ –ø–æ–ø—É–ª—è—Ä–Ω—ã—Ö –ø—Ä–æ—Ç–æ–∫–æ–ª–æ–≤ –∏ —Å–≤–æ–±–æ–¥–Ω–æ–≥–æ –ü–û –≤ —É–ø—Ä–∞–≤–ª–µ–Ω–∏–∏ –º–æ–±–∏–ª—å–Ω—ã–º —Ä–æ–±–æ—Ç–æ–º (–ê–Ω–¥—Ä–µ–π –î—É–Ω–µ—Ü, LVEE-2014) <sep> We consider the problems of telemetry, positioning and control of the mobile robot for monitoring of rivers and lakes. The project used Bluetooth technology for the transmission of telemetry data. RTKLIB library implementation of Real-Time Kinematic algorithm used for positioning. The current version of the control system is based on the Ardupilot project.

#### Output:
['Arduino', '–†–æ–±–æ—Ç–æ—Ç–µ—Ö–Ω–∏–∫–∞']

---

### WHAT NOT TO DO ###

- **DO NOT** ASSIGN CATEGORIES OUTSIDE THE PROVIDED LIST.
- **DO NOT** ASSIGN CATEGORIES WITHOUT CLEAR TEXTUAL SUPPORT.
- **DO NOT** DUPLICATE HIGH-LEVEL AND LOW-LEVEL CATEGORIES IN THE SAME LIST (E.G., 'AGILE' AND 'AGILE-–ö–£–õ–¨–¢–£–†–ê').
- **DO NOT** FAIL TO ADAPT CATEGORY SELECTION BASED ON TEXT LENGTH AND DETAIL.

### FINAL OUTPUT FORMAT ###

- Python-style list of categories (e.g., `['Category1', 'Category2']`).
- INCLUDE only categories directly relevant to the content.

### USER INPUT ###
- TEXT: 
"""

GEMMA_CLASSIFICATION_PROMPT_SUFFIX = """
### OUTPUT ###
- Python-style list of categories only.

</system_prompt>
"""

In [None]:
np.random.seed(123)
rand_ids = np.random.choice(len(processed_data), 4, replace=False)
rand_text_samples = [elem['text'] for elem in np.array(processed_data)[rand_ids]]
rand_targets = [elem['categories'] for elem in np.array(processed_data)[rand_ids]]

In [None]:
for rand_text, rand_trg in zip(rand_text_samples, rand_targets):
    print(rand_text[:100])
    print('Generated target')
    debug_prompt(GEMMA_CLASSIFICATION_PROMPT_PREFIX + "\"" + rand_text[:5000] + "\"" + GEMMA_CLASSIFICATION_PROMPT_SUFFIX, model='gemma2')
    print('True target')
    print(rand_trg)
    print('='*80)

–ê–ª—å—Ç–µ—Ä–Ω–∞—Ç–∏–≤–∞ –≥–æ—Ä–±–∞—Ç–æ–º—É RPM ‚Äî –¥–≤—É–≥–æ—Ä–±—ã–π RPM (–ì–ª–µ–± –§–æ—Ç–µ–Ω–≥–∞—É—ç—Ä-–ú–∞–ª–∏–Ω–æ–≤—Å–∫–∏–π, OSSDEVCONF-2016) <sep> –ò—Å–ø–æ
Generated target


2024-12-24 20:12:24,857 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


['Linux', 'Software Development'] 

True target
['Linux-–¥–∏—Å—Ç—Ä–∏–±—É—Ç–∏–≤—ã']
Mine Your Own Business ‚Äî Using Process Mining to Turn Big Data into Real Value (Wil van der Aalst, S
Generated target


2024-12-24 20:12:26,943 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


```python
['Process Mining', 'Business Process Management (BPM)', 'Workflow Management (WfM)', 'Data Mining (DM)', 'Business Intelligence (BI)', 'Machine Learning (ML)', 'Big Data'] 
```
True target
['–ú–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏–µ –±–∏–∑–Ω–µ—Å-–ø—Ä–æ—Ü–µ—Å—Å–æ–≤']
–î–∏–∑–∞–π–Ω –≤ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ (–†–æ–º–∞–Ω –ö–≤–∞—Ä—Ç–∞–ª—å–Ω–æ–≤, SECR-2015) <sep> –ò–∑ –¥–æ–∫–ª–∞–¥–∞ –≤—ã —É–∑–Ω–∞–µ—Ç–µ, –∫–∞–∫ –º–æ–∂–Ω–æ —Å–æ–∫—Ä–∞—Ç–∏—Ç—å 
Generated target


2024-12-24 20:12:28,055 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


['–î–∏–∑–∞–π–Ω', 'UX –ø—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ'] 

True target
['–ü—Ä–æ—Ü–µ—Å—Å —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ UX –∏ UI']
–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ —Å–∏—Å—Ç–µ–º—ã –≤—ã–¥–µ–ª–µ–Ω–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è –∑–∞–¥–∞—á –ø–æ–∏—Å–∫–∞ –ø–æ –∏—Å—Ö–æ–¥–Ω–æ–º—É —Ç–µ—Å—Ç—É (–ê–ª–µ–∫—Å–µ–π –ü—É—Å—Ç—ã–≥–∏–Ω, OS
Generated target


2024-12-24 20:12:29,205 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


```python
['method', 'class']
```
True target
['Open-source', '–û–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ']
