# Голосовые модели

In [4]:
#!{sys.executable} -m pip install TTS[gpu]
#!{sys.executable} -m pip install git+https://github.com/salute-developers/GigaAM.git
#!{sys.executable} -m pip install datasets
#!{sys.executable} -m pip install qdrant_client
#!{sys.executable} -m pip install gdown
#!{sys.executable} -m pip install pydub
#!{sys.executable} -m pip install ruaccent
#!{sys.executable} -m pip install convert

In [6]:
import gdown
import numpy as np
import uuid
from tqdm import tqdm
import os
import torch

In [7]:
#!wget https://github.com/markovka17/dla/raw/refs/heads/2024/week02/testVoiceTrack.wav

In [None]:
from TTS.api import TTS

# Instantiate the TTS model named by `model_name` and place it on the GPU
# when CUDA is available, otherwise on the CPU.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tts = TTS(model_name=model_name).to(device)

In [None]:
import gigaam
from gigaam.onnx_utils import load_onnx_sessions, transcribe_sample
from tqdm import tqdm

# Directory passed to the ONNX export/load calls below, and the GigaAM model
# variant to load.
onnx_dir = "onnx"
model_type = "ctc"

# Load the speech-to-text model, export it to ONNX under `onnx_dir`, then open
# the sessions that `transcribe_sample` is later called with.
stt = gigaam.load_model(model_type, fp16_encoder=True, use_flash=False)
stt.to_onnx(dir_path=onnx_dir)
sessions = load_onnx_sessions(onnx_dir, model_type)

# Языковая модель

### Загрузка T-Lite

In [10]:
import sys
import torch
from tqdm import tqdm
import uuid

In [11]:
#!{sys.executable} -m pip install vllm
#!{sys.executable} -m pip install accelerate==0.26.0
#!{sys.executable} -m pip install sentence_transformers
#!{sys.executable} -m pip install langchain
#!{sys.executable} -m pip install langchain_community

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Fix torch's global RNG seed before the model is created and used.
torch.manual_seed(42)


model_name = "t-tech/T-lite-it-1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keep the LLM on the second GPU when one exists (the TTS and embedding models
# in this notebook are moved to the default CUDA device). On single-GPU or
# CPU-only hosts "cuda:1" does not exist, so fall back instead of crashing.
if torch.cuda.device_count() > 1:
    llm_device = "cuda:1"
elif torch.cuda.is_available():
    llm_device = "cuda:0"
else:
    llm_device = "cpu"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=llm_device,
)

In [13]:
def answer_llm(query, max_new_tokens=512):
    """Answer `query` with the chat LLM, without any retrieved context.

    Args:
        query: User question, sent as the single user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model reply with the prompt tokens and special tokens
        removed.
    """
    messages = [
        {"role": "system", "content": "Ты T-lite, виртуальный ассистент в Т-Технологии. Твоя задача - быть полезным диалоговым ассистентом."},
        {"role": "user", "content": query},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Honour the caller's limit (it used to be ignored in favour of a
    # hard-coded 512).
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
    )
    # `generate` returns prompt + completion; keep only the completion.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

### Загрузка датасета и формирование векторной базы знаний

In [14]:
from datasets import load_dataset

def process_dataset(sample):
    """Replace non-breaking spaces in `sample['content']` with plain spaces.

    The sample is modified in place and the same object is returned.
    """
    cleaned = sample['content'].replace('\xa0', ' ')
    sample['content'] = cleaned
    return sample

# Take the train split of the Kinopoisk dataset and normalise the whitespace
# of every row's `content` field.
dataset = load_dataset("blinoff/kinopoisk")['train'].map(process_dataset)

In [15]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model with float16 weights and move it to the
# GPU when CUDA is available, otherwise to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    model_kwargs={'torch_dtype': torch.float16},
).to(device)

In [None]:
from qdrant_client import QdrantClient, models

# Qdrant client created with the ":memory:" location.
# NOTE(review): this is presumably qdrant-client's in-process mode, so the
# collection would not survive a kernel restart — confirm.
client = QdrantClient(":memory:")

# Collection that stores the review-chunk embeddings, compared by cosine
# distance.
client.create_collection(
    collection_name="kinopoisk_e5",
    on_disk_payload=True,
    vectors_config=models.VectorParams(
        # Vector length for this collection; presumably must match the
        # embedding model's output size — TODO confirm.
        size=1024,
        distance=models.Distance.COSINE,
        on_disk=True
    ),
)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each review into chunks of up to 1000 characters with a 100-character
# overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# One (dataset row index, chunk text) pair per chunk, in dataset order.
text_chunks_numbered = [
    (row_idx, piece)
    for row_idx in tqdm(range(len(dataset)))
    for piece in text_splitter.split_text(dataset[row_idx]['content'])
]

# Parallel tuples: numbers[k] is the dataset row that text_chunks[k] came from.
numbers, text_chunks = zip(*text_chunks_numbered)

In [18]:
#import gdown

#file_id = "17_hezj94BeFqFcD26W8fNgnSKE_KMH9c"
#gdown.download(f"https://drive.google.com/uc?export=download&id={file_id}", "vectors.npy", quiet=False)

In [19]:
import numpy as np

import os

# Reuse cached chunk embeddings when the file exists; otherwise compute them
# once with the embedding model and cache them, so a fresh run does not crash
# on a missing 'vectors.npy'.
# NOTE(review): a cached file is assumed to be row-aligned with `text_chunks`
# — re-create it after changing the dataset or the splitter settings.
vectors_path = 'vectors.npy'

if os.path.exists(vectors_path):
    vectors = np.load(vectors_path)
else:
    vectors = embedding_model.encode(
        list(text_chunks),
        batch_size=32,
        device=device,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
    np.save(vectors_path, vectors)

# The upsert code below works with plain Python lists of floats.
vectors = vectors.tolist()

In [None]:
# Store every chunk as its own point, keyed by a random UUID, with the chunk
# text and the film's name/year as payload.
for i, vector in enumerate(tqdm(vectors)):
    # NOTE(review): the slicing assumes titles end with " (YYYY)" — confirm
    # against the dataset.
    title = dataset[numbers[i]]['movie_name']
    point = models.PointStruct(
        id=str(uuid.uuid4()),
        vector=vector,
        payload={
            'text': text_chunks[i],
            'movie_name': title[:-7],
            'year': int(title[-5:-1]),
        },
    )
    client.upsert(collection_name='kinopoisk_e5', points=[point])

In [21]:
def semantic_search(client, query, limit=10):
    """Return the payloads of the stored chunks closest to `query`.

    Args:
        client: Qdrant client holding the "kinopoisk_e5" collection.
        query: Text to embed and search with.
        limit: Maximum number of hits to request.

    Returns:
        A list with the `payload` of every hit, in the order the client
        returned them.
    """
    embedding = embedding_model.encode(query, normalize_embeddings=True, device=device)
    results = client.search(
        collection_name="kinopoisk_e5",
        query_vector=embedding.tolist(),
        limit=limit,
    )

    payloads = []
    for result in results:
        payloads.append(result.payload)
    return payloads

In [49]:
def answer_llm_with_context(query, context, max_new_tokens=256):
    """Ask the chat LLM to answer `query` using the supplied review context.

    Args:
        query: User question.
        context: Text of the retrieved reviews, placed before the question in
            the user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model reply with the prompt tokens and special tokens
        removed.
    """
    system_message = {"role": "system", "content": "Ты T-lite, виртуальный ассистент в Т-Технологии.  все что ты генерируешь - переозвучивается в речь, потому используй исключительно диалоговый вывод.Твоя задача - быть полезным диалоговым ассистентом. Пожалуйста, представь, что ты диктор, который будет зачитывать следующие данные. Сделай выводы формальными, избегая использования знаков препинания, не относящихся к речи, например кавычек или двоеточий, а также указывай только год выпуска фильма без лишних описаний. Например: 'Однажды в Америке 1984 года выпуска "}
    user_message = {"role": "user", "content": f"Отзывы:\n{context}\n\nВопрос:\n{query}"}

    chat_text = tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer([chat_text], return_tensors="pt").to(model.device)
    full_sequences = model.generate(**encoded, max_new_tokens=max_new_tokens)

    # Each generated sequence starts with the prompt tokens; drop them.
    completions = []
    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
        completions.append(sequence[len(prompt_ids):])

    return tokenizer.batch_decode(completions, skip_special_tokens=True)[0]


In [23]:
def predict(query):
    """Answer `query` from the chunks returned by plain semantic search.

    Note: a later cell of this notebook defines another `predict`, which
    replaces this one once it runs.

    Returns:
        The LLM answer as a string.
    """
    formatted = []
    for chunk in semantic_search(client, query):
        formatted.append(f"Название: {chunk['movie_name']}. Отзыв: {chunk['text']}")

    return answer_llm_with_context(query, ' ; '.join(formatted))

### Reranker

In [24]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
# Cross-encoder used to re-score (query, text) pairs; it runs on the GPU when
# CUDA is available, otherwise on the CPU.
cross_encoder = HuggingFaceCrossEncoder(
    model_name='BAAI/bge-reranker-v2-m3',
    model_kwargs=dict(device='cuda' if torch.cuda.is_available() else 'cpu'),
)

In [25]:
def predict(query, max_new_tokens=256):
    """Answer `query` from reranked search results.

    Fetches 50 candidates by semantic search, scores each against the query
    with the cross-encoder and keeps the ten highest-scoring texts.

    Returns:
        A `(answer, context)` tuple: the LLM answer and the context string
        that was given to it.
    """
    candidates = semantic_search(client, query, limit=50)

    texts = []
    for candidate in candidates:
        texts.append(f"Название: {candidate['movie_name']}. Отзыв: {candidate['text']}")

    pairs = [(query, text) for text in texts]
    scores = cross_encoder.score(pairs)

    # argsort is ascending, so the last ten indices are the best-scored texts,
    # ordered from lower to higher score.
    top_idxs = np.argsort(list(scores))[-10:]
    context = ' ; '.join(texts[i] for i in top_idxs)

    return answer_llm_with_context(query, context, max_new_tokens), context

### Перефразирование вопроса

In [26]:
def rephrase_query(query, n=3, max_new_tokens=256):
    """Ask the chat LLM for `n` alternative phrasings of `query`.

    Args:
        query: Original user question.
        n: Number of variations requested in the system prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A list of non-empty, whitespace-stripped variations, one per line of
        the model reply. If the reply contains no usable line, the list holds
        just the original `query`, so callers always have something to search
        with.
    """
    system_prompt = f"""Твоя задача написать {n} разных вариаций вопроса пользователя для того,
чтобы по ним получить релевантные документы из векторной базы данных.
Ты должен переформулировать вопрос с разных точек зрения.
Это поможет избавить пользователя от недостатков поиска похожих документов на основе расстояния.
Вопрос пользователя сфокусирован на теме кино.
Напиши ТОЛЬКО вариации вопроса и больше ничего, разделяя их символом новой строки \\n.
НЕ пиши ответ на сам вопрос."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
    )

    # `generate` returns prompt + completion; keep only the completion.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Blank lines (e.g. from a trailing newline or double line breaks) would
    # otherwise be passed on as empty search queries.
    variations = [line.strip() for line in response.split("\n") if line.strip()]

    return variations or [query]


In [27]:
def predict_with_rephrase(query, max_new_tokens=256):
    """Answer `query` using chunks retrieved for several rephrasings of it.

    Each rephrased query fetches five chunks; the distinct formatted texts are
    scored against the original query with the cross-encoder and the ten
    highest-scoring ones form the context.

    Returns:
        An `(answer, context, queries)` tuple: the LLM answer, the context
        string given to it, and the list of rephrased queries.
    """
    queries = rephrase_query(query, n=3)
    print(queries)

    all_chunks = []
    for rephrased_query in queries:
        all_chunks.extend(semantic_search(client, rephrased_query, limit=5))

    texts = [f"Название: {chunk['movie_name']}. Отзыв: {chunk['text']}" for chunk in all_chunks]

    # Different rephrasings often return the same chunk. De-duplicate while
    # keeping order, and score / index the SAME list: the scores used to be
    # computed over np.unique(...) (sorted, de-duplicated) while the indices
    # were applied to the original list, selecting the wrong texts.
    unique_texts = list(dict.fromkeys(texts))

    # Skip the scorer when nothing was retrieved.
    scores = cross_encoder.score([(query, text) for text in unique_texts]) if unique_texts else []

    # argsort is ascending, so the last ten indices are the best-scored texts.
    idxs = np.argsort(list(scores))[-10:]
    context = ' ; '.join(unique_texts[i] for i in idxs)

    return answer_llm_with_context(query, context, max_new_tokens), context, queries

# Модель киноассистент

### Функции для работы с ассистентом

In [28]:
from IPython.display import Audio

def chat_with_audio_input(query_path, max_new_tokens=512, with_rephrase=False, answer_file_path="output.wav",
                          speaker_wav="speaker_ksushka.wav", language="ru"):
    """Answer a spoken question and synthesize the answer to an audio file.

    Args:
        query_path: Path of the audio file with the recorded question.
        max_new_tokens: Generation limit forwarded to the answering function.
        with_rephrase: When truthy, answer via `predict_with_rephrase`;
            otherwise via `predict`.
        answer_file_path: Where the synthesized answer is written.
        speaker_wav: Reference recording passed to the TTS call as the speaker.
        language: Language code passed to the TTS call.

    Returns:
        `answer_file_path`.
    """
    query = transcribe_sample(query_path, model_type, sessions)

    # Only the answer text is needed here; the context/queries are discarded.
    if with_rephrase:
        answer, _, _ = predict_with_rephrase(query, max_new_tokens)
    else:
        answer, _ = predict(query, max_new_tokens)

    tts.tts_to_file(
        text=answer,
        file_path=answer_file_path,
        speaker_wav=speaker_wav,
        language=language,
    )
    return answer_file_path

In [29]:
from IPython.display import Audio

def chat_without_audio_input(query, max_new_tokens=512, with_rephrase=False, answer_file_path="output.wav",
                             speaker_wav="speaker_ksushka.wav", language="ru"):
    """Answer a text question and synthesize the answer to an audio file.

    Args:
        query: The question as text.
        max_new_tokens: Generation limit forwarded to the answering function.
        with_rephrase: When truthy, answer via `predict_with_rephrase`;
            otherwise via `predict`.
        answer_file_path: Where the synthesized answer is written.
        speaker_wav: Reference recording passed to the TTS call as the speaker.
        language: Language code passed to the TTS call.

    Returns:
        `answer_file_path`.
    """
    # Only the answer text is needed here; the context/queries are discarded.
    if with_rephrase:
        answer, _, _ = predict_with_rephrase(query, max_new_tokens)
    else:
        answer, _ = predict(query, max_new_tokens)

    tts.tts_to_file(
        text=answer,
        file_path=answer_file_path,
        speaker_wav=speaker_wav,
        language=language,
    )
    return answer_file_path

### Пример использования ассистента

In [None]:
# Run the voice pipeline on a recorded question with query rephrasing enabled;
# the cell's value is the path of the synthesized answer file.
chat_with_audio_input('query_rec_film.wav', max_new_tokens=512, with_rephrase = True, answer_file_path='rephrase.wav')

In [52]:
# Display the synthesized answer as the cell output.
Audio('rephrase.wav')

In [107]:
#1