In [1]:
# !pip install sentence-transformers
# !pip install langchain

In [2]:
import numpy as np
import pandas as pd
import os
import docx2txt
import glob
import re
import requests

from typing import List, Tuple, Union
from sentence_transformers import SentenceTransformer, util
import torch

from langchain.text_splitter import CharacterTextSplitter

  from tqdm.autonotebook import tqdm, trange





In [3]:
# Folder that holds the source Word documents.
input_path = "to_LLM"
# Sub-folder of input_path that receives the extracted .txt files. Built with
# os.path.join so the separator matches the current OS instead of being
# hard-coded to a backslash.
output_path = os.path.join(input_path, "txt")

In [4]:

# Convert every Word document found in input_path into a UTF-8 text file in output_path.

# Create the target folder up front; opening the output file fails if it is missing.
os.makedirs(output_path, exist_ok=True)

# NOTE(review): the pattern also matches legacy ".doc" files; docx2txt is
# presumably limited to the zip-based .docx format - confirm and narrow if needed.
directory = glob.glob(os.path.join(input_path, '*.doc*'))

for file_name in directory:
    # Derive the output name from the file's own base name rather than from
    # fixed slice offsets, so it works for any input folder and any extension length.
    base_name = os.path.splitext(os.path.basename(file_name))[0]
    txt_path = os.path.join(output_path, base_name + '.txt')

    # Extract the text first so a failed conversion does not leave an empty .txt behind.
    with open(file_name, 'rb') as infile:
        doc = docx2txt.process(infile)

    with open(txt_path, 'w', encoding='utf-8') as outfile:
        outfile.write(doc.strip())

print("=========")
print("All done!")

All done!


In [5]:
class Encoder:
    """
    Thin wrapper around a SentenceTransformer model that turns text into embeddings.

    Attributes:
        model (SentenceTransformer): The loaded model used for generating embeddings.
    """
    def __init__(self, model_name: str = 'Alibaba-NLP/gte-multilingual-base', use_gpu: bool = False):
        """
        Loads the SentenceTransformer model with the requested device configuration.

        Args:
            model_name (str): The name of the model to load, e.g.
                "cointegrated/rubert-tiny2" or "Alibaba-NLP/gte-multilingual-base".
            use_gpu (bool): When True the model is requested on the "cuda" device.
                When False no device is passed, which keeps the library's own
                device selection (the behaviour before this flag was honoured).

        Raises:
            ValueError: If the model name is not a non-empty string.
            RuntimeError: If the model fails to load.
        """
        # The isinstance check makes None / non-string names raise the documented
        # ValueError instead of an AttributeError from .strip().
        if not isinstance(model_name, str) or not model_name.strip():
            raise ValueError('Model name cannot be empty.')

        device = 'cuda' if use_gpu else None

        try:
            # NOTE(review): trust_remote_code=True allows the model repository to run
            # its own Python code while loading - only use model names you trust.
            self.model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
        except Exception as e:
            # "from e" keeps the original traceback attached to the RuntimeError.
            raise RuntimeError(f"Failed to load '{model_name}'. Error: {e}") from e


    def encode(self, data: Union[List[str], str]) -> torch.Tensor:
        """
        Encodes one text or a list of texts into embeddings.

        Args:
            data (Union[List[str], str]): The list of texts or a single text to encode.

        Returns:
            torch.Tensor: The embeddings, requested from the model with convert_to_tensor=True.

        Raises:
            RuntimeError: If encoding fails.
        """
        try:
            return self.model.encode(data, convert_to_tensor=True)
        except Exception as e:
            raise RuntimeError(f"Error encoding data: {e}") from e

In [6]:
class RAG:
    """
    Retrieval-Augmented Generation (RAG) pipeline: splits text files into chunks,
    embeds the chunks, retrieves the chunks most similar to a query and asks a
    remote text-generation endpoint to answer from them.

    Attributes:
        documents: List of {'filename': ..., 'chunk_content': ...} dicts, or None
            until splitter() or fit() has been called.
        doc_embeddings: Embeddings of the chunks (one row per chunk), or None
            until fit() has been called or the attribute is assigned directly.
        encoder (Encoder): The encoder used for documents and queries.
    """
    def __init__(self, encoder: "Encoder"):
        """
        Initializes the RAG class with the given encoder.

        Args:
            encoder (Encoder): The encoder to be used for encoding documents and queries.

        Raises:
            ValueError: If the encoder is not an instance of Encoder.
        """
        if not isinstance(encoder, Encoder):
            raise ValueError("The encoder must be an instance of Encoder.")

        self.documents = None
        self.doc_embeddings = None
        self.encoder = encoder

    def splitter(self, documents_path: str, chunk_size: int = 8192, chunk_overlap: int = 1024):
        """
        Reads every ".txt" file below documents_path, cleans it and splits it into chunks.

        Args:
            documents_path (str): Root folder that is walked recursively.
            chunk_size (int): Chunk size passed to CharacterTextSplitter. Default is 8192.
            chunk_overlap (int): Chunk overlap passed to CharacterTextSplitter. Default is 1024.

        Returns:
            list: One {'filename': path, 'chunk_content': text} dict per chunk. The
            same list is stored in self.documents.
        """
        text_splitter = CharacterTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        documents = []
        for root, _, files in os.walk(documents_path, topdown=False):
            for file in files:
                if not file.lower().endswith(".txt"):
                    continue
                name = os.path.join(root, file)
                with open(name, "r", encoding="utf-8") as f:
                    file_content = f.read()
                # Keep only Latin/Cyrillic letters, digits and whitespace.
                # NOTE(review): the а-я / А-Я ranges do not include ё/Ё, so those letters
                # are stripped together with punctuation. Left unchanged because altering
                # the cleaned text would shift chunk boundaries relative to embeddings
                # already cached on disk - confirm before widening the character class.
                file_content = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]+', '', file_content.strip())
                # Collapse newlines/tabs so the splitter's " " separator sees one line.
                file_content = re.sub(r'[\n\t]+', ' ', file_content).strip()
                for chunk in text_splitter.split_text(file_content):
                    documents.append({'filename': name, "chunk_content": chunk})
        self.documents = documents
        return documents

    def fit(self, documents):
        """
        Calculates embeddings for the provided chunks and stores chunks and embeddings together.

        Args:
            documents (List[dict]): Chunks as produced by splitter(); every item must
                have a non-blank 'chunk_content' string.

        Raises:
            ValueError: If the documents list is empty or any chunk is blank.
            RuntimeError: If there is an error encoding the documents.
        """
        if not documents:
            raise ValueError("Documents list cannot be empty.")

        # Encode the documents that were passed in (previously self.documents was
        # encoded instead, which is None unless splitter() had been called first).
        texts = [doc['chunk_content'] for doc in documents]
        if any(text.strip() == '' for text in texts):
            raise ValueError("Documents cannot be empty.")

        try:
            embeddings = self.encoder.encode(texts)
        except Exception as e:
            raise RuntimeError(f"Error encoding the documents: {e}") from e

        # Assign both together so row i of doc_embeddings always belongs to documents[i].
        self.documents = documents
        self.doc_embeddings = embeddings

    def retrieve(self, query: str,
                 retrieval_limit: int = 5,
                 similarity_threshold: float = 0.5) -> Tuple[List[int], List[dict]]:
        """
        Retrieves the most relevant chunks for the query.

        Args:
            query (str): The query text.
            retrieval_limit (int): Maximum number of chunks to retrieve. Default is 5.
            similarity_threshold (float): Minimum cosine similarity for a chunk
                to be considered relevant. Default is 0.5.

        Returns:
            Tuple[List[int], List[dict]]: The indices of the retrieved chunks and
            the matching {'filename', 'chunk_content'} dicts, best match first.
            Fewer than retrieval_limit items (possibly none) are returned when
            scores fall below the threshold.

        Raises:
            ValueError: If the documents or their embeddings are not available yet.
            ValueError: If the number of embeddings differs from the number of documents.
            ValueError: If the retrieval limit is not between 1 and 10.
            ValueError: If the retrieval limit is greater than the number of documents.
            ValueError: If the similarity threshold is not between 0 and 1.
        """
        if not self.documents or self.doc_embeddings is None:
            raise ValueError("The documents have not been fitted yet")

        # Guards against embeddings assigned from a stale cache: indices from topk
        # would otherwise point at the wrong chunks or past the end of the list.
        if self.doc_embeddings.shape[0] != len(self.documents):
            raise ValueError("The number of embeddings does not match the number of documents")

        # The original "1 > x > 10" form means "1 > x and x > 10", which is never true.
        if not 1 <= retrieval_limit <= 10:
            raise ValueError("The retrieval limit is not between 1 and 10")

        if retrieval_limit > self.doc_embeddings.shape[0]:
            raise ValueError("The retrieval limit is greater than the number of documents")

        if not 0 <= similarity_threshold <= 1:
            raise ValueError("similarity threshold is not between 0 and 1")

        query_embedding = self.encoder.encode(query)
        # Embeddings loaded from disk may live on a different device than the model output.
        query_embedding = query_embedding.to(self.doc_embeddings.device)
        similarity = util.pytorch_cos_sim(query_embedding, self.doc_embeddings)

        top_scores, top_indices = torch.topk(similarity, retrieval_limit)
        scored_indices = zip(top_scores.flatten().tolist(), top_indices.flatten().tolist())

        topk_indices = [idx for score, idx in scored_indices if score >= similarity_threshold]
        retrieved_docs = [self.documents[idx] for idx in topk_indices]

        return topk_indices, retrieved_docs


    def _create_prompt_template(self, query: str, retrieved_docs) -> str:
        """
        Creates the prompt for text generation.

        Args:
            query (str): The user query.
            retrieved_docs (List[dict]): Retrieved chunks; 'chunk_content' of each is used.

        Returns:
            str: Instructions, the numbered chunk texts and the user query, one per line.
        """
        lines = ["Instructions: Based on the relevant documents, generate a comprehensive response to the user's query.\n",
                 "Relevant Documents:\n"]
        for i, doc in enumerate(retrieved_docs):
            lines.append(f"Document {i+1}: {doc['chunk_content']}\n")
        lines.append(f"User Query: {query}\n")

        return "".join(lines)

    def _chadgpt_api(self, prompt, model: str = 'claude-3.5-sonnet', timeout: float = 300.0):
        """
        Sends the prompt to the ChadGPT public API and returns the answer text.

        Args:
            prompt (str): The message to send.
            model (str): Last path segment of the endpoint URL. Endpoints listed in the
                original notebook: gpt-4o-mini, gpt-4o, claude-3-haiku, claude-3-opus,
                claude-3.5-sonnet. Default is 'claude-3.5-sonnet'.
            timeout (float): Seconds to wait for the HTTP response before requests
                raises a timeout error. Default is 300.

        Returns:
            str: The model response on success; otherwise the error message, which
            is also printed (previously an HTTP error returned None).
        """
        # API key from the personal account: taken from the CHAD_API_KEY environment
        # variable so it is not stored in the notebook; falls back to the original placeholder.
        api_key = os.environ.get('CHAD_API_KEY', 'CHAD_API_KEY')

        # Build the request
        request_json = {
            "message": prompt,
            "api_key": api_key
        }

        # Send the request and wait for the answer
        response = requests.post(url=f'https://ask.chadgpt.ru/api/public/{model}',
                                 json=request_json,
                                 timeout=timeout)

        # Check whether the request went through
        if response.status_code != 200:
            error = f'Ошибка! Код http-ответа: {response.status_code}'
            print(error)
            return error

        # Parse the answer body into a dict
        resp_json = response.json()

        # On success return the answer text
        if resp_json['is_success']:
            return resp_json['response']

        error = resp_json['error_message']
        print(f'Ошибка: {error}')
        return error

    def _generate(self, query: str, retrieved_docs) -> str:
        """
        Generates a response based on the retrieved chunks and the query.

        Args:
            query (str): The user query.
            retrieved_docs (List[dict]): The retrieved chunks.

        Returns:
            str: Whatever _chadgpt_api returns for the assembled prompt.
        """
        prompt = self._create_prompt_template(query, retrieved_docs)
        return self._chadgpt_api(prompt)

    def run(self, query: str,
            retrieval_limit: int = 5,
            similarity_threshold: float = 0.5) -> Tuple[str, List[dict]]:
        """
        Runs the full RAG pipeline: retrieves chunks and generates a response.

        Args:
            query (str): The user query.
            retrieval_limit (int): Passed to retrieve(). Default is 5.
            similarity_threshold (float): Passed to retrieve(). Default is 0.5.

        Returns:
            Tuple[str, List[dict]]: The generated response and the chunks it was based on.

        Raises:
            ValueError: Propagated from retrieve() for invalid arguments or missing data.
        """
        _, retrieved_docs = self.retrieve(query, retrieval_limit, similarity_threshold)
        generated_response = self._generate(query, retrieved_docs)

        return generated_response, retrieved_docs


In [7]:
# [files for _, _, files in os.walk(output_path, topdown=False)][0]

In [None]:
# Build the embedding encoder; the arguments spell out the class defaults.
encoder = Encoder(model_name='Alibaba-NLP/gte-multilingual-base', use_gpu=False)

In [9]:
# Wire the encoder into the retrieval pipeline.
rag = RAG(encoder=encoder)

In [10]:
# Read the extracted .txt files and split them into chunks (also stored on rag.documents).
docs = rag.splitter(documents_path=output_path)

In [11]:
# Load the chunk embeddings from the on-disk cache, or compute and cache them
# when the file is missing or no longer matches the current chunks.
EMBEDDINGS_CACHE = 'doc_embeddings.pt'

emb = None
if os.path.exists(EMBEDDINGS_CACHE):
    emb = torch.load(EMBEDDINGS_CACHE, weights_only=True)
    # One row per chunk is required: retrieval indexes rag.documents by embedding row.
    if emb.shape[0] != len(docs):
        emb = None

if emb is None:
    rag.fit(docs)
    emb = rag.doc_embeddings
    torch.save(emb, EMBEDDINGS_CACHE)

rag.doc_embeddings = emb

In [12]:
# Run retrieval + generation; run() returns the answer and the chunks it was based on.
generated_response, retrieved_docs = rag.run(
    'Please provide repair instructions for dent on RH Outboard Flap of Boeing 777'
)

In [13]:
# Show the generated answer as plain text.
print(generated_response)

Based on the relevant documents, here are the repair instructions for the dent on the right hand (RH) outboard flap of Boeing 777:

1. Access the repair area on the RH outboard flap

2. Verify the damage dimensions match those reported:
- Dent size: 35mm length × 30mm width × 0.36mm depth
- Delamination size: 57mm length × 50mm width
- Location: Between fairings 7 and 8, 350mm from outboard flap track and 295mm from trailing edge

3. Apply temporary seal using aluminum foil tape (speed tape):
- Follow SRM 57-53-01 requirements
- Ensure speed tape overlaps damaged area by minimum 25mm

4. Inspection Requirements:
- Every 10 flight cycles perform:
  * Visual inspection
  * Tap test inspection per NDT Manual Part 1 51-05-01 to check for damage growth
- Reapply aluminum foil tape after inspections

Important Notes:
1. This is a temporary repair valid until April 14, 2024 or next A-check, whichever comes first
2. At the end of repair work:
   - Ensure no tools, debris or other items are lef

In [None]:
# List the source file of every retrieved chunk, in retrieval order.
print([doc['filename'] for doc in retrieved_docs])