### Installation

This section installs the third-party libraries the project depends on.

In [1]:
# Install required packages for the project
# gradio: for creating the web interface
# sentence-transformers: for generating embeddings
# python-docx: for reading DOCX files
# transformers: for loading the language model
# torch, bitsandbytes, accelerate: for efficient model loading and inference
!pip install gradio sentence-transformers python-docx transformers torch bitsandbytes accelerate

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, bitsandbytes
Successfully installed bitsandbytes-0.47.0 python-docx-1.2.0


### Imports

This section imports all the required libraries and modules for the project.

In [2]:
# Import necessary libraries
import gradio as gr # For building the user interface
from sentence_transformers import SentenceTransformer # For creating text embeddings
from sklearn.metrics.pairwise import cosine_similarity # For calculating similarity between embeddings
import numpy as np # For numerical operations, especially with embeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig # For loading and configuring the language model
import torch # PyTorch library for deep learning operations
import docx # For reading Microsoft Word (.docx) files
from google.colab import files # For handling file uploads in Google Colab

### Core Functions

This section defines the core functions of the system, including:

- `read_docx`: Reads content from a DOCX file.
- `process_database`: Takes an uploaded DOCX file, splits it into chunks, and generates embeddings for each chunk.
- `retrieve_context`: Finds the most relevant text chunks based on a user query using cosine similarity.
- `generate_answer`: Uses the retrieved context and a language model to generate an answer to the user's query.

In [3]:
def read_docx(file_path):
    """
    Extract the paragraph text of a DOCX file.

    Args:
        file_path: Location of the .docx file, passed straight to docx.Document.

    Returns:
        One string holding every paragraph's text in document order,
        with a newline between consecutive paragraphs.
    """
    document = docx.Document(file_path)
    # Only document.paragraphs is walked; each paragraph contributes its .text
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

# Module-level retrieval state, replaced as a pair by process_database()
chunks = [] # Raw text chunks of the most recently processed document
embeddings = None # Array of chunk embeddings (one row per chunk); None until a document is processed

# The notebook's embedding model (intfloat/multilingual-e5-base) is trained with
# role prefixes on every input; encoding text without them degrades retrieval quality.
_E5_PASSAGE_PREFIX = "passage: "
_E5_QUERY_PREFIX = "query: "


def process_database(file_obj, chunk_size=512):
    """
    Build the retrieval database from an uploaded DOCX file.

    The document text is cut into fixed-size character chunks, each chunk is
    embedded with `embedding_model`, and the module-level `chunks` /
    `embeddings` pair is replaced with the result.

    Args:
        file_obj: The value delivered by the Gradio File component. Either a
            path string or an object exposing the path as `.name`; None when
            nothing was uploaded.
        chunk_size: Number of characters per chunk (default 512).

    Returns:
        A Persian status message for display in the UI.
    """
    global chunks, embeddings
    # Nothing uploaded: leave the current database untouched
    if file_obj is None:
        return "لطفاً یک فایل DOCX آپلود کنید."

    # Accept both a plain path string and a file-like wrapper carrying .name
    file_path = getattr(file_obj, "name", file_obj)

    text = read_docx(file_path)

    # A document without text would otherwise leave an empty, non-None embeddings
    # array behind and make every later similarity lookup fail.
    if not text.strip():
        chunks, embeddings = [], None
        return "فایل آپلودشده هیچ متنی ندارد."

    new_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Only the text sent to the encoder gets the prefix; the stored chunks stay raw
    # so the prompt context is not polluted. normalize_embeddings=True yields unit vectors.
    chunk_embeddings = embedding_model.encode(
        [_E5_PASSAGE_PREFIX + chunk for chunk in new_chunks],
        normalize_embeddings=True,
    )

    # Swap both globals together so chunks and embeddings never disagree in length
    chunks, embeddings = new_chunks, np.asarray(chunk_embeddings)

    return f"دیتابیس با {len(chunks)} تکه پردازش شد."


def retrieve_context(query, top_k=3):
    """
    Return the text of the chunks most similar to a query.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to return (default 3).

    Returns:
        The selected chunks joined with newlines, ordered from least to most
        similar among the selected ones (the best match comes last). An empty
        string when no database has been processed or when top_k is not positive.
    """
    # top_k <= 0 must be rejected explicitly: a slice of [-0:] would return every chunk
    if embeddings is None or len(chunks) == 0 or top_k <= 0:
        return ""

    # Queries use the "query: " role prefix that matches the "passage: " prefix on chunks
    query_emb = embedding_model.encode([f"{_E5_QUERY_PREFIX}{query}"], normalize_embeddings=True)
    similarities = cosine_similarity(query_emb, embeddings)[0]

    # argsort is ascending, so the tail holds the highest-scoring chunk indices
    top_indices = np.argsort(similarities)[-top_k:]
    return "\n".join(chunks[i] for i in top_indices)

def generate_answer(query):
    """
    Answer a question from the processed document using the language model.

    The chunks returned by `retrieve_context` are placed in a prompt together
    with the question, the model continues the prompt, and only the model's
    own continuation is returned.

    Args:
        query: The user's question.

    Returns:
        The generated answer, or a Persian notice asking the user to upload
        and process a document first when no database exists yet.
    """
    # No database yet: tell the user instead of prompting the model without context
    if embeddings is None:
        return "لطفاً ابتدا دیتابیس را آپلود و پردازش کنید."

    context = retrieve_context(query)

    # The prompt restricts the model to the supplied context and asks for a Persian answer
    prompt = f"""Based on the following context, only answer the question in Persian.
    Do not repeat the question or give explanations.
    Do not ask the question.
    Only return the direct answer.
    answer should be complete.

    Context: {context}

    Question: {query}

    Answer:"""

    # Put the inputs on the device the model actually lives on rather than guessing it
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # temperature only takes effect when sampling is enabled, so request it explicitly
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

    # The returned sequence starts with the prompt tokens. Decoding only what follows
    # them keeps any "Answer:" text inside the context from confusing the extraction.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # A base model tends to keep going with an invented "Question: ... Answer: ..." pair;
    # everything from the first such marker onward is not part of the answer.
    return generated.split("Question:")[0].strip()

In [4]:
# NOTE(review): the "Model Loading" cell further down repeats this cell line for line,
# so on a full run both models are loaded twice. Keep only one of the two cells.

# Checkpoint used for answer generation
model_name = "unsloth/llama-3-8b-bnb-4bit"

# Sentence embedding model shared by document chunks and user queries
embedding_model = SentenceTransformer('intfloat/multilingual-e5-base')

# 4-bit NF4 weights with double quantization; computation runs in bfloat16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer first, then the quantized model; device_map="auto" lets the library place the layers
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

In [5]:
# Gradio UI
# NOTE(review): the "Gradio User Interface" cell further down builds and launches the same
# interface again, leaving two public share links running. Keep only one of the two cells.
with gr.Blocks() as demo:
    gr.Markdown("# سیستم پرس‌وجو برای غذاهای گیلانی")

    with gr.Row():
        # Restrict uploads to .docx: read_docx cannot parse any other file type
        file_upload = gr.File(label="آپلود فایل DOCX", file_types=[".docx"])
        process_btn = gr.Button("پردازش دیتابیس")
        # Output-only box: users should not be able to type into the status field
        status = gr.Textbox(label="وضعیت", interactive=False)

    query_input = gr.Textbox(label="سوال خود را به فارسی وارد کنید", placeholder="مثال: قورمه سبزی چگونه تهیه می‌شود؟")
    # Output-only box for the generated answer
    answer_output = gr.Textbox(label="پاسخ", interactive=False)
    submit_btn = gr.Button("دریافت پاسخ")

    # Wire the buttons to the core functions
    process_btn.click(process_database, inputs=file_upload, outputs=status)
    submit_btn.click(generate_answer, inputs=query_input, outputs=answer_output)

demo.launch(share=True)  # Use share=True for Colab to get a public URL

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://791b1237add04cfa7b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### Model Loading

This section loads the pre-trained models required for the system: an embedding model to convert text into numerical vectors and a large language model (LLM) for generating answers.

- **Embedding Model**: `intfloat/multilingual-e5-base` is used for generating text embeddings. This model is suitable for multilingual tasks.
- **Generation Model**: `unsloth/llama-3-8b-bnb-4bit` is a quantized version of the Llama 3 8B model, loaded with 4-bit quantization using BitsAndBytes for reduced memory usage and potentially faster inference on consumer hardware.

In [6]:
# NOTE(review): an earlier, uncommented cell already runs exactly these statements,
# so executing this cell loads both models a second time. Keep only one of the two cells.

# Embedding model: turns document chunks and user queries into vectors for similarity search
embedding_model = SentenceTransformer('intfloat/multilingual-e5-base')

# Quantization settings handed to the language model loader:
#   load_in_4bit              - store the weights in 4 bits
#   bnb_4bit_quant_type       - use the NF4 quantization scheme
#   bnb_4bit_use_double_quant - apply double quantization for further memory savings
#   bnb_4bit_compute_dtype    - run the computations in bfloat16
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Hugging Face identifier of the generation model
model_name = "unsloth/llama-3-8b-bnb-4bit"

# The tokenizer converts text to the token ids the model consumes and back again
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with the quantization settings above;
# device_map="auto" leaves the placement of layers on the available devices to the library
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)



### Gradio User Interface

This section sets up the user interface using the Gradio library. It creates a web-based interface that allows users to upload a DOCX file, process it to create a searchable database, enter a query, and receive an answer generated by the language model based on the document content.

In [7]:
# Assemble the interface with Blocks so the layout can be arranged by hand.
# NOTE(review): an earlier cell already builds and launches a `demo`; running this one
# as well starts a second server with its own share link.
with gr.Blocks() as demo:
    # Page title
    gr.Markdown("# سیستم پرس‌وجو برای غذاهای گیلانی")

    # Top row: pick a document, build the database, read the outcome
    with gr.Row():
        # Upload field limited to .docx files
        file_upload = gr.File(file_types=[".docx"], label="آپلود فایل DOCX")
        # Starts the processing of the uploaded document
        process_btn = gr.Button("پردازش دیتابیس")
        # Read-only field showing the processing result
        status = gr.Textbox(interactive=False, label="وضعیت")

    # Question entry, read-only answer display, and the button that requests an answer
    query_input = gr.Textbox(
        placeholder="مثال: قورمه سبزی چگونه تهیه می‌شود؟",
        label="سوال خود را به فارسی وارد کنید",
    )
    answer_output = gr.Textbox(interactive=False, label="پاسخ")
    submit_btn = gr.Button("دریافت پاسخ")

    # Event wiring:
    #   process button -> process_database(uploaded file) -> status box
    #   submit button  -> generate_answer(question)       -> answer box
    process_btn.click(fn=process_database, inputs=[file_upload], outputs=[status])
    submit_btn.click(fn=generate_answer, inputs=[query_input], outputs=[answer_output])

# Start the app; share=True requests a public URL, which is how the UI is reached from Colab
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c7d69b91046504c8d2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


