In [1]:
!pip install faiss-cpu sentence-transformers transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
!pip install python-docx pandas

from google.colab import files
import docx

uploaded = files.upload()
doc = docx.Document("Guilan-Food.docx")
#extracting text from file
text = "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


Saving Guilan-Food.docx to Guilan-Food.docx


In [3]:
import pandas as pd

foods = []
current_title = None
current_content = []

for line in text.split("\n"):
    if len(line.strip()) == 0:
        continue
    #if the line is short it is a food's name
    if len(line.split()) < 4 and not line.strip().isdigit():
        if current_title and current_content:
            foods.append({
                "title": current_title.strip(),
                "content": "\n".join(current_content).strip()
            })
        current_title = line
        current_content = []
    else:
        current_content.append(line)

#last food is added
if current_title and current_content:
    foods.append({
        "title": current_title.strip(),
        "content": "\n".join(current_content).strip()
    })

qa_pairs = []
for f in foods:
    content = f["content"]

    #extracting ingredients
    ingredients = ""
    steps = content
    if "مواد لازم" in content:
        parts = content.split("طرز تهیه")
        if len(parts) == 2:
            ingredients = parts[0].replace("مواد لازم", "").strip()
            steps = "طرز تهیه " + parts[1].strip()

    qa_pairs.append({
        "title": f["title"],
        "question": f"طرز تهیه {f['title']} چیست؟",
        "answer": steps,          #how to cook
        "ingredients": ingredients #ingredients
    })

df = pd.DataFrame(qa_pairs)
print("داده‌ها استخراج شد")
print(df.head(3))


داده‌ها استخراج شد
         title                    question  \
0  باقلا قاتوق  طرز تهیه باقلا قاتوق چیست؟   
1  میرزا قاسمی  طرز تهیه میرزا قاسمی چیست؟   
2     کباب ترش     طرز تهیه کباب ترش چیست؟   

                                              answer  \
0  طرز تهیه باقلا قاتوق:\nبرای تهیه باقلا قاتوق، ...   
1  طرز تهیه میرزا قاسمی:\nبرای تهیه میرزا قاسمی، ...   
2  طرز تهیه کباب ترش:\nبرای تهیه کباب ترش، ابتدا ...   

                                         ingredients  
0  باقلا قاتوق یکی از محبوب‌ترین و خوشمزه‌ترین خو...  
1  میرزا قاسمی یکی دیگر از غذاهای بسیار محبوب و خ...  
2  کباب ترش یکی از لذیذترین و خاص‌ترین کباب‌های ا...  


In [4]:
import re

#defining stopwords
persian_stopwords = set([
    "از","به","که","را","برای","با","این","آن","و","یا","تا","اما","اگر","یک","شود","شد"
])

def normalize_persian(text):
    #converting arabic words to persian
    text = re.sub("ي", "ی", text)
    text = re.sub("ك", "ک", text)
    return text

def normalize_numbers(text):
    #converting numbers
    persian_digits = "۰۱۲۳۴۵۶۷۸۹"
    english_digits = "0123456789"
    trans = str.maketrans("".join(persian_digits), "".join(english_digits))
    return text.translate(trans)

def clean_text(text):
    if not isinstance(text, str):
        return ""

    #normalization
    text = normalize_persian(text)
    text = normalize_numbers(text)

    #removing spaces
    text = re.sub(r"\s+", " ", text).strip()

    #removing signs
    text = re.sub(r"[^\w\sآ-ی0-9]", " ", text)

    #tokenizing
    tokens = text.split()

    #removing stopwords
    tokens = [t for t in tokens if t not in persian_stopwords]

    return " ".join(tokens)

#applying on the data
df["clean_answer"] = df["answer"].apply(clean_text)
df["clean_question"] = df["question"].apply(clean_text)

print("داده‌ها آماده شد (نسخه پیشرفته)")
print(df[["question","clean_question","answer","clean_answer"]].head())


داده‌ها آماده شد (نسخه پیشرفته)
                        question              clean_question  \
0     طرز تهیه باقلا قاتوق چیست؟   طرز تهیه باقلا قاتوق چیست   
1     طرز تهیه میرزا قاسمی چیست؟   طرز تهیه میرزا قاسمی چیست   
2        طرز تهیه کباب ترش چیست؟      طرز تهیه کباب ترش چیست   
3   طرز تهیه 4. رشته خشکار چیست؟  طرز تهیه 4 رشته خشکار چیست   
4  طرز تهیه شکر:  1 پیمانه چیست؟  طرز تهیه شکر 1 پیمانه چیست   

                                              answer  \
0  طرز تهیه باقلا قاتوق:\nبرای تهیه باقلا قاتوق، ...   
1  طرز تهیه میرزا قاسمی:\nبرای تهیه میرزا قاسمی، ...   
2  طرز تهیه کباب ترش:\nبرای تهیه کباب ترش، ابتدا ...   
3  رشته خشکار یکی از دسرهای محبوب و خوشمزه در گیل...   
4  گلاب:  1 قاشق غذاخوری\nپودر هل:  1/2 قاشق چایخ...   

                                        clean_answer  
0  طرز تهیه باقلا قاتوق تهیه باقلا قاتوق ابتدا با...  
1  طرز تهیه میرزا قاسمی تهیه میرزا قاسمی ابتدا با...  
2  طرز تهیه کباب ترش تهیه کباب ترش ابتدا گوشت تکه...  
3  رشته خشکار یکی دسرهای م

In [5]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

#preparing data for fine tuning
train_examples = []
for i, row in df.iterrows():
    train_examples.append(
        InputExample(texts=[row["clean_question"], row["clean_answer"]])
    )

print("training data:", len(train_examples))


training data: 22


In [6]:
from sentence_transformers import SentenceTransformer, losses

def fine_tune_model(model_name, train_examples, output_path, epochs=3, batch_size=8):
    print(f"finetune model: {model_name}")

    #base model
    model = SentenceTransformer(model_name)

    #DataLoader
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

    #loss: MultipleNegativesRankingLoss
    train_loss = losses.MultipleNegativesRankingLoss(model)

    #train
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=10
    )

    #saving the model
    model.save(output_path)
    print(f"model is saved at: {output_path}")

    return model


In [7]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
models_to_test = [
    "intfloat/multilingual-e5-base",
    "sentence-transformers/distiluse-base-multilingual-cased-v2",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "BAAI/bge-base-en-v1.5"
]



fine_tuned_models = {}

for model_name in models_to_test:
    output_path = f"fine_tuned_{model_name.replace('/', '_')}"
    model = fine_tune_model(model_name, train_examples, output_path, epochs=3)
    fine_tuned_models[model_name] = model


finetune model: intfloat/multilingual-e5-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarya-fattahi[0m ([33mfarya-fattahi-guilan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


model is saved at: fine_tuned_intfloat_multilingual-e5-base
finetune model: sentence-transformers/distiluse-base-multilingual-cased-v2


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


model is saved at: fine_tuned_sentence-transformers_distiluse-base-multilingual-cased-v2
finetune model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


model is saved at: fine_tuned_sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2
finetune model: sentence-transformers/all-mpnet-base-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


model is saved at: fine_tuned_sentence-transformers_all-mpnet-base-v2
finetune model: BAAI/bge-base-en-v1.5


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


model is saved at: fine_tuned_BAAI_bge-base-en-v1.5


In [9]:
import faiss
import numpy as np

def build_faiss_index(embeddings):
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)       #Inner Product
    faiss.normalize_L2(embeddings)     #normalization for cosine
    index.add(embeddings)              #adding vectors to index
    return index


In [10]:
def evaluate_retrieval(df, model, index, k=3):
    reciprocal_ranks = []
    precision_scores = []
    recall_scores = []
    hit_scores = []
    cosine_scores = []

    for i in range(len(df)):
        query = df.iloc[i]["clean_question"]
        true_answer = df.iloc[i]["clean_answer"]

        #question embedding
        query_vec = model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)

        #search
        D, I = index.search(query_vec, k)
        retrieved_answers = [df.iloc[idx]["clean_answer"] for idx in I[0]]

        #saving highest cosine similarity
        cosine_scores.append(D[0][0])

        #MRR
        rr = 0
        for rank, ans in enumerate(retrieved_answers, start=1):
            if ans == true_answer:
                rr = 1 / rank
                break
        reciprocal_ranks.append(rr)

        #Precision@k, Recall@k, Hit@k
        if true_answer in retrieved_answers:
            precision_scores.append(1.0)
            recall_scores.append(1.0)
            hit_scores.append(1.0)
        else:
            precision_scores.append(0.0)
            recall_scores.append(0.0)
            hit_scores.append(0.0)

    return {
        "MRR": np.mean(reciprocal_ranks),
        f"Precision@{k}": np.mean(precision_scores),
        f"Recall@{k}": np.mean(recall_scores),
        f"Hit@{k}": np.mean(hit_scores),
        "Avg Cosine Sim": np.mean(cosine_scores)
    }


In [11]:
results = []

for model_name, model in fine_tuned_models.items():
    print(f"evaluating model: {model_name}")

    embeddings = model.encode(df["clean_answer"].tolist(), convert_to_numpy=True, show_progress_bar=True)
    index = build_faiss_index(embeddings)

    metrics = evaluate_retrieval(df, model, index, k=3)
    metrics["model"] = model_name + " (fine-tuned)"
    results.append(metrics)

results_df = pd.DataFrame(results)
print("results:")
print(results_df)


evaluating model: intfloat/multilingual-e5-base


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

evaluating model: sentence-transformers/distiluse-base-multilingual-cased-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

evaluating model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

evaluating model: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

evaluating model: BAAI/bge-base-en-v1.5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

results:
        MRR  Precision@3  Recall@3     Hit@3  Avg Cosine Sim  \
0  0.803030     0.909091  0.909091  0.909091        0.855092   
1  0.780303     0.954545  0.954545  0.954545        0.314796   
2  0.454545     0.681818  0.681818  0.681818        0.707634   
3  0.356061     0.545455  0.545455  0.545455        0.803841   
4  0.568182     0.636364  0.636364  0.636364        0.843606   

                                               model  
0         intfloat/multilingual-e5-base (fine-tuned)  
1  sentence-transformers/distiluse-base-multiling...  
2  sentence-transformers/paraphrase-multilingual-...  
3  sentence-transformers/all-mpnet-base-v2 (fine-...  
4                 BAAI/bge-base-en-v1.5 (fine-tuned)  


In [13]:
#choosing best model according to MRR
best_model_name = results_df.sort_values("MRR", ascending=False).iloc[0]["model"]
print("best model:", best_model_name)

#best models loading from file
base_name = best_model_name.replace(" (fine-tuned)", "")
best_model_path = f"fine_tuned_{base_name.replace('/', '_')}"

#loading tje best model
from sentence_transformers import SentenceTransformer
best_model = SentenceTransformer(best_model_path)


best model: intfloat/multilingual-e5-base (fine-tuned)


In [14]:
#embedding for all answers
best_embeddings = best_model.encode(df["clean_answer"].tolist(), convert_to_numpy=True, show_progress_bar=True)

#index
index = build_faiss_index(best_embeddings)
print("produced faiss index:", index.ntotal, "بردار")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

produced faiss index: 22 بردار


In [45]:
from transformers import pipeline

#generator model
generator = pipeline("text2text-generation", model="csebuetnlp/mT5_multilingual_XLSum")

print("generator done")


Device set to use cuda:0


generator done


In [55]:
def retrieve_only(query, retriever_model, index, df, k=3):
    #question to embedding
    query_vec = retriever_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_vec)

    #search in faiss
    D, I = index.search(query_vec, k)

    retrieved_contexts = [df.iloc[idx]["answer"] for idx in I[0]]

    return retrieved_contexts[0], retrieved_contexts


In [56]:
q = "طرز تهیه میرزا قاسمی چیست؟"
answer, contexts = retrieve_only(q, best_model, index, df, k=3)

print("سوال:", q)
print("\nمتون بازیابی‌شده:")
for i, c in enumerate(contexts, start=1):
    print(f"- متن {i}: {c[:150]}...\n")

print("پاسخ Retrieve-only:")
print(answer)


سوال: طرز تهیه میرزا قاسمی چیست؟

متون بازیابی‌شده:
- متن 1: طرز تهیه میرزا قاسمی:
برای تهیه میرزا قاسمی، ابتدا بادمجان‌ها را بشویید و روی شعله گاز، منقل یا داخل فر کبابی کنید تا پوست آن‌ها کاملاً بسوزد و داخلشا...

- متن 2: ابتدا گوشت چرخ‌کرده را با کمی نمک، فلفل و زردچوبه ورز دهید و به صورت کوفته‌ریزه‌های کوچک درآورید. در یک قابلمه، گردوی آسیاب‌شده را با حرارت ملایم کمی ...

- متن 3: گلاب:  1 قاشق غذاخوری
پودر هل:  1/2 قاشق چایخوری
پودر نارگیل : به مقدار لازم
کره یا روغن مایع:  1 قاشق غذاخوری
پسته یا بادام خرد شده (اختیاری) : به مق...

پاسخ Retrieve-only:
طرز تهیه میرزا قاسمی:
برای تهیه میرزا قاسمی، ابتدا بادمجان‌ها را بشویید و روی شعله گاز، منقل یا داخل فر کبابی کنید تا پوست آن‌ها کاملاً بسوزد و داخلشان نرم شود. در حین کباب کردن، بادمجان‌ها را بچرخانید تا تمام قسمت‌ها به خوبی کباب شوند. سپس بگذارید بادمجان‌ها کمی خنک شوند و پوست سوخته آن‌ها را جدا کنید. گوشت داخل بادمجان‌ها را با چاقو ساطوری کنید، نه خیلی ریز و نه خیلی درشت. در مرحله بعد، گوجه فرنگی‌ها را بشویید و پوست آن‌ها را بگیر

In [59]:
!pip install gradio
import gradio as gr



In [62]:
def qa_interface(query, k):
    answer, contexts = retrieve_only(query, best_model, index, df, k=int(k))

    retrieved_texts = "\n\n".join(
        [f" متن {i+1}:\n{c[:400]}..." for i, c in enumerate(contexts)]
    )

    return str(answer), retrieved_texts


In [63]:
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🍲 پرسش‌وپاسخ دستور غذاهای گیلانی (Retriever Only)")

    query = gr.Textbox(label="سؤال خود را وارد کنید:", value="طرز تهیه میرزا قاسمی چیست؟")
    k = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Top-k")

    run_btn = gr.Button("جستجو")
    answer_box = gr.Textbox(label="پاسخ Retrieve-only", lines=5)
    contexts_box = gr.Textbox(label="متون بازیابی‌شده", lines=10)

    run_btn.click(fn=qa_interface, inputs=[query, k], outputs=[answer_box, contexts_box])

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://95d17ee4a6266ea3b3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


