In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.model_selection import train_test_split
# Load CSV
df = pd.read_csv("Copy of 500 lable แล้ว ไม่มีปริมาณ superclean + มีของคาวของหวาน + queryครบ - Sheet1.csv")

# Rename columns
df = df.rename(columns={"วัตถุดิบ_ไม่มีปริมาณ": "ingredients", "ชื่ออาหาร": "dish"})

# Drop missing values
df = df.dropna(subset=["ingredients", "dish"])

# Split into train/temp (80/20)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into val/test (50/50 of temp)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add query 1 (the first val example) into training set
query1_ingredient = val_df.iloc[0]["ingredients"]
query1_dish = val_df.iloc[0]["dish"]

train_df = pd.concat([
    train_df,
    pd.DataFrame({"ingredients": [query1_ingredient], "dish": [query1_dish]})
], ignore_index=True)

# Remove query 1 from val_df to avoid duplication
val_df = val_df.drop(val_df.index[0]).reset_index(drop=True)

# Convert to Hugging Face Dataset after modification
train_dataset = Dataset.from_pandas(train_df[["ingredients", "dish"]])
val_dataset = Dataset.from_pandas(val_df[["ingredients", "dish"]])
test_dataset = Dataset.from_pandas(test_df[["ingredients", "dish"]])

# Model loading
model = SentenceTransformer(
    'paraphrase-multilingual-mpnet-base-v2',
    model_card_data=SentenceTransformerModelCardData(
        language="th",
        license="apache-2.0",
        model_name="Thai Food Ingredients → Dish Prediction",
    )
)


# Prepare samples for MultipleNegativesRankingLoss
train_samples = list(zip(train_dataset["ingredients"], train_dataset["dish"]))
val_samples = list(zip(val_dataset["ingredients"], val_dataset["dish"]))
test_samples = list(zip(test_dataset["ingredients"], test_dataset["dish"]))

# Convert samples into DatasetDict with correct format
def convert_to_dict(samples):
    return Dataset.from_dict({
        "anchor": [a for a, b in samples],
        "positive": [b for a, b in samples],
    })

train_dataset = convert_to_dict(train_samples)
val_dataset = convert_to_dict(val_samples)
test_dataset = convert_to_dict(test_samples)

# Create evaluator
from sentence_transformers.evaluation import InformationRetrievalEvaluator

val_queries = {f'q{i}': val_dataset['anchor'][i] for i in range(len(val_dataset))}
val_corpus = {f'd{i}': val_dataset['positive'][i] for i in range(len(val_dataset))}
val_relevant_docs = {f'q{i}': {f'd{i}'} for i in range(len(val_dataset))}

evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
    name='thai-food-eval',
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

# Loss function
loss = MultipleNegativesRankingLoss(model)

args = SentenceTransformerTrainingArguments(
    output_dir="models/thai-food-mpnet-tuned",
    num_train_epochs=16,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    fp16=False,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="thai-food-retriever-tuned",
)

# Train
trainer.train()

# Save the final model
model.save_pretrained("models/thai-food-mpnet-new-v8")

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import Dataset, DatasetDict
import pandas as pd
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Load CSV
df = pd.read_csv("Copy of 500 lable แล้ว ไม่มีปริมาณ superclean + มีของคาวของหวาน + queryครบ - Sheet1.csv")

# Rename columns
df = df.rename(columns={
    "วัตถุดิบ_ไม่มีปริมาณ": "ingredients",
    "ชื่ออาหาร": "dish"
})

# Drop missing values
df = df.dropna(subset=["ingredients", "dish", "query2", "query3"])

# Split into train/temp (80/20)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into val/test (50/50 of temp)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add query 1 (the first val example) into training set
query1_row = val_df.iloc[0]
train_df = pd.concat([
    train_df,
    pd.DataFrame({
        "ingredients": [query1_row["ingredients"]],
        "dish": [query1_row["dish"]],
        "query2": [query1_row["query2"]],
        "query3": [query1_row["query3"]],
    })
], ignore_index=True)

val_df = val_df.drop(val_df.index[0]).reset_index(drop=True)

# ----- 🔁 Prepare augmented training samples -----
def prepare_training_samples(df):
    anchors = []
    positives = []
    for _, row in df.iterrows():
        anchors.extend([
            row["ingredients"],
            row["query2"],
            row["query3"]
        ])
        positives.extend([row["dish"]] * 3)
    return anchors, positives

train_anchors, train_positives = prepare_training_samples(train_df)

# Validation and test keep using only 'ingredients'
val_samples = list(zip(val_df["ingredients"], val_df["dish"]))
test_samples = list(zip(test_df["ingredients"], test_df["dish"]))

# Convert to HF datasets
def convert_to_dict(samples):
    return Dataset.from_dict({
        "anchor": [a for a, b in samples],
        "positive": [b for a, b in samples],
    })

train_dataset = Dataset.from_dict({
    "anchor": train_anchors,
    "positive": train_positives
})
val_dataset = convert_to_dict(val_samples)
test_dataset = convert_to_dict(test_samples)

# Load model
model = SentenceTransformer(
    'paraphrase-multilingual-mpnet-base-v2',
    model_card_data=SentenceTransformerModelCardData(
        language="th",
        license="apache-2.0",
        model_name="Thai Food Ingredients → Dish Prediction",
    )
)

# Evaluation
val_queries = {f'q{i}': val_dataset['anchor'][i] for i in range(len(val_dataset))}
val_corpus = {f'd{i}': val_dataset['positive'][i] for i in range(len(val_dataset))}
val_relevant_docs = {f'q{i}': {f'd{i}'} for i in range(len(val_dataset))}

evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
    name='thai-food-eval',
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

# Loss function
loss = MultipleNegativesRankingLoss(model)

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/thai-food-mpnet-new-v8",
    num_train_epochs=16,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    fp16=False,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="thai-food-retriever-tuned",
)

# Train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    evaluator=evaluator,
)
trainer.train()

# Save model
model.save_pretrained("models/thai-food-mpnet-new-v8")
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Load CSV
df = pd.read_csv("Copy of 500 lable แล้ว ไม่มีปริมาณ superclean + มีของคาวของหวาน + queryครบ - Sheet1.csv")

# Rename columns
df = df.rename(columns={
    "วัตถุดิบ_ไม่มีปริมาณ": "ingredients",
    "ชื่ออาหาร": "dish"
})

# Drop missing values
df = df.dropna(subset=["ingredients", "dish", "query2", "query3"])

# Split into train/temp (80/20)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into val/test (50/50 of temp)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add query 1 (the first val example) into training set
query1_row = val_df.iloc[0]
train_df = pd.concat([
    train_df,
    pd.DataFrame({
        "ingredients": [query1_row["ingredients"]],
        "dish": [query1_row["dish"]],
        "query2": [query1_row["query2"]],
        "query3": [query1_row["query3"]],
    })
], ignore_index=True)

val_df = val_df.drop(val_df.index[0]).reset_index(drop=True)

# ----- 🔁 Prepare augmented training samples -----
def prepare_training_samples(df):
    anchors = []
    positives = []
    for _, row in df.iterrows():
        anchors.extend([
            row["ingredients"],
            row["query2"],
            row["query3"]
        ])
        positives.extend([row["dish"]] * 3)
    return anchors, positives

train_anchors, train_positives = prepare_training_samples(train_df)

# Validation and test keep using only 'ingredients'
val_samples = list(zip(val_df["ingredients"], val_df["dish"]))
test_samples = list(zip(test_df["ingredients"], test_df["dish"]))

# Convert to HF datasets
def convert_to_dict(samples):
    return Dataset.from_dict({
        "anchor": [a for a, b in samples],
        "positive": [b for a, b in samples],
    })

train_dataset = Dataset.from_dict({
    "anchor": train_anchors,
    "positive": train_positives
})
val_dataset = convert_to_dict(val_samples)
test_dataset = convert_to_dict(test_samples)

# Load model
model = SentenceTransformer(
    'paraphrase-multilingual-mpnet-base-v2',
    model_card_data=SentenceTransformerModelCardData(
        language="th",
        license="apache-2.0",
        model_name="Thai Food Ingredients → Dish Prediction",
    )
)

# Evaluation
val_queries = {f'q{i}': val_dataset['anchor'][i] for i in range(len(val_dataset))}
val_corpus = {f'd{i}': val_dataset['positive'][i] for i in range(len(val_dataset))}
val_relevant_docs = {f'q{i}': {f'd{i}'} for i in range(len(val_dataset))}

evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
    name='thai-food-eval',
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

# Loss function
loss = MultipleNegativesRankingLoss(model)

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/thai-food-mpnet-new-v8",
    num_train_epochs=16,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    fp16=False,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="thai-food-retriever-tuned",
)

# Train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    evaluator=evaluator,
)
trainer.train()

# Save model
model.save_pretrained("models/thai-food-mpnet-new-v8")
.csv")

# Rename columns
df = df.rename(columns={
    "วัตถุดิบ_ไม่มีปริมาณ": "ingredients",
    "ชื่ออาหาร": "dish"
})

# Drop missing values
df = df.dropna(subset=["ingredients", "dish", "query2", "query3"])

# Split into train/temp (80/20)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into val/test (50/50 of temp)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add query 1 (the first val example) into training set
query1_row = val_df.iloc[0]
train_df = pd.concat([
    train_df,
    pd.DataFrame({
        "ingredients": [query1_row["ingredients"]],
        "dish": [query1_row["dish"]],
        "query2": [query1_row["query2"]],
        "query3": [query1_row["query3"]],
    })
], ignore_index=True)

val_df = val_df.drop(val_df.index[0]).reset_index(drop=True)

# ----- 🔁 Prepare augmented training samples -----
def prepare_training_samples(df):
    anchors = []
    positives = []
    for _, row in df.iterrows():
        anchors.extend([
            row["ingredients"],
            row["query2"],
            row["query3"]
        ])
        positives.extend([row["dish"]] * 3)
    return anchors, positives

train_anchors, train_positives = prepare_training_samples(train_df)

# Validation and test keep using only 'ingredients'
val_samples = list(zip(val_df["ingredients"], val_df["dish"]))
test_samples = list(zip(test_df["ingredients"], test_df["dish"]))

# Convert to HF datasets
def convert_to_dict(samples):
    return Dataset.from_dict({
        "anchor": [a for a, b in samples],
        "positive": [b for a, b in samples],
    })

train_dataset = Dataset.from_dict({
    "anchor": train_anchors,
    "positive": train_positives
})
val_dataset = convert_to_dict(val_samples)
test_dataset = convert_to_dict(test_samples)

# Load model
model = SentenceTransformer(
    'paraphrase-multilingual-mpnet-base-v2',
    model_card_data=SentenceTransformerModelCardData(
        language="th",
        license="apache-2.0",
        model_name="Thai Food Ingredients → Dish Prediction",
    )
)

# Evaluation
val_queries = {f'q{i}': val_dataset['anchor'][i] for i in range(len(val_dataset))}
val_corpus = {f'd{i}': val_dataset['positive'][i] for i in range(len(val_dataset))}
val_relevant_docs = {f'q{i}': {f'd{i}'} for i in range(len(val_dataset))}

evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
    name='thai-food-eval',
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

# Loss function
loss = MultipleNegativesRankingLoss(model)

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/thai-food-mpnet-new-v8",
    num_train_epochs=16,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    fp16=False,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="thai-food-retriever-tuned",
)

# Train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    evaluator=evaluator,
)
trainer.train()

# Save model
model.save_pretrained("models/thai-food-mpnet-new-v8")


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 328)

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Load CSV
df = pd.read_csv("a.csv")

# Rename columns
df = df.rename(columns={
    "วัตถุดิบ_ไม่มีปริมาณ": "ingredients",
    "ชื่ออาหาร": "dish"
})

# Drop missing values
# Ensure column names in dropna match the dataframe
df = df.dropna(subset=["ingredients", "dish", "query2", "query3"])

# Split into train/temp (80/20)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into val/test (50/50 of temp)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add query 1 (the first val example) into training set
query1_row = val_df.iloc[0]
train_df = pd.concat([
    train_df,
    pd.DataFrame({
        "ingredients": [query1_row["ingredients"]],
        "dish": [query1_row["dish"]],
        "query2": [query1_row["query2"]],
        "query3": [query1_row["query3"]],
    })
], ignore_index=True)

val_df = val_df.drop(val_df.index[0]).reset_index(drop=True)

# ----- 🔁 Prepare augmented training samples -----
def prepare_training_samples(df):
    anchors = []
    positives = []
    for _, row in df.iterrows():
        anchors.extend([
            row["ingredients"],
            row["query2"],
            row["query3"]
        ])
        positives.extend([row["dish"]] * 3)
    return anchors, positives

train_anchors, train_positives = prepare_training_samples(train_df)

# Validation and test keep using only 'ingredients'
val_samples = list(zip(val_df["ingredients"], val_df["dish"]))
test_samples = list(zip(test_df["ingredients"], test_df["dish"]))

# Convert to HF datasets
def convert_to_dict(samples):
    return Dataset.from_dict({
        "anchor": [a for a, b in samples],
        "positive": [b for a, b in samples],
    })

train_dataset = Dataset.from_dict({
    "anchor": train_anchors,
    "positive": train_positives
})
val_dataset = convert_to_dict(val_samples)
test_dataset = convert_to_dict(test_samples)

# Load model
model = SentenceTransformer(
    'paraphrase-multilingual-mpnet-base-v2',
    model_card_data=SentenceTransformerModelCardData(
        language="th",
        license="apache-2.0",
        model_name="Thai Food Ingredients → Dish Prediction",
    )
)

# Evaluation
val_queries = {f'q{i}': val_dataset['anchor'][i] for i in range(len(val_dataset))}
val_corpus = {f'd{i}': val_dataset['positive'][i] for i in range(len(val_dataset))}
val_relevant_docs = {f'q{i}': {f'd{i}'} for i in range(len(val_dataset))}

evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
    name='thai-food-eval',
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

# Loss function
loss = MultipleNegativesRankingLoss(model)

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/thai-food-mpnet-new-v8",
    num_train_epochs=16,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    fp16=False,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="thai-food-retriever-tuned",
)

# Train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    evaluator=evaluator,
)
trainer.train()

# Save model
model.save_pretrained("models/thai-food-mpnet-new-v8")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchanisorn-siri[0m ([33mchanisorn-siri-bodindecha[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Thai-food-eval Cosine Accuracy@1,Thai-food-eval Cosine Accuracy@3,Thai-food-eval Cosine Accuracy@5,Thai-food-eval Cosine Accuracy@10,Thai-food-eval Cosine Precision@1,Thai-food-eval Cosine Precision@3,Thai-food-eval Cosine Precision@5,Thai-food-eval Cosine Recall@1,Thai-food-eval Cosine Recall@3,Thai-food-eval Cosine Recall@5,Thai-food-eval Cosine Ndcg@10,Thai-food-eval Cosine Mrr@10,Thai-food-eval Cosine Map@100
1,1.832,1.832842,0.3125,0.645833,0.729167,0.833333,0.3125,0.215278,0.145833,0.3125,0.645833,0.729167,0.572826,0.489492,0.499173
2,1.005,1.335144,0.458333,0.708333,0.8125,0.895833,0.458333,0.236111,0.1625,0.458333,0.708333,0.8125,0.673328,0.60272,0.607394
3,0.9728,1.089707,0.520833,0.75,0.854167,0.895833,0.520833,0.25,0.170833,0.520833,0.75,0.854167,0.722337,0.665476,0.670185
4,0.6698,0.966097,0.645833,0.791667,0.854167,0.916667,0.645833,0.263889,0.170833,0.645833,0.791667,0.854167,0.778958,0.735069,0.739241
5,0.4971,0.930439,0.645833,0.8125,0.854167,0.916667,0.645833,0.270833,0.170833,0.645833,0.8125,0.854167,0.784892,0.742188,0.746709
6,0.3964,0.937455,0.645833,0.791667,0.875,0.9375,0.645833,0.263889,0.175,0.645833,0.791667,0.875,0.788002,0.74036,0.743457
7,0.4037,0.94041,0.6875,0.8125,0.854167,0.958333,0.6875,0.270833,0.170833,0.6875,0.8125,0.854167,0.811475,0.766171,0.767585
8,0.2781,0.919688,0.604167,0.8125,0.895833,0.9375,0.604167,0.270833,0.179167,0.604167,0.8125,0.895833,0.77562,0.722999,0.726124
9,0.3312,0.97664,0.666667,0.8125,0.875,0.9375,0.666667,0.270833,0.175,0.666667,0.8125,0.875,0.79696,0.752307,0.755332
10,0.2554,0.965486,0.6875,0.791667,0.895833,0.9375,0.6875,0.263889,0.179167,0.6875,0.791667,0.895833,0.803778,0.761508,0.764792


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.42s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.78s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.02s/it]


KeyboardInterrupt: 

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Load CSV
df = pd.read_csv("a.csv")

# Rename columns
df = df.rename(columns={
    "วัตถุดิบ_ไม่มีปริมาณ": "ingredients",
    "ชื่ออาหาร": "dish"
})

# Drop missing values
# Ensure column names in dropna match the dataframe
df = df.dropna(subset=["ingredients", "dish", "query2", "query3"])

# Split into train/temp (80/20)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Split temp into val/test (50/50 of temp)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add query 1 (the first val example) into training set
query1_row = val_df.iloc[0]
train_df = pd.concat([
    train_df,
    pd.DataFrame({
        "ingredients": [query1_row["ingredients"]],
        "dish": [query1_row["dish"]],
        "query2": [query1_row["query2"]],
        "query3": [query1_row["query3"]],
    })
], ignore_index=True)

val_df = val_df.drop(val_df.index[0]).reset_index(drop=True)

# ----- 🔁 Prepare augmented training samples -----
def prepare_training_samples(df):
    anchors = []
    positives = []
    for _, row in df.iterrows():
        anchors.extend([
            row["ingredients"],
            row["query2"],
            row["query3"]
        ])
        positives.extend([row["dish"]] * 3)
    return anchors, positives

train_anchors, train_positives = prepare_training_samples(train_df)

# Validation and test keep using only 'ingredients'
val_samples = list(zip(val_df["ingredients"], val_df["dish"]))
test_samples = list(zip(test_df["ingredients"], test_df["dish"]))

# Convert to HF datasets
def convert_to_dict(samples):
    return Dataset.from_dict({
        "anchor": [a for a, b in samples],
        "positive": [b for a, b in samples],
    })

train_dataset = Dataset.from_dict({
    "anchor": train_anchors,
    "positive": train_positives
})
val_dataset = convert_to_dict(val_samples)
test_dataset = convert_to_dict(test_samples)

# Load model
model = SentenceTransformer(
    'paraphrase-multilingual-mpnet-base-v2',
    model_card_data=SentenceTransformerModelCardData(
        language="th",
        license="apache-2.0",
        model_name="Thai Food Ingredients → Dish Prediction",
    )
)

# Evaluation
val_queries = {f'q{i}': val_dataset['anchor'][i] for i in range(len(val_dataset))}
val_corpus = {f'd{i}': val_dataset['positive'][i] for i in range(len(val_dataset))}
val_relevant_docs = {f'q{i}': {f'd{i}'} for i in range(len(val_dataset))}

evaluator = InformationRetrievalEvaluator(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
    name='thai-food-eval',
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

# Loss function
loss = MultipleNegativesRankingLoss(model)

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/thai-food-mpnet-new-v8",
    num_train_epochs=7,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    fp16=False,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="thai-food-retriever-tuned",
)

# Train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    evaluator=evaluator,
)
trainer.train()

# Save model
model.save_pretrained("models/thai-food-mpnet-new-v8")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Currently logged in as: [33mchanisorn-siri[0m ([33mchanisorn-siri-bodindecha[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Thai-food-eval Cosine Accuracy@1,Thai-food-eval Cosine Accuracy@3,Thai-food-eval Cosine Accuracy@5,Thai-food-eval Cosine Accuracy@10,Thai-food-eval Cosine Precision@1,Thai-food-eval Cosine Precision@3,Thai-food-eval Cosine Precision@5,Thai-food-eval Cosine Recall@1,Thai-food-eval Cosine Recall@3,Thai-food-eval Cosine Recall@5,Thai-food-eval Cosine Ndcg@10,Thai-food-eval Cosine Mrr@10,Thai-food-eval Cosine Map@100
1,1.516,1.667425,0.395833,0.666667,0.75,0.854167,0.395833,0.222222,0.15,0.395833,0.666667,0.75,0.619401,0.544527,0.55298
2,0.8932,1.266092,0.416667,0.6875,0.833333,0.895833,0.416667,0.229167,0.166667,0.416667,0.6875,0.833333,0.657152,0.580233,0.585154
3,0.9192,1.090231,0.520833,0.75,0.854167,0.895833,0.520833,0.25,0.170833,0.520833,0.75,0.854167,0.72325,0.666518,0.671825
4,0.6798,0.984286,0.604167,0.791667,0.854167,0.916667,0.604167,0.263889,0.170833,0.604167,0.791667,0.854167,0.765706,0.71713,0.721777
5,0.5246,0.956241,0.625,0.8125,0.875,0.916667,0.625,0.270833,0.175,0.625,0.8125,0.875,0.771473,0.724479,0.729034
6,0.493,0.947175,0.625,0.833333,0.875,0.916667,0.625,0.277778,0.175,0.625,0.833333,0.875,0.77717,0.731481,0.736376
7,0.4934,0.947075,0.625,0.854167,0.875,0.9375,0.625,0.284722,0.175,0.625,0.854167,0.875,0.784637,0.735301,0.738302


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


In [None]:
# Save test set to CSV
test_df[["ingredients", "dish"]].to_csv("test_set_for_google_sheets.csv", index=False)
print("✅ Test set saved as 'test_set_for_google_sheets.csv'")


✅ Test set saved as 'test_set_for_google_sheets.csv'


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

# Save model locally
model.save_pretrained("models/thai-food-mpnet-new-v8")

# (Optional) Create a model card
trainer.create_model_card(
    model_name="thai_food_prediction1",
    language="th",
    license="apache-2.0",
    tags=["thai", "semantic-search", "food", "ingredients", "retrieval", "sentence-transformers"]
)

# Push model to Hugging Face Hub
trainer.push_to_hub(repo_name="Chanisorn/thai_food_prediction1")


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

events.out.tfevents.1749386969.7bcd77a398f2.1700.1:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1749387095.7bcd77a398f2.58986.0:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

events.out.tfevents.1749373481.7bcd77a398f2.1700.0:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

events.out.tfevents.1749398840.7bcd77a398f2.102910.0:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Chanisorn/thai-food-mpnet-new-v8/commit/01f5f36079ec3c939f38ba55dc5acbac27b57f10', commit_message='End of training', commit_description='', oid='01f5f36079ec3c939f38ba55dc5acbac27b57f10', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Chanisorn/thai-food-mpnet-new-v8', endpoint='https://huggingface.co', repo_type='model', repo_id='Chanisorn/thai-food-mpnet-new-v8'), pr_revision=None, pr_num=None)

In [None]:
!pip install -U sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

ยังไม่ pythai

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the model
model = SentenceTransformer("Chanisorn/thai-food-mpnet-new-v8")

# Load and clean the dataset
df = pd.read_csv("a.csv")
df = df.rename(columns={"วัตถุดิบ": "ingredients", "ชื่ออาหาร": "dish"})
df = df.dropna(subset=["ingredients", "dish"])

# Encode dish names
dish_texts = df["dish"].tolist()
dish_embeddings = model.encode(dish_texts, convert_to_tensor=True)

# Input sets (ingredient queries)
input_sets = [
    "อยากกินอาหารที่แบบนัวๆ ใส่กุ้งเยอะๆ มีฟักทอง เผ็ด",
    "มีผงโกโก้ทำอะไรกินดี",

]

# Inference loop for top-3 matches
for user_input in input_sets:
    input_embedding = model.encode([user_input], convert_to_tensor=True)
    cos_sim = cosine_similarity(input_embedding.cpu(), dish_embeddings.cpu())[0]

    top_indices = np.argsort(cos_sim)[::-1][:3]

    print(f"\n🔍 Input Ingredients: {user_input}")
    for rank, idx in enumerate(top_indices, 1):
        matched_dish = df.iloc[idx]["dish"]
        matched_ingredients = df.iloc[idx]["ingredients"]
        similarity_score = cos_sim[idx]

        print(f"\n🥇 Rank {rank}")
        print(f"✅ Matched Dish: {matched_dish}")
        print(f"🧂 Ingredients: {matched_ingredients}")
        print(f"📏 Similarity Score: {similarity_score:.4f}")




🔍 Input Ingredients: อยากกินอาหารที่แบบนัวๆ ใส่กุ้งเยอะๆ มีฟักทอง เผ็ด

🥇 Rank 1
✅ Matched Dish: แกงบวดฟักทอง
🧂 Ingredients: ฟักทอง:, 100 g
น้ำตาลทราย:, 3 ช้อนโต๊ะ
กะทิ:, 100 มล.
เกลือ: เล็กน้อย
📏 Similarity Score: 0.6271

🥇 Rank 2
✅ Matched Dish: ผัดฟักทอง
🧂 Ingredients: ฟักทองหั่นชิ้น, 200 กรัม
ไข่ไก่, 1 ฟอง
น้ำเปล่า, 1 ถ้วย
กระเทียมสับหยาบ, 1 ช้อนโต๊ะ
ซีอิ๊ว low sodium, 1 ช้อนโต๊ะ
พริกไทยป่น, 1 ช้อนโต๊ะ
ใบโหระพา, 10 ใบ
น้ำมันสำหรับผัด, 1 ช้อนชา
📏 Similarity Score: 0.5742

🥇 Rank 3
✅ Matched Dish: ผัดฟักทองใส่ไข่ นิ่ม ฟิน
🧂 Ingredients: ฟักทอง:, 500 กรัม
ไข่ไก่:, 2 ฟอง
น้ำตาล:, 1 ช้อนโต๊ะ
น้ำปลา:, 1/2 ช้อน
แมกกี้: เหยาะๆให้หอม
น้ำมันหอย:, 2 ช้อนโต๊ะ
กระเทียม:, 4 กลีบ
📏 Similarity Score: 0.5578

🔍 Input Ingredients: มีผงโกโก้ทำอะไรกินดี

🥇 Rank 1
✅ Matched Dish: โกโก้หนึบ
🧂 Ingredients: นมข้นหวาน, 200g
ผงโกโก้(ผงโอวัลตินก็ได้), 100g
📏 Similarity Score: 0.8321

🥇 Rank 2
✅ Matched Dish: โกโก้มัทฉะ
🧂 Ingredients: ผงมัทฉะ, 20 กรัม
นมสำหรับชงชาเขียว, 2 ถ้วยตวง
น้ำตาลสำหรับชงชาเขียว ¼ ถ้วย

In [None]:
!pip install pythainlp


Collecting pythainlp
  Downloading pythainlp-5.1.2-py3-none-any.whl.metadata (8.0 kB)
Downloading pythainlp-5.1.2-py3-none-any.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pythainlp
Successfully installed pythainlp-5.1.2


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# ===== 🔤 Pythainlp เพื่อ clean ข้อความ =====
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus.common import thai_stopwords

def clean_text_thai(text):
    text = normalize(text)  # Normalize เช่น ไม้ยมก
    tokens = word_tokenize(text, keep_whitespace=False)  # ตัดคำ
    stopwords = set(thai_stopwords())
    tokens = [t for t in tokens if t not in stopwords]  # ลบ stopwords
    return " ".join(tokens)

# ===== 🧠 Load Model =====
model = SentenceTransformer("Chanisorn/thai-food-mpnet-new-v8")

# ===== 📄 Load Dataset =====
df = pd.read_csv("a.csv")
df = df.rename(columns={"วัตถุดิบ": "ingredients", "ชื่ออาหาร": "dish"})
df = df.dropna(subset=["ingredients", "dish"])

# ===== 💾 Encode Dish Embeddings =====
dish_texts = df["dish"].tolist()
dish_embeddings = model.encode(dish_texts, convert_to_tensor=True)

# ===== 🍳 Input จากผู้ใช้ =====
input_sets = [
    "อยากกินอาหารที่แบบนัวๆ ใส่กุ้งเยอะๆ มีฟักทอง เผ็ด",
    "มีผงโกโก้ทำอะไรกินดี",
]

# ===== 🔍 Matching Loop =====
for user_input in input_sets:
    cleaned_input = clean_text_thai(user_input)
    input_embedding = model.encode([cleaned_input], convert_to_tensor=True)
    cos_sim = cosine_similarity(input_embedding.cpu(), dish_embeddings.cpu())[0]

    top_indices = np.argsort(cos_sim)[::-1][:3]

    print(f"\n🔍 Input: {user_input}")
    print(f"🧹 Cleaned: {cleaned_input}")
    for rank, idx in enumerate(top_indices, 1):
        matched_dish = df.iloc[idx]["dish"]
        matched_ingredients = df.iloc[idx]["ingredients"]
        similarity_score = cos_sim[idx]

        print(f"\n🥇 Rank {rank}")
        print(f"✅ Matched Dish: {matched_dish}")
        print(f"🧂 Ingredients: {matched_ingredients}")
        print(f"📏 Similarity Score: {similarity_score:.4f}")



🔍 Input: อยากกินอาหารที่แบบนัวๆ ใส่กุ้งเยอะๆ มีฟักทอง เผ็ด
🧹 Cleaned: กิน อาหาร นัว ใส่ กุ้ง ฟักทอง เผ็ด

🥇 Rank 1
✅ Matched Dish: แกงบวดฟักทอง
🧂 Ingredients: ฟักทอง:, 100 g
น้ำตาลทราย:, 3 ช้อนโต๊ะ
กะทิ:, 100 มล.
เกลือ: เล็กน้อย
📏 Similarity Score: 0.6868

🥇 Rank 2
✅ Matched Dish: ผัดฟักทอง
🧂 Ingredients: ฟักทองหั่นชิ้น, 200 กรัม
ไข่ไก่, 1 ฟอง
น้ำเปล่า, 1 ถ้วย
กระเทียมสับหยาบ, 1 ช้อนโต๊ะ
ซีอิ๊ว low sodium, 1 ช้อนโต๊ะ
พริกไทยป่น, 1 ช้อนโต๊ะ
ใบโหระพา, 10 ใบ
น้ำมันสำหรับผัด, 1 ช้อนชา
📏 Similarity Score: 0.5829

🥇 Rank 3
✅ Matched Dish: ผัดฟักทองใส่ไข่ นิ่ม ฟิน
🧂 Ingredients: ฟักทอง:, 500 กรัม
ไข่ไก่:, 2 ฟอง
น้ำตาล:, 1 ช้อนโต๊ะ
น้ำปลา:, 1/2 ช้อน
แมกกี้: เหยาะๆให้หอม
น้ำมันหอย:, 2 ช้อนโต๊ะ
กระเทียม:, 4 กลีบ
📏 Similarity Score: 0.5387

🔍 Input: มีผงโกโก้ทำอะไรกินดี
🧹 Cleaned: ผง โกโก้ ทำ กิน ดี

🥇 Rank 1
✅ Matched Dish: โกโก้หนึบ
🧂 Ingredients: นมข้นหวาน, 200g
ผงโกโก้(ผงโอวัลตินก็ได้), 100g
📏 Similarity Score: 0.8042

🥇 Rank 2
✅ Matched Dish: โกโก้มัทฉะ
🧂 Ingredients: ผงมัทฉะ, 20 กรัม
นมสำห

In [5]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
import pandas as pd

# 🔧 Load your pretrained model
model = SentenceTransformer("Chanisorn/thai-food-mpnet-new-v8")

# 📄 Load your test set CSV (with query1–3 and ชื่ออาหาร)
test_df_raw = pd.read_csv("test set 2 - test_set_for_google_sheets.csv")  # update path if needed

# 📌 Expand test set with anchor-positive pairs
test_rows = []

for _, row in test_df_raw.iterrows():
    dish = row["dish"]
    for col in ["query1 อยากกินอาหารครบ", "query2", "query3"]:
        query = row[col]
        if pd.notna(query):
            test_rows.append({"anchor": query.strip(), "positive": dish.strip()})

test_df = pd.DataFrame(test_rows)

# ✅ Prepare data for evaluator
queries = {f"q{i}": row["anchor"] for i, row in test_df.iterrows()}
corpus = {f"d{i}": row["positive"] for i, row in test_df.iterrows()}
relevant_docs = {f"q{i}": {f"d{i}"} for i in range(len(test_df))}

# 🧪 Evaluate
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="thai-food-test-eval",
    show_progress_bar=True,
    precision_recall_at_k=[1, 3, 5]
)

evaluator(model)



Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:08<00:00,  8.33s/it]


{'thai-food-test-eval_cosine_accuracy@1': 0.2585034013605442,
 'thai-food-test-eval_cosine_accuracy@3': 0.7074829931972789,
 'thai-food-test-eval_cosine_accuracy@5': 0.7891156462585034,
 'thai-food-test-eval_cosine_accuracy@10': 0.8435374149659864,
 'thai-food-test-eval_cosine_precision@1': 0.2585034013605442,
 'thai-food-test-eval_cosine_precision@3': 0.23582766439909295,
 'thai-food-test-eval_cosine_precision@5': 0.15782312925170064,
 'thai-food-test-eval_cosine_recall@1': 0.2585034013605442,
 'thai-food-test-eval_cosine_recall@3': 0.7074829931972789,
 'thai-food-test-eval_cosine_recall@5': 0.7891156462585034,
 'thai-food-test-eval_cosine_ndcg@10': 0.5661307373640255,
 'thai-food-test-eval_cosine_mrr@10': 0.4745653817082389,
 'thai-food-test-eval_cosine_map@100': 0.4812900188022347}