In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. Called without
# arguments, so no access token is hard-coded in the notebook.
login()

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [4]:
import os
import torch
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [5]:
# ------------------------------------------------------------------------------
# Quantization (bitsandbytes) settings
# ------------------------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # 4-bit quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"   # name of the torch dtype used for 4-bit compute
use_nested_quant = False             # nested (double) quantization for 4-bit models

# ------------------------------------------------------------------------------
# Device placement
# ------------------------------------------------------------------------------
device_map = {"": 0}                 # place the entire model on GPU 0

# Load the model

In [6]:
from peft import PeftConfig

In [None]:
# Fetch the adapter's PEFT configuration; its `base_model_name_or_path` field
# is what the model/tokenizer loading cell uses to select the base checkpoint.
config = PeftConfig.from_pretrained("CodeTriad/llama2_finetuned_15000_2")


In [8]:
# Resolve the configured dtype name (e.g. "float16") to the torch dtype object
# of the same name.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

In [9]:
# Quantization settings passed to `from_pretrained` when loading the base model.
# Only quantization options belong here: `use_cache` is a model/generation
# setting, not a BitsAndBytesConfig field, so it is deliberately not passed
# (an unknown keyword is accepted by the constructor but has no effect).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
# Load the base checkpoint named in the adapter config, quantized according to
# `bnb_config` and placed according to `device_map`.
# NOTE(review): `trust_remote_code=True` allows code shipped in the model
# repository to run locally — confirm it is actually required for this base model.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

# The tokenizer comes from the same base checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Reuse the end-of-sequence token for padding
# (presumably the tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Wrap the quantized base model with the fine-tuned adapter weights; `model`
# is rebound to the adapter-augmented model from here on.
model = PeftModel.from_pretrained(model, "CodeTriad/llama2_finetuned_15000_2")

In [None]:
# Put the model in evaluation (inference) mode before generating.
model.eval()

# Evaluation

In [None]:
import transformers

# Text-generation pipeline around the adapter-augmented model; `pipe(prompt)`
# is what the evaluation loop calls for every test row.
pipe = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=32,  # NOTE(review): the evaluation loop passes one prompt per call, so batching is presumably unused — confirm
    # max_length=4000,
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                      # NOTE(review): likely only takes effect when sampling is enabled (do_sample=True) — confirm
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [13]:
import pandas as pd
import ast
import re
import csv

In [14]:
def create_csv(recipe_ids, recipes, source_ingredient_list, expected_substitute_list, suggested_substitutes_list, is_hit_list, destination):
    """Write the per-recipe evaluation results to a CSV file.

    Each positional list supplies one column of the output table; all lists
    are expected to have the same length (one entry per evaluated recipe).

    Args:
        recipe_ids: values for the "Recipe Id" column.
        recipes: values for the "Recipe" column.
        source_ingredient_list: values for the "Source Ingredient" column.
        expected_substitute_list: values for the "Expected Substitute" column.
        suggested_substitutes_list: values for the "Suggested Substitutes" column.
        is_hit_list: values for the "Check" column.
        destination: path (or file-like target) handed to `DataFrame.to_csv`.

    Returns:
        None. The table is written to `destination` without the index column.
    """
    headers = (
        "Recipe Id",
        "Recipe",
        "Source Ingredient",
        "Expected Substitute",
        "Suggested Substitutes",
        "Check",
    )
    columns = (
        recipe_ids,
        recipes,
        source_ingredient_list,
        expected_substitute_list,
        suggested_substitutes_list,
        is_hit_list,
    )
    # dict(zip(...)) keeps the header order, which fixes the CSV column order.
    results_table = pd.DataFrame(dict(zip(headers, columns)))
    results_table.to_csv(destination, index=False)

In [None]:
# Evaluate the fine-tuned model on the substitution test set: for every row,
# ask the model for one substitute, compare it with the expected substitute,
# and finally write a per-row report with `create_csv`.
#
# Columns read from the test CSV (header row skipped, columns addressed by position):
#   0 -> recipe id, 1 -> recipe, 2 -> source/substitute pair, 3 -> recipe title.
# Column 2 appears to be a stringified pair such as "('butter', 'margarine')"
# (inferred from the parenthesis/quote stripping below) — TODO confirm.


def _clean_ingredient(raw_name):
    """Turn one comma-separated token of the pair column into a plain name.

    Strips surrounding whitespace, removes parentheses, drops enclosing single
    quotes and replaces underscores with spaces.
    """
    name = re.sub(r'[()]', '', raw_name.strip()).strip()  # Remove parentheses and extra spaces
    name = name.strip("'")  # Remove single quotes
    return name.replace("_", " ")  # replace "_" with space


def _build_prompt(recipe_title, ingredient_name):
    """Return the instruction prompt sent to the model for one recipe.

    NOTE(review): the system section is closed with `<<SYS>>` rather than the
    usual Llama 2 `<</SYS>>`. It is left untouched because the adapter may have
    been fine-tuned on exactly this template — confirm before changing.
    """
    return f"""[INST] <<SYS>>
As a master chef, your culinary prowess knows no bounds. Your ability to flawlessly cook any dish is unparalleled.Even when faced with a missing ingredient, you effortlessly identify the perfect
substitute. <<SYS>>
Follow the instructions below and suggest the best substitute for the given ingredient.
Instructions:
- Do not provide the same ingredient as above as the substitutes.
- Give only one ingredient.
- Avoid giving explanations.
- Only provide the name of the ingredient.
- Give the output as a numbered point.

Dish: {recipe_title}
Ingredient: {ingredient_name}
[/INST]"""


csv_file_path = '/content/drive/MyDrive/Colab Notebooks/FYP/test_comments_subs_with_titles.csv'
destination = '/content/drive/MyDrive/Colab Notebooks/FYP/llama_finetune_2500.csv'

df = pd.read_csv(csv_file_path, header=None, skiprows=1)

recipe_ids = []
recipes = []
source_ingredient_list = []
expected_substitute_list = []
suggested_substitutes_list = []
isHit = []
recipe_count = 0
num_of_hits = 0

# The answer is taken from the first "•" bullet in the model output.
# NOTE(review): the prompt asks for a *numbered* point while this pattern only
# recognises a bullet; presumably the fine-tuned model answers with "•" — confirm.
pattern = r'•\s*(.*)'

for _, row in df.iterrows():
    pair_tokens = row[2].split(",")
    ingredient_name = _clean_ingredient(pair_tokens[0])      # 1st item of the pair: ingredient to replace
    expected_substitute = _clean_ingredient(pair_tokens[1])  # 2nd item of the pair: ground-truth substitute
    recipe_title = row[3]

    generated_text = pipe(_build_prompt(recipe_title, ingredient_name))[0]['generated_text']
    # The generated text is split on the closing tag, so the prompt is expected
    # to be echoed back; keep only what follows "[/INST]".
    result = generated_text.split("[/INST]")[1]

    match = re.search(pattern, result)
    # `matches` is recomputed for every row. When the model output contains no
    # bullet, the row is recorded as a miss with an empty suggestion instead of
    # reusing the previous row's answer (or failing on the very first row).
    matches = match.group(1).lower().replace("-", " ") if match else ""
    is_hit = match is not None and expected_substitute == matches

    # All result lists are appended together so they always stay the same
    # length, which `create_csv` relies on when building the table.
    recipe_ids.append(row[0])
    recipes.append(row[1])
    source_ingredient_list.append(ingredient_name)
    expected_substitute_list.append(expected_substitute)
    suggested_substitutes_list.append(matches)
    isHit.append("TRUE" if is_hit else "FALSE")

    if is_hit:
        num_of_hits += 1
    recipe_count += 1

    # Progress line: suggestion, expected substitute, running hits, rows processed.
    print(matches,": ",expected_substitute, num_of_hits, recipe_count)

create_csv(recipe_ids, recipes, source_ingredient_list, expected_substitute_list, suggested_substitutes_list, isHit, destination)