In [None]:
!pip install transformers datasets
!pip install torch torchvision peft
!pip install accelerate peft bitsandbytes transformers trl
!pip install deep-translator
!pip install --upgrade transformers

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
import torch
from transformers import (LlamaTokenizer, LlamaForCausalLM, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, pipeline)
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
import os
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig
from trl import SFTTrainer
from transformers import GenerationConfig
from time import perf_counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [None]:
from google.colab import userdata

# Read the Hugging Face access token from Colab's secret store so that it is never
# hard-coded in the notebook. Without this assignment `hf_token` is undefined and
# the lines below raise NameError on a fresh kernel.
# NOTE(review): the secret is assumed to be stored under the name "HF_TOKEN" — confirm.
hf_token = userdata.get("HF_TOKEN")

# Export the token under both variable names this notebook sets.
os.environ["HUGGINGFACE_TOKEN"] = hf_token
os.environ["HF_TOKEN"] = hf_token

In [None]:
# Ask PyTorch to release cached GPU memory that is currently unused, before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Hugging Face Hub id of the checkpoint to load
model="meta-llama/Meta-Llama-3.1-8B-Instruct"

def get_model_and_tokenizer(mod, token=None):
    """Load a 4-bit quantized causal language model together with its tokenizer.

    Args:
        mod: Model id or path passed to ``from_pretrained``.
        token: Hugging Face access token. When omitted, the ``HF_TOKEN`` (or
            ``HUGGINGFACE_TOKEN``) environment variable is used, so the function
            no longer depends on a notebook-level ``hf_token`` global.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if token is None:
        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")

    tokenizer = AutoTokenizer.from_pretrained(mod, token=token)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 quantization with double quantization and float16 compute
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True
    )

    llm = AutoModelForCausalLM.from_pretrained(
        mod, quantization_config=bnb_config, device_map="balanced_low_0", token=token
    )

    llm.config.use_cache = True  # Enable caching for inference
    llm.config.pretraining_tp = 1  # Adjust for multiple GPUs if applicable

    # NOTE(review): gradient checkpointing only matters for training and is usually
    # combined with use_cache=False; kept as in the original — confirm it is needed.
    llm.gradient_checkpointing_enable()  # Save memory during training

    return llm, tokenizer


In [None]:
model_dir = "./saved_model"  # Local directory used to cache the model and tokenizer

# Reuse the local copy when the directory exists and is not empty; otherwise
# download the model and save it for the next run.
if os.path.isdir(model_dir) and os.listdir(model_dir):
    print("Loading model and tokenizer from saved directory...")
    # Use AutoModelForCausalLM: the bare `AutoModel` is not imported in this notebook
    # and would load the network without the language-modelling head `generate` needs.
    model = AutoModelForCausalLM.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
else:
    print("Saved model not found. Downloading and saving...")
    # `model` still holds the Hub id string here; it is rebound to the loaded model.
    model, tokenizer = get_model_and_tokenizer(mod=model)
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)


# Release cached, unused GPU memory left over from loading.
torch.cuda.empty_cache()

Saved model not found. Downloading and saving...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

 **Done Loading the Model**

In [None]:
def load_examples_from_csv(file_path):
    """Read the example CSV and return three of its columns as Python lists.

    Args:
        file_path: Path of a CSV file containing the columns 'Input',
            'Ürün Adı Önerisi' and 'Ürün Açıklama Önerisi'.

    Returns:
        A tuple ``(inputs, names, descriptions)`` of lists, one entry per CSV row.
    """
    examples = pd.read_csv(file_path)
    wanted_columns = ('Input', 'Ürün Adı Önerisi', 'Ürün Açıklama Önerisi')
    example_inputs, example_names, example_descriptions = [
        examples[column].tolist() for column in wanted_columns
    ]
    return example_inputs, example_names, example_descriptions

# Function to retrieve the most relevant example
def retrieve_examples(input_text, inputs, names, descriptions):
    """Return the name and description of the stored example closest to ``input_text``.

    All example inputs plus the query are vectorized with TF-IDF, and the example
    with the highest cosine similarity to the query is selected.

    Args:
        input_text: Query text.
        inputs: List of example input texts.
        names: Product names, indexed like ``inputs``.
        descriptions: Product descriptions, indexed like ``inputs``.

    Returns:
        A ``(name, description)`` tuple for the best-matching example.
    """
    # The query goes last so it can be split off from the examples by position.
    corpus = inputs + [input_text]
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)

    query_vector = tfidf_matrix[-1]
    example_vectors = tfidf_matrix[:-1]
    scores = cosine_similarity(query_vector, example_vectors)

    best_match = np.argmax(scores)
    return names[best_match], descriptions[best_match]

def create_prompt(user_input, product_name, product_description):
    """Build the one-shot prompt sent to the language model.

    Args:
        user_input: The new product information to write about.
        product_name: Product name of the retrieved example.
        product_description: Product description of the retrieved example.

    Returns:
        The prompt string with the example and the new input filled in.
    """
    # The template, including its indentation, is emitted verbatim; only the three
    # named placeholders are substituted.
    template = """
    Based on the following example and product description, generate a product name and description for the new input in a similar format.

    Example:
    Product Name Suggestion: {product_name}
    Product Description Suggestion: {product_description}

    New Input: {user_input}

    Please provide:
    1. Product Name Suggestion: A clear and concise product name based solely on the new input. Exclude any information about how to preserve the product.
    2. Product Description Suggestion: A detailed and informative product description based solely on the new input. Structure the description as follows:
        - First, explain the product and its key attributes.
        - Next, explain how to preserve the product.
        - Lastly, describe where the product can be used.
      Use new lines to separate these sections but do not include titles or headings, use titles or headings only to seperate name and description.

    Instructions:
    - Follow the structure of the example provided.
    - Use only the information given in the New Input.
    - Do not include details not present in the New Input.
    - Ensure the output is clearly formatted with new lines between sections in the product description.
    - Ensure that only the required outputs are printed, with no additional text or formatting.
    """
    return template.format(
        user_input=user_input,
        product_name=product_name,
        product_description=product_description,
    )


def generate_response(user_input, model, tokenizer, inputs, names, descriptions):
    """Generate a product name and description for ``user_input``.

    The most similar stored example is looked up with ``retrieve_examples``, placed in
    the prompt built by ``create_prompt`` and passed to ``model.generate``. Only the
    newly generated tokens are decoded, so the prompt is never part of the result.

    Args:
        user_input: Free-text product information.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        inputs: Example input texts.
        names: Example product names, indexed like ``inputs``.
        descriptions: Example product descriptions, indexed like ``inputs``.

    Returns:
        The generated text with surrounding whitespace removed. If the input is empty
        or blank, or if any step fails, a human-readable message is returned instead
        (errors are reported as text rather than raised).
    """
    # Treat whitespace-only strings like an empty query.
    if not user_input or (isinstance(user_input, str) and not user_input.strip()):
        return "Input is empty, please provide a valid query."

    try:
        # Retrieve the most relevant example
        product_name, product_description = retrieve_examples(user_input, inputs, names, descriptions)

        # Format the prompt and tokenize it
        prompt = create_prompt(user_input, product_name, product_description)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokens = tokenizer([prompt], return_tensors="pt").to(device)

        # Define generation configuration
        # NOTE(review): penalty_alpha is a contrastive-search setting; with
        # do_sample=True it looks unused — confirm before relying on it.
        generation_config = GenerationConfig(
            penalty_alpha=0.6,
            do_sample=True,
            top_k=5,
            temperature=0.5,
            repetition_penalty=1.2,
            max_new_tokens=200,
            pad_token_id=tokenizer.eos_token_id
        )

        # Generate response
        outputs = model.generate(**tokens, generation_config=generation_config)

        # The output sequence starts with the prompt tokens. Slice them off by length
        # instead of searching the decoded text for the last prompt sentence, which
        # silently returned the whole prompt whenever the text did not match exactly.
        prompt_length = tokens["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Release cached, unused GPU memory (this does not move any tensors).
        torch.cuda.empty_cache()

        return response.strip()

    except Exception as e:
        # Best-effort behaviour kept from the original: report the failure as text.
        return f"An error occurred: {str(e)}"

In [None]:
# Load the example inputs, product names and product descriptions from the CSV file.
file_path = '/content/translated_kilicFineTune.csv'
inputs, names, descriptions = load_examples_from_csv(file_path)

# Example usage: generate a name and description for one query and print the result.
# `response` is kept as a notebook-level variable.
user_input = "200 grams of strawberry jam natural , keep it in a warm place after opening"
response = generate_response(user_input, model, tokenizer, inputs, names, descriptions)
print(response)



Product Name Suggestion: Natural Strawberry Jam - 200g


This 200 gram jar contains all-natural ingredients without artificial preservatives or additives. It has a rich taste and aroma from real strawberries.


Keep it in a warm place after opening to maintain its quality and texture. The best before date will ensure you have enough time to consume this product within its shelf life.



You can serve this delicious jam over toast at your morning meal, add some sweetness to yogurt, oatmeal, ice cream, pancakes, waffles, cakes, cookies, muffins, scones, biscuits, bagels, English muffins, French bread, croissants, donuts, crepes, blinis, poffertjes, doughnuts, churros, crêpes, brioche, Danish pastry, cinnamon rolls, apple fritters, sweet potato fries, fried chicken wings, grilled cheese sandwiches, panini, wraps, flatbreads, tortillas, naan bread


In [None]:
from deep_translator import GoogleTranslator

# Get all available languages from GoogleTranslator.
# With as_dict=True the result maps language name -> language code,
# e.g. {'afrikaans': 'af', ...}, so every pair below is (name, code).
languages = GoogleTranslator().get_supported_languages(as_dict=True)

top_20_languages_by_popularity = [
    'english',
    'chinese (simplified)',
    'hindi',
    'spanish',
    'french',
    'arabic',
    'bengali',
    'portuguese',
    'russian',
    'japanese',
    'german',
    'korean',
    'vietnamese',
    'italian',
    'urdu',
    'turkish',
    'tamil',
    'polish',
    'dutch',
    'greek'
]


option = input("Do you want to see the top 20 languages by popularity or all available languages? (Type 'top' or 'all'): ").strip().lower()

if option == 'all':
    # Every supported language, ordered by language code
    top_languages = sorted(languages.items(), key=lambda x: x[1])
else:
    if option != 'top':
        print("Invalid choice. Showing top 20 languages by default.")
    top_languages = [(name, languages[name]) for name in top_20_languages_by_popularity if name in languages]

if not top_languages:
    print("No top languages found, displaying all languages instead.")
    top_languages = sorted(languages.items(), key=lambda x: x[1])

# Column widths: the longest language code and the longest language name
# (default=0 keeps this from failing if the language list is empty)
max_code_length = max((len(code) for _, code in top_languages), default=0)
max_lang_length = max((len(name) for name, _ in top_languages), default=0)

# Each row is printed as "code : name", so the header says the same.
print("\nAvailable languages (Code : Language):\n")

columns = 3

for i in range(0, len(top_languages), columns):
    row = top_languages[i:i + columns]
    formatted_row = [f"{code.ljust(max_code_length)} : {name.ljust(max_lang_length)}" for name, code in row]
    line = " | ".join(formatted_row)
    print(line)
    print("-" * len(line))  # Line between rows, exactly as wide as the row above

# Strip stray whitespace so an otherwise valid code is not rejected.
target_language = input("\nEnter the target language code: ").strip()

translated = GoogleTranslator(source='auto', target=target_language).translate(response)

print(translated)

Do you want to see the top 20 languages by popularity or all available languages? (Type 'top' or 'all'): all

Available languages (Language: Code):

af       : afrikaans             | ak       : twi                   | am       : amharic              
-----------------------------------------------------------------------------------------------------
ar       : arabic                | as       : assamese              | ay       : aymara               
-----------------------------------------------------------------------------------------------------
az       : azerbaijani           | be       : belarusian            | bg       : bulgarian            
-----------------------------------------------------------------------------------------------------
bho      : bhojpuri              | bm       : bambara               | bn       : bengali              
-----------------------------------------------------------------------------------------------------
bs       : bosnian             