In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os
import re
import sys

In [None]:
# Put the project's utils directory on sys.path so the helper modules below
# can be imported from this notebook.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook anywhere else.
sys.path.append(os.path.abspath('/path/to/LLM-Reconfiguration/Dataset-Notebooks/utils'))

from dataset_utils import prepare_train_data
from model_utils import get_model, get_tokenizer
# NOTE(review): the wildcard import hides where names come from;
# peft_merge_unload (called further down) is presumably provided by this
# module — confirm and consider importing it explicitly.
from generation_utils import *

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub without storing the secret in the
# notebook: prefer the HF_TOKEN environment variable and fall back to a hidden
# interactive prompt so the token never ends up in the saved file or its output.
access_token = os.environ.get("HF_TOKEN")
if not access_token:
    from getpass import getpass

    access_token = getpass("Hugging Face access token: ")
login(token=access_token)

In [None]:
# CSV file handed to prepare_train_data.
data_path = "/path/to/LLM-Reconfiguration/Dataset-Notebooks/train_files/train_33_69_84_nodes.csv"
# Checkpoint directory handed to the model and tokenizer loaders.
model_id = "/path/to/LLM-Reconfiguration/AutoTrain/llama3/model_name/checkpoint-26280"

In [None]:
# Load the model and tokenizer for the checkpoint through the project helpers.
# NOTE(review): `model` is reassigned later by peft_merge_unload(...), so this
# first load may be redundant (and may hold memory until it is replaced) —
# confirm whether it is still needed.
model = get_model(model_id)
tokenizer = get_tokenizer(model_id)

In [None]:
# Build the three dataset splits from the CSV at data_path; the splitting
# logic lives in dataset_utils.prepare_train_data. In the cells shown here only
# test_dataset is read afterwards (its rows expose 'prompt' and 'output').
train_dataset, validation_dataset, test_dataset = prepare_train_data(data_path)

In [None]:
# LoRA adapter settings for a causal language model.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# NOTE(review): model_path is the same string as model_id defined earlier —
# presumably the base model and the adapter checkpoint should be two different
# locations; confirm.
model_path ="/path/to/LLM-Reconfiguration/AutoTrain/llama3/model_name/checkpoint-26280"

# Replaces the previously loaded `model`. Judging by its name,
# peft_merge_unload merges the adapter weights into the base model — verify
# against its definition in the utils modules.
model = peft_merge_unload(model_id, model_path)

In [None]:
# Put the model in evaluation mode before generating (turns off
# training-only behavior such as dropout). As the last expression in the cell,
# the returned module is also shown as the cell output.
model.eval()

In [None]:
# Upper bound on prompt tokens fed to the model per turn.
max_tokens = 120_000  # Adjust according to your model's capacity

In [None]:
# Running transcript of the chat. The chat loop appends every user and model
# turn to this string and re-tokenizes the whole string as the next prompt.
# Re-run this cell to start a fresh conversation.
conversation_history = ""

# Disabled alternative: bundle the decoding settings in a GenerationConfig.
# NOTE(review): the right-hand names (penalty_alpha, do_sample, top_k, ...) are
# not defined in the cells shown here and GenerationConfig is not among the
# visible imports — both would be needed before re-enabling this.
# generation_config = GenerationConfig(
#       penalty_alpha=penalty_alpha,
#       do_sample = do_sample,
#       top_k=top_k,
#       temperature=temperature,
#       repetition_penalty=repetition_penalty,
#       max_new_tokens=max_new_tokens,
#       pad_token_id=tokenizer.eos_token_id,
#       eos_token_id=tokenizer.eos_token_id  # Ensure EOS token is set - This is a new parameter so maybe it breaks the code.
#     )


In [None]:
# Interactive chat loop. Each turn appends the user's message to the running
# transcript, feeds the (left-truncated) transcript to the model, prints the
# sampled continuation and appends it to the transcript as well.
# Type 'quit' or 'exit', or interrupt the kernel, to stop.
while True:
    try:
        # Get user input
        user_input = input("You: ").strip()

        # Check for exit commands before processing further
        if user_input.lower() in ['quit', 'exit']:
            print("Exiting the chat.")
            break  # Break out of the loop if user wants to quit

        # Ignore blank submissions instead of spending a generation on them
        if not user_input:
            continue

        conversation_history += f"User: {user_input}\n"

        # Tokenize the whole conversation so far
        input_ids = tokenizer.encode(conversation_history, return_tensors='pt')

        # If the number of tokens exceeds the limit, keep only the most recent ones
        if input_ids.size(-1) > max_tokens:
            input_ids = input_ids[:, -max_tokens:]

        # Follow the model's own device instead of hard-coding 'cuda'
        input_ids = input_ids.to(next(model.parameters()).device)

        # A single un-padded sequence: every position is a real token. The mask
        # is passed explicitly because pad_token_id == eos_token_id below, which
        # prevents generate() from inferring it.
        attention_mask = torch.ones_like(input_ids)

        # Generate a response. Only max_new_tokens bounds the output length:
        # the former max_length=120000 conflicted with it and duplicated
        # max_tokens.
        # NOTE(review): penalty_alpha is a contrastive-search setting and is
        # presumably ignored while do_sample=True — confirm for your
        # transformers version.
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1200,
            do_sample=True,
            top_k=5,
            penalty_alpha=0.6,
            temperature=0.5,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens (everything after the prompt)
        response = tokenizer.decode(outputs[0, input_ids.shape[-1]:], skip_special_tokens=True)
        conversation_history += f"Model: {response}\n"

        print(f"Model: {response}")

    except KeyboardInterrupt:
        # Handle manual interruption (Ctrl+C / kernel interrupt)
        print("Chat interrupted manually. Exiting...")
        break


In [None]:
# Show the full transcript accumulated in conversation_history; print() is
# used so the embedded newlines render as separate lines.
print(conversation_history)

In [None]:
# Display the 'prompt' field of the first test example.
test_dataset[0]['prompt']

In [None]:
# Display the 'output' field of the first test example.
test_dataset[0]['output']