## The main file to invoke My Virtual Moments application

#### 1. Prepare the LLM for question answering.
In this naive implementation, we want to first ensure that llama3 (or any other possible models) may respond to user requests well.

In [None]:
%pip install -r requirements.txt

In [1]:
# We want to load the model first
import accelerate, bitsandbytes
import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import LlamaTokenizerFast

model_path = '/ssdshare/LLMs/llama3-Chinese-chat-8b/'
tokenizer = LlamaTokenizerFast.from_pretrained(model_path,padding_side='left')
qconfig=BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(model_path, 
                                             device_map="auto", 
                                             quantization_config=qconfig) 
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'LlamaTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Now we define a function to get answers from the LLM
def chat(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    input_ids = inputs.input_ids.to("cuda")
    outputs = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=512, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def question_prompt(question):
    chat = [
        {"role": "system", "content": """ Please be a helpful assistant and answer the following question:"""},
        {"role": "user", "content": "Question: " + question},
    ]
    prompt = ""
    for message in chat:
        prompt += f"{message['role']}: {message['content']}\n"
    return prompt

def chat_with_llm(question):
    prompt = question_prompt(question)
    return chat(model, tokenizer, prompt)

In [8]:
# Utilize the functions defined above to chat with the model
print(chat_with_llm("What is the capital of France?"))

system:  Please be a helpful assistant and answer the following question:
user: Question: What is the capital of France?
system: The capital of France is Paris.


#### 2. Implement the LLM pipeline

In [17]:
from transformers import TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer

class LocalLlama:
    def __init__(self):
        self.model = model
        self.tokenizer = tokenizer

    def predict(self, input_text):
        input_ids = self.tokenizer(input_text, return_tensors="pt",  padding=True, truncation=True, max_length=4096).input_ids.to("cuda")
        outputs = self.model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=512, do_sample=True, temperature=0.7)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

In [18]:
# Construct a transformer pipeline
from transformers import pipeline
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

llm = LocalLlama()

pipe = pipeline(
    "text-generation",
    model = model,
    device_map = "cuda:0",
    max_length = 4096,
    tokenizer = tokenizer,
)

In [22]:
# Test if the llm chain works properly
input_text = "Hi! Tell me your name."
print(llm.predict(input_text))

Hi! Tell me your name. What's your name? Are you hungry? Are you tired? Do you have a job? Do you like dogs? Are you looking for a pet? Do you like cats? Are you allergic to cats? Are you allergic to dogs? Do you like to travel? Have you been to Europe? Have you been to Asia? Have you been to South America? Have you been to Africa? Have you been to Australia? Have you been to New Zealand? Have you been to Antarctica?


#### 3. Implement few-shot learning

In [4]:
from transformers import TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

def load_qa_pairs(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file if line.strip()]

    # Ensuring the file has an even number of lines for perfect pairing
    assert len(lines) % 2 == 0, "The text file should contain an even number of lines."

    # Pairing questions and answers
    qa_pairs = []
    for i in range(0, len(lines), 2):
        question = lines[i]
        answer = lines[i+1]
        qa_pairs.append(f"Question: {question} Answer: {answer}")
    
    return qa_pairs

def prepare_training_data(qa_pairs, block_size=256):
    # Concatenate QA pairs until the block size is reached
    training_instances = []
    current_block = ""
    for pair in qa_pairs:
        if len(current_block) + len(pair) + 1 > block_size:
            training_instances.append(current_block.strip())
            current_block = ""
        current_block += pair + " "
    if current_block:
        training_instances.append(current_block.strip())
    return training_instances

def tokenize_data(tokenizer, text_data, max_length=256):
    return tokenizer(text_data, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")

In [5]:
# Prepare tokenized training data
raw_data = load_qa_pairs("data/furina2.txt")
encodings = tokenize_data(tokenizer, prepare_training_data(raw_data))
tokenized_data = QADataset(encodings)

In [6]:
class FineTunedLlama:
    def __init__(self):
        self.model = model
        self.tokenizer = tokenizer
        self.fine_tuned_model = None

        # Create a data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer = self.tokenizer,
            mlm = False,
        )
        # Define the training arguments
        training_args = TrainingArguments(
            output_dir = "./llama_fine_tuned",
            overwrite_output_dir = True,
            num_train_epochs = 3,
            per_device_train_batch_size = 2,
            save_steps = 1000,
            save_total_limit = 2,
            logging_dir = "./logs",
        )

        # Create the trainer
        trainer = Trainer(
            model = self.model,
            args = training_args,
            data_collator = data_collator,
            train_dataset = tokenized_data,
        )
        trainer.train()
        self.fine_tuned_model = trainer.model
        print("Fine tuning completed.")

    def predict(self, input_text):
        input_ids = self.tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=4096).input_ids.to("cuda")
        if self.fine_tuned_model:
            print("Using the fine tuned model.")
            outputs = self.model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=512, do_sample=True, temperature=0.7)
        else:
            print("The model is not fine tuned yet. Using the original model.")
            outputs = self.model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=512, do_sample=True, temperature=0.7)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
ftllm = FineTunedLlama()
print(ftllm.predict("Hi!Are you free now?"))


#### 4. Trying to do few-shot learning using conversation

In [29]:
def load_conversation_context(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file.readlines()]
    
    assert len(lines) % 2 == 0, "Each question should have a corresponding answer."

    conversation = ""
    for i in range(0, 14, 2):
        question = lines[i]
        answer = lines[i+1]
        conversation += f"Question: {question}\nAnswer: {answer}\n"
    
    return conversation

class ConversationLlama:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def predict(self, input_text, conversation_context):
        # Add the new question to the ongoing conversation context
        full_text = "Do a role play and learn from the following QA examples:" + conversation_context + f"Now answer the given qeustion in similar tone: {input_text}\n"

        # Encode the text input to tensor
        input_ids = self.tokenizer(full_text, return_tensors="pt", padding=True, truncation=True, max_length=4096).input_ids.to("cuda")
        outputs = self.model.generate(input_ids, max_new_tokens=1000, pad_token_id=self.tokenizer.eos_token_id, do_sample=True, temperature=0.7)
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text


In [30]:
cllm = ConversationLlama(model, tokenizer)

# Load the conversation context
conversation_context = load_conversation_context("data/furina2.txt")

In [34]:
new_question = "If you adopted a beatiful dog, what would you say?"
response = cllm.predict(new_question, conversation_context)
print(response)

Do a role play and learn from the following QA examples:Question: What would you say when meeting for the first time?
Answer: Why are you just standing there with your mouth gaping? Ah, you must be stunned and at a loss for words... Understandable, it is I after all... Fontaine's most beloved star, Furina. I'm on a very tight schedule, so you're lucky to even get an appointment with me.
Question: What would you say when chatting about tea parties?
Answer: Tea parties are a must for the well-mannered. If you'd like to learn the proper etiquette, I'd be happy to teach you.
Question: What would you say when bored?
Answer: Boring... Isn't there anything else more interesting to do?
Question: What would you say when being popular?
Answer: *sigh* Being too popular can be such a hassle. Who knew the people would adore me so much?
Question: What would you say when it's raining?
Answer: It's pouring out here! Wait, the water levels aren't rising, are they?
Question: What would you say when it's