In [1]:
import sys
# Prepend the directory two levels above the current working directory to the
# module search path. Presumably this is the repository root that holds the
# `candle` and `experiments` packages imported in the next cell -- verify if
# the notebook is moved or launched from a different directory.
sys.path.insert(0, '../../')

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple, Union

import candle
import experiments.textgenutils as gutils

## (1) Load Model

In [3]:
"""
Get the LLaMA weights here: ai.meta.com/resources/models-and-libraries/llama-downloads/
28gb of RAM is need to run 7B model since everything is float32.
MODEL_DIR will look something like this:

    /mnt/disks/disk1/llama2/
    ├── tokenizer.model
    ├── tokenizer_checklist.chk
    ├── llama-2-7b
    │   ├── checklist.chk
    │   ├── consolidated.00.pth
    │   └── params.json
    ├── llama-2-7b-chat
    │   ├── checklist.chk
    │   ├── consolidated.00.pth
    │   └── params.json
    │   ...
    ...
"""
MODEL_SIZE = '13b-chat'
MODEL_DIR = '/mnt/disks/disk1/llama2/'
assert MODEL_SIZE.endswith('-chat')

model = candle.models.llama.Llama.from_pretrained(MODEL_SIZE, MODEL_DIR)
tokenizer = candle.models.llama.LlamaTokenizer(os.path.join(MODEL_DIR, 'tokenizer.model'))

  return self.fget.__get__(instance, owner)()


## (2) Have a conversation

In [7]:
# Names passed to the chat helper for the user and the assistant.
USER_NAME, ASST_NAME = 'John', 'LLaMA'

# Based on the Mistral system prompt, modified for conciseness.
# One sentence per entry; joined with single spaces into the final prompt.
SYSTEM_MESSAGE = ' '.join([
    'Always assist with care, respect, and truth.',
    'Respond with utmost utility yet securely.',
    'Avoid harmful, unethical, prejudiced, or negative content.',
    'Ensure replies promote fairness and positivity.',
    'Keep replies as concise as possible.',
])

# A chat fine-tuned model should be prompted through the LLaMA chat template,
# which supplies the proper delimiters ([INST], <<SYS>>, <s>, ...).
chat_template = candle.nlp.chattemplates.LlamaChatTemplate(
    system_message=SYSTEM_MESSAGE,
)

def start_conversation(profile_pic: str = '🙂'):
    """Launch an interactive chat session with the loaded model.

    Forwards the notebook-level `model`, `chat_template` and `tokenizer`,
    together with fixed display and sampling settings, to
    `gutils.interactive_conversation`.

    Args:
        profile_pic: Passed through as the user's profile picture (an emoji
            in this notebook).
    """
    # How each side of the conversation is rendered.
    display_settings = {
        'user_name': USER_NAME,
        'profile_pic': profile_pic,
        'user_bg_color': 'yellow',
        'asst_name': ASST_NAME,
        'asst_profile_pic': '🦙',
        'asst_bg_color': 'blue',
    }

    # Generation limits and sampling parameters; generation stops at the
    # tokenizer's end-of-sequence id.
    sampling_settings = {
        'max_response_length': 2048,
        'top_k': 100,
        'top_p': 0.90,
        'temperature': 0.8,
        'stop_token_idx': tokenizer.sp_model.eos_id(),
    }

    gutils.interactive_conversation(
        model, chat_template, tokenizer,
        **display_settings,
        **sampling_settings,
    )

In [8]:
# Interactive session; the recorded transcript below is a code-generation request.
start_conversation(profile_pic='👦🏻')

[1m< You are now talking with LLaMA. Send 'bye' to exit, 'clear' to reset cache. >[0m

[43m[1m👦🏻[0m[1m John:[0m Write a Python function to find the first repeated character in a given string.

[44m[1m🦙[0m[1m LLaMA:[0m Sure, I'd be happy to help! Here's a Python function that finds the first repeated character in a given string:
[37m<code>[0m
[38;5;28;01mdef[39;00m [38;5;21mfind_first_repeated_char[39m([38;5;28mstr[39m):
[38;5;250m    [39m[38;5;124;03m"""[39;00m
[38;5;124;03m    Finds the first repeated character in a given string.[39;00m

[38;5;124;03m    Args:[39;00m
[38;5;124;03m        str (str): The string to search for repeated characters.[39;00m

[38;5;124;03m    Returns:[39;00m
[38;5;124;03m        str (str): The first repeated character in the string.[39;00m

[38;5;124;03m    Example:[39;00m
[38;5;124;03m        find_first_repeated_char("hello")  # returns "l"[39;00m
[38;5;124;03m        find_first_repeated_char("banana")  # returns "a"[3

In [9]:
# Interactive session; the recorded transcript below is an open-ended explanation request.
start_conversation(profile_pic='👦🏻')

[1m< You are now talking with LLaMA. Send 'bye' to exit, 'clear' to reset cache. >[0m

[43m[1m👦🏻[0m[1m John:[0m Briefly explain some of the risks of open sourcing large language models.

[44m[1m🦙[0m[1m LLaMA:[0m Open sourcing large language models can pose several risks, including:

1. Unintended consequences: Large language models can be complex and difficult to interpret, making it challenging to anticipate their behavior and potential consequences.
2. Bias and discrimination: Large language models can perpetuate biases and discrimination present in the training data, which can have negative impacts on marginalized communities.
3. Security risks: Open sourcing large language models can create security risks, as they can be used for malicious purposes such as phishing, spamming, or spreading misinformation.
4. Dependence on external data: Large language models are often trained on external data sources, which can be unreliable or unavailable, leading to model downtime or i