In [None]:
from types import SimpleNamespace

import autogen
from autogen import AssistantAgent, UserProxyAgent
import os
import json

import torch

from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM,GenerationConfig

from together import Together
import os
# Together API key. Fill in the literal below, or leave it empty to rely on a
# TOGETHER_API_KEY that is already set in the system environment.
together_api_key = ''
if together_api_key:
    # Only export a real key: writing the empty placeholder would overwrite
    # (and thereby erase) a key that is already present in the environment.
    os.environ['TOGETHER_API_KEY'] = together_api_key

# Together client; constructed with no arguments, so credentials are expected
# to come from the environment prepared above.
client = Together()
MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"

# Log in to the Hugging Face Hub.
# NOTE(review): the token literal is blank in this file and is passed through
# unchanged -- fill it in before running.
access_token_write = ''
login(token=access_token_write)

# Config for the assistant's custom client, stored as JSON in an environment
# variable so it can be read back by name later on.
# NOTE(review): CustomModelClient passes "model" to from_pretrained; this id is
# the same string as the Together MODEL above -- confirm it is also a valid
# Hugging Face repo id.
os.environ["OAI_CONFIG_LIST"] = json.dumps(
    [
        {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            "model_client_cls": "CustomModelClient",
            "device": 0,
            "n": 1,
            "params": {
                "max_new_tokens": 100,
                "top_k": 50,
                "temperature": 0.8,
                "do_sample": True,
                "return_full_text": False,
            },
        }
    ]
)

# Config for the user proxy's custom client. It lives under a separate
# variable name ("OAI_CONFIG_LISTs", with a trailing "s") so the two lists
# do not overwrite each other.
os.environ["OAI_CONFIG_LISTs"] = json.dumps(
    [
        {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "model_client_cls": "CustomModelClient2",
            "device": 0,
            "n": 1,
            "params": {
                "max_new_tokens": 100,
                "top_k": 50,
                "temperature": 0.8,
                "do_sample": True,
                "return_full_text": False,
            },
        }
    ]
)

class CustomModelClient:
    """Custom model client that answers chat requests with a local causal LM.

    The model and tokenizer named by ``config["model"]`` are loaded with
    Hugging Face Transformers. The class exposes ``create``,
    ``message_retrieval``, ``cost`` and ``get_usage``.
    """

    # Sampling options copied as-is from config["params"] into GenerationConfig.
    _SAMPLING_KEYS = ("do_sample", "temperature", "top_k", "top_p")

    def __init__(self, config, **kwargs):
        """Load the model/tokenizer and read the generation settings.

        Args:
            config: Dict with a required ``"model"`` entry and optional
                ``"device"`` (default ``"cuda"``) and ``"params"`` entries.
            **kwargs: Accepted and ignored.
        """
        print(f"CustomModelClient config: {config}")
        self.device = config.get("device", "cuda")
        self.model = AutoModelForCausalLM.from_pretrained(config["model"]).to(self.device)
        self.model_name = config["model"]
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
        # Reuse EOS as the padding token.
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        # params are set by the user and consumed by the user since they are
        # providing a custom model, so anything can be done here
        gen_config_params = config.get("params", {})
        self.max_length, self.sampling_params = self._parse_generation_params(gen_config_params)

        print(f"Loaded model {config['model']} to {self.device}")

    @classmethod
    def _parse_generation_params(cls, gen_config_params):
        """Split ``config["params"]`` into (new-token budget, sampling options).

        An explicit ``"max_length"`` keeps its historical meaning here (a
        budget of *new* tokens); otherwise ``"max_new_tokens"`` is honoured,
        falling back to 256. Keys outside ``_SAMPLING_KEYS`` (for example
        ``"return_full_text"``) are not forwarded.

        Returns:
            Tuple ``(budget, sampling)`` where ``sampling`` is a dict holding
            only the recognised sampling keys that were supplied.
        """
        if "max_length" in gen_config_params:
            budget = gen_config_params["max_length"]
        else:
            budget = gen_config_params.get("max_new_tokens", 256)
        sampling = {
            key: gen_config_params[key]
            for key in cls._SAMPLING_KEYS
            if key in gen_config_params
        }
        return budget, sampling

    def create(self, params):
        """Generate ``params["n"]`` (default 1) replies to ``params["messages"]``.

        Returns:
            A ``SimpleNamespace`` with ``model`` and ``choices``; each choice
            has ``message.content`` (generated text) and
            ``message.function_call`` (always ``None``).

        Raises:
            NotImplementedError: If streaming is requested.
        """
        if params.get("stream", False) and "messages" in params:
            raise NotImplementedError("Local models do not support streaming.")

        num_of_responses = params.get("n", 1)

        # can create my own data response class
        # here using SimpleNamespace for simplicity
        # as long as it adheres to the ClientResponseProtocol
        response = SimpleNamespace()

        inputs = self.tokenizer.apply_chat_template(
            params["messages"], return_tensors="pt", add_generation_prompt=True
        ).to(self.device)
        inputs_length = inputs.shape[-1]

        # max_length counts prompt + completion, so add the prompt length to
        # the new-token budget.
        generation_config = GenerationConfig(
            max_length=self.max_length + inputs_length,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
            # Without these the configured sampling settings are ignored and
            # every one of the n responses is the same greedy completion.
            **self.sampling_params,
        )

        response.choices = []
        response.model = self.model_name

        for _ in range(num_of_responses):
            outputs = self.model.generate(inputs, generation_config=generation_config)
            # Decode only the newly generated text, excluding the prompt, and
            # drop special tokens so they do not end up in the chat message.
            text = self.tokenizer.decode(outputs[0, inputs_length:], skip_special_tokens=True)
            choice = SimpleNamespace()
            choice.message = SimpleNamespace()
            choice.message.content = text
            choice.message.function_call = None
            response.choices.append(choice)

        return response

    def message_retrieval(self, response):
        """Return the ``message.content`` of every choice in ``response``."""
        return [choice.message.content for choice in response.choices]

    def cost(self, response) -> float:
        """Set ``response.cost`` to 0 and return 0."""
        response.cost = 0
        return 0

    @staticmethod
    def get_usage(response):
        """Return usage statistics; always an empty dict here."""
        # returns a dict of prompt_tokens, completion_tokens, total_tokens, cost, model
        # if usage needs to be tracked, else None
        return {}

class CustomModelClient2:
    """Custom model client with a local causal LM plus a remote "concise" pass.

    Replies are generated locally with Hugging Face Transformers;
    ``message_retrieval`` then sends the generated text to the module-level
    Together ``client`` and returns that completion instead. The instance
    also keeps a simple in-memory message history.
    """

    # Sampling options copied as-is from config["params"] into GenerationConfig.
    _SAMPLING_KEYS = ("do_sample", "temperature", "top_k", "top_p")

    def __init__(self, config, **kwargs):
        """Load the model/tokenizer and read the generation settings.

        Args:
            config: Dict with a required ``"model"`` entry and optional
                ``"device"`` (default ``"cuda"``) and ``"params"`` entries.
            **kwargs: Accepted and ignored.
        """
        print(f"CustomModelClient2 config: {config}")
        self.device = config.get("device", "cuda")
        self.history = []  # Store the history of all messages
        self.model = AutoModelForCausalLM.from_pretrained(config["model"]).to(self.device)
        self.model_name = config["model"]
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
        # Reuse EOS as the padding token.
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        # params are set by the user and consumed by the user since they are
        # providing a custom model, so anything can be done here
        gen_config_params = config.get("params", {})
        self.max_length, self.sampling_params = self._parse_generation_params(gen_config_params)

        print(f"Loaded model {config['model']} to {self.device}")

    @classmethod
    def _parse_generation_params(cls, gen_config_params):
        """Split ``config["params"]`` into (new-token budget, sampling options).

        An explicit ``"max_length"`` keeps its historical meaning here (a
        budget of *new* tokens); otherwise ``"max_new_tokens"`` is honoured,
        falling back to 256. Keys outside ``_SAMPLING_KEYS`` (for example
        ``"return_full_text"``) are not forwarded.

        Returns:
            Tuple ``(budget, sampling)`` where ``sampling`` is a dict holding
            only the recognised sampling keys that were supplied.
        """
        if "max_length" in gen_config_params:
            budget = gen_config_params["max_length"]
        else:
            budget = gen_config_params.get("max_new_tokens", 256)
        sampling = {
            key: gen_config_params[key]
            for key in cls._SAMPLING_KEYS
            if key in gen_config_params
        }
        return budget, sampling

    @staticmethod
    def _concise_prompt(choices):
        """Build the summarisation prompt from the text of ``choices``.

        Only ``choice.message.content`` is used (joined with newlines), so
        the prompt carries the generated text rather than the repr of the
        response objects.
        """
        generated = "\n".join(choice.message.content for choice in choices)
        return f"concise the response: {generated}"

    def create(self, params):
        """Generate ``params["n"]`` (default 1) replies to ``params["messages"]``.

        Returns:
            A ``SimpleNamespace`` with ``model`` and ``choices``; each choice
            has ``message.content`` (generated text) and
            ``message.function_call`` (always ``None``).

        Raises:
            NotImplementedError: If streaming is requested.
        """
        if params.get("stream", False) and "messages" in params:
            raise NotImplementedError("Local models do not support streaming.")

        num_of_responses = params.get("n", 1)

        response = SimpleNamespace()

        inputs = self.tokenizer.apply_chat_template(
            params["messages"], return_tensors="pt", add_generation_prompt=True
        ).to(self.device)
        inputs_length = inputs.shape[-1]

        # max_length counts prompt + completion, so add the prompt length to
        # the new-token budget.
        generation_config = GenerationConfig(
            max_length=self.max_length + inputs_length,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
            # Without these the configured sampling settings are ignored and
            # every one of the n responses is the same greedy completion.
            **self.sampling_params,
        )

        response.choices = []
        response.model = self.model_name

        for _ in range(num_of_responses):
            outputs = self.model.generate(inputs, generation_config=generation_config)
            # Decode only the newly generated text, excluding the prompt, and
            # drop special tokens so they do not end up in the chat message.
            text = self.tokenizer.decode(outputs[0, inputs_length:], skip_special_tokens=True)
            choice = SimpleNamespace()
            choice.message = SimpleNamespace()
            choice.message.content = text
            choice.message.function_call = None
            response.choices.append(choice)

        return response

    def add_to_history(self, role, message):
        """Add a message to the history."""
        self.history.append({"role": role, "content": message})

    def send_message(self, message):
        """Record ``message`` in the history under the ``"user_proxy"`` role."""
        self.add_to_history("user_proxy", message)

    def get_user_history(self):
        """Return the contents of all history entries with role ``"user_proxy"``."""
        return [entry["content"] for entry in self.history if entry["role"] == "user_proxy"]

    def message_retrieval(self, response):
        """Return a one-item list holding the remote "concise" completion.

        The generated text of ``response.choices`` is sent to the module-level
        Together ``client``; the content of its first choice is returned.
        """
        prompt = self._concise_prompt(response.choices)
        completion = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
            top_p=0.9,
        )
        return [completion.choices[0].message.content]

    def cost(self, response) -> float:
        """Set ``response.cost`` to 0 and return 0."""
        response.cost = 0
        return 0

    @staticmethod
    def get_usage(response):
        """Return usage statistics; always an empty dict here."""
        # returns a dict of prompt_tokens, completion_tokens, total_tokens, cost, model
        # if usage needs to be tracked, else None
        return {}

# ########################################################################################
# Prompts that give each agent its role in the conversation.
system_message = "You are a world-class AI-based coach."
user_message = (
    "You are a USER with some challenges, and you are here to seek help "
    "from a world-class AI-based coach."
)

# Read each config list back from its environment variable, keeping only the
# entries meant for the matching custom client class.
config_list_custom = autogen.config_list_from_json(
    "OAI_CONFIG_LIST", filter_dict={"model_client_cls": ["CustomModelClient"]}
)
config_list_custom2 = autogen.config_list_from_json(
    "OAI_CONFIG_LISTs", filter_dict={"model_client_cls": ["CustomModelClient2"]}
)

# Both agents run without human input.
assistant = AssistantAgent(
    "assistant",
    system_message=system_message,
    llm_config={"config_list": config_list_custom},
    human_input_mode="NEVER",
)
user_proxy = UserProxyAgent(
    "user_proxy",
    system_message=user_message,
    llm_config={"config_list": config_list_custom2},
    max_consecutive_auto_reply=5,
    code_execution_config=False,
    human_input_mode="NEVER",
)

# Attach the custom client classes named in the config lists.
assistant.register_model_client(model_client_cls=CustomModelClient)
user_proxy.register_model_client(model_client_cls=CustomModelClient2)

# The assistant opens the conversation.
assistant.initiate_chat(
    user_proxy,
    message="Hello, welcome to MetaMindful! How are you feeling today?",
)

In [2]:
import subprocess
import sys

# Specify the output file name
requirements_file = "requirements.txt"

# Generate requirements.txt from the packages installed for THIS interpreter.
try:
    # Run pip as a module of the running interpreter: a bare "pip" found on
    # PATH may belong to a different Python than the one executing this cell.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "freeze"],
        stdout=subprocess.PIPE,
        check=True,
    )
    # Write only after pip succeeded, so a failed run neither truncates an
    # existing file nor leaves a partial one behind. Bytes are written
    # unchanged, exactly as pip emitted them.
    with open(requirements_file, "wb") as file:
        file.write(result.stdout)
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: pip exited non-zero. OSError: the interpreter could
    # not be launched or the file could not be written.
    print(f"An error occurred: {e}")
else:
    print(f"'{requirements_file}' has been created successfully.")


'requirements.txt' has been created successfully.
