In [1]:
import os
import requests
import llama_cpp
import mimetypes
from PIL import Image
from typing import Final
from telegram import Update
from telegram.ext import Application, CommandHandler, MessageHandler, filters, ContextTypes
from transformers.tools.agent_types import AgentText, AgentAudio
from pydub import AudioSegment
from moviepy.editor import VideoFileClip
from transformers.tools import HfAgent
class OmniBot:
    """Chat wrapper around a local llama.cpp model that can ask for tool use.

    The bot keeps the running conversation in ``self.history`` (a list of
    ``{"role": ..., "content": ...}`` dicts) and prepends a fixed system
    prompt on every call. The system prompt instructs the model to reply with
    ``Action:`` / ``Action Input:`` lines when the user wants a tool.
    """

    # Lower-cased markers searched for (case-insensitively) in a reply to
    # decide whether the model is requesting a tool.
    _TOOL_KEYWORDS = ("action", "action input")

    def __init__(self, model_path, n_ctx=16392, n_gpu_layers=30):
        """Load the model and start with an empty conversation.

        Args:
            model_path: Path to the model file, forwarded to ``llama_cpp.Llama``.
            n_ctx: Context size forwarded to ``llama_cpp.Llama``.
            n_gpu_layers: Number of layers to offload to the GPU, forwarded
                to ``llama_cpp.Llama``.
        """
        self.llama_model = llama_cpp.Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=True,
        )
        self.history = []

    def system_message(self, user_name=None, user_mutter_tongue=None, user_german_proficiency=None):
        """Return the fixed system prompt describing the bot and its tools.

        The three parameters are accepted for backward compatibility but are
        not used: the returned prompt is the same for every call.
        """
        return """
            You are OmniBot, developed by enpoi.com and Eren Kalinsazlioglu. Your job is to be a respectfull assistant and comunicate with the user.
            Provide concise and on point answers mostly.
            if the user wants something other than a normal conversation, use fallowing tools, you dont have to say anything except the command to use the tools.

            you have access to these fallowing tools:
            'document_qa': PreTool(task='document-question-answering', description='This is a tool that answers a question about an document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.'),
            'image_captioner': PreTool(task='image-captioning', description='This is a tool that generates a description of an image. It takes an input named `image` which should be the image to caption, and returns a text that contains the description in English.'),
            'image_qa': PreTool(task='image-question-answering', description='This is a tool that answers a question about an image. It takes an input named `image` which should be the image containing the information, as well as a `question` which should be the question in English. It returns a text that is the answer to the question.'),
            'image_segmenter': PreTool(task='image-segmentation', description='This is a tool that creates a segmentation mask of an image according to a label. It cannot create an image.It takes two arguments named `image` which should be the original image, and `label` which should be a text describing the elements what should be identified in the segmentation mask. The tool returns the mask.' ),
            'transcriber': PreTool(task='speech-to-text', description='This is a tool that transcribes an audio into text. It takes an input named `audio` and returns the transcribed text.'),
            'summarizer': PreTool(task='summarization', description='This is a tool that summarizes an English text. It takes an input `text` containing the text to summarize, and returns a summary of the text.'),
            'text_classifier': PreTool(task='text-classification', description='This is a tool that classifies an English text using provided labels. It takes two inputs: `text`, which should be the text to classify, and `labels`, which should be the list of labels to use for classification. It returns the most likely label in the list of provided `labels` for the input text.'),
            'text_qa': PreTool(task='text-question-answering', description='This is a tool that answers questions related to a text. It takes two arguments named `text`, which is the text where to find the answer, and `question`, which is the question, and returns the answer to the question.'),
            'text_reader': PreTool(task='text-to-speech', description='This is a tool that reads an English text out loud. It takes an input named `text` which should contain the text to read (in English) and returns a waveform object containing the sound.'),
            'translator': PreTool(task='translation', description="This is a tool that translates text from a language to another. It takes three inputs: `text`, which should be the text to translate, `src_lang`, which should be the language of the text to translate and `tgt_lang`, which should be the language for the desired ouput language. Both `src_lang` and `tgt_lang` are written in plain English, such as 'Romanian', or 'Albanian'. It returns the text translated in `tgt_lang`."),
            'image_transformer': PreTool(task='image-transformation', description='This is a tool that transforms an image according to a prompt. It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. The prompt should only contain descriptive adjectives, as if completing the prompt of the original image. It returns the modified image.'),
            'text_downloader': PreTool(task='text-download', description='This is a tool that downloads a file from a `url`. It takes the `url` as input, and returns the text contained in the file.'),
            'image_generator': PreTool(task='text-to-image', description='This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image.'),
            'video_generator': PreTool(task='text-to-video', description='This is a tool that creates a video according to a text description. It takes an input named `prompt` which contains the image description, as well as an optional input `seconds` which will be the duration of the video. The default is of two seconds. The tool outputs a video object.')

            to use these tools you you must only response in this format, if the user wants to use tools:
            Action: 'tool_task'
            Action Input: 'Description of what the user wants'
            """

    def add_message(self, role, content):
        """Append one ``{"role": role, "content": content}`` entry to the history."""
        self.history.append({"role": role, "content": content})

    def generate_response(self, user_input):
        """Send ``user_input`` to the model and report whether it asked for a tool.

        The user turn is stored in the history wrapped as ``"Q: ... A: "`` and
        the raw (unstripped) model reply is stored as the assistant turn.

        Args:
            user_input: The text typed by the user.

        Returns:
            A ``(reply, contains_keyword)`` tuple: the model reply with
            leading/trailing whitespace removed, and ``True`` when the reply
            contains one of the tool markers (compared case-insensitively).
        """
        # Format the prompt consistently with the "Q:" stop sequence below.
        formatted_input = f"Q: {user_input} A: "
        self.add_message("user", formatted_input)

        # The system prompt is rebuilt and prepended on every call; it is not
        # stored in the history itself.
        messages = [{"role": "system", "content": self.system_message()}] + self.history

        completion = self.llama_model.create_chat_completion(
            messages=messages,
            max_tokens=250,
            stop=["Q:"],
        )

        # Extract only the assistant's response.
        assistant_response = completion['choices'][0]['message']['content']
        self.add_message("assistant", assistant_response)

        # Compare case-insensitively: the prompt asks for "Action: ...", which
        # a case-sensitive search for lowercase "action" would never match.
        lowered_response = assistant_response.lower()
        contains_keyword = any(keyword in lowered_response for keyword in self._TOOL_KEYWORDS)

        return assistant_response.strip(), contains_keyword

    def clear_memory(self):
        """Forget the whole conversation by resetting the history to empty."""
        self.history = []

# The model location can be overridden with the OMNIBOT_MODEL_PATH environment
# variable so the notebook is not tied to one machine's directory layout; the
# original local path remains the default.
omnibot = OmniBot(
    model_path=os.environ.get(
        "OMNIBOT_MODEL_PATH",
        "/home/adam/Multi-Modal_Language_Model/mistral-7b-instruct-v0.1.Q4_0.gguf",
    )
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2070, compute capability 7.5
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/adam/Multi-Modal_Language_Model/mistral-7b-instruct-v0.1.Q4_0.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1

In [4]:
# NOTE(review): `OmniBotAgent` does not look like a class shipped by upstream
# `transformers.tools` — presumably this relies on a locally patched install;
# confirm before sharing the notebook.
from transformers.tools import OmniBotAgent

# Wraps the `omnibot` instance created in an earlier cell, so that cell must
# have been run first.
omni_bot_agent = OmniBotAgent(omnibot)

# Example usage: send one natural-language request through the agent and
# display whatever it returns as the cell output.
response = omni_bot_agent.chat("generate an image of a man walking")
response

Llama.generate: prefix-match hit


==Explanation from the agent==




llama_print_timings:        load time =    2542.36 ms
llama_print_timings:      sample time =       0.32 ms /     1 runs   (    0.32 ms per token,  3174.60 tokens per second)
llama_print_timings: prompt eval time =  143793.14 ms /  3361 tokens (   42.78 ms per token,    23.37 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  143840.04 ms


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LocalAgent

checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"

# device_map="auto" lets Accelerate place the weights across GPU, CPU and (via
# `offload_folder`) disk. With the previous device_map="cuda" every module was
# forced onto the GPU, so `offload_folder` had no effect.
# NOTE(review): bfloat16 support depends on the GPU generation — confirm it is
# appropriate for the target card, otherwise consider torch.float16.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    offload_folder="offload",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Build a local agent around the loaded model and run a single request.
agent = LocalAgent(model, tokenizer)
agent.run("Draw me a picture of rivers and lakes.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 