In [1]:
# !uvicorn causal.text_generation_api:app --reload --host 127.0.0.1 --port 8889

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import pandas as pd
from datasets import load_from_disk
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import requests
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from langchain import PromptTemplate
from utils.evaluate import calculate_cosine_similarity
from tqdm import tqdm
import psutil
import subprocess
import signal
import re

  from .autonotebook import tqdm as notebook_tqdm


Check if the api is running

In [3]:
def find_uvicorn_processes():
    uvicorn_processes = []
    for process in psutil.process_iter(["pid", "name", "cmdline"]):
        if "uvicorn" in process.info["name"].lower() or (
            "uvicorn" in process.info["cmdline"] if process.info["cmdline"] else False
        ):
            uvicorn_processes.append(process.info)

    return uvicorn_processes


def kill_process_by_pid(pid):
    try:
        process = psutil.Process(pid)
        process.terminate()  # Alternatively, use process.kill() for a forceful termination
        print(f"Process {pid} has been terminated.")
    except psutil.NoSuchProcess:
        print(f"No process found with PID {pid}.")


uvicorn_processes = find_uvicorn_processes()

if uvicorn_processes:
    print("The follwoing is ruuning: ")
    for process in uvicorn_processes:
        print(
            f"PID: {process['pid']}, Name: {process['name']}, Command: {' '.join(process['cmdline'])}"
        )
else:
    print("Uvicore unfound")

The follwoing is ruuning: 
PID: 24290, Name: uvicorn, Command: /home/llama/Sandbox_IA/env/bin/python /home/llama/Sandbox_IA/env/bin/uvicorn app.main:app --reload --port 8001


Open the server using uvicorn

In [4]:
directory = "causal"
module = "text_generation_api_llama2_7B"
model = "LLama_7B"
host = "127.0.0.1"
port = 8899
quantization = "4bit"
python_executable = "~/Personal_Directories/srb/causalEnv/bin/python"
test_num = 300

In [5]:
import subprocess
from concurrent.futures import ThreadPoolExecutor

# %nohup run_uvicorn_command(directory, your_module, host,port)
uvicorn_command = f"{python_executable} -m uvicorn {directory}.{module}:app --reload --host {host} --port {port}"
subprocess.Popen(uvicorn_command, shell=True)

<Popen: returncode: None args: '~/Personal_Directories/srb/causalEnv/bin/pyt...>

INFO:     Will watch for changes in these directories: ['/home/llama/Personal_Directories/srb/causalllm-main']
INFO:     Uvicorn running on http://127.0.0.1:8899 (Press CTRL+C to quit)
INFO:     Started reloader process [1261240] using StatReload
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00,  2.38it/s]
INFO:     Started server process [1261242]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


In [6]:
def extract_answer(input_string):
    match = re.search(r"<Answer>(.*?)<\/Answer>", input_string)
    if match:
        answer = match.group(1)
        return answer, True
    else:
        return input_string, False

Prepare dataset

In [7]:
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 20)
pd.set_option("max_colwidth", 1000)

local_dataset_path = "resource/data/databricks-dolly-15k"
dataset = load_from_disk(local_dataset_path)
df = dataset.to_pandas()
test_df = df[df["category"] == "closed_qa"]
test = test_df.head(test_num)

In [9]:
url = "http://127.0.0.1:8899/generate-text"
system_prompt = """
Please strictly answer my question, but format it using custom tags like <Answer>Your answer here<Answer>. If the question cannot be answered, just answer with "I don't know". """
template = """
<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

Context: {context_message}
Question: {question_message}

[/INST]

"""

# template = """
# Instruction:{system_prompt}
# Context: {context_message}
# Question: {question_message}
# """
prompt_template = PromptTemplate.from_template(template)

columns = ["temperature", "model", "ground_truth_response", "prompt"]
result_df = pd.DataFrame(columns=columns)

# build the preprocessed csv files
for index, row in test.iterrows():
    question = row["instruction"]
    context = row["context"]
    response = row["response"]
    prompt = prompt_template.format(
        system_prompt=system_prompt, context_message=context, question_message=question
    )

    for temperature in np.arange(0.1, 2.0, 0.3):
        data = {
            "temperature": temperature,
            "model": model,
            "ground_truth_response": response,
            "prompt": str(prompt),
        }
        result_df = pd.concat([result_df, pd.DataFrame([data])], ignore_index=True)


result_df.to_excel(f"{module}_{quantization}_{test_num}.xlsx")

  result_df = pd.concat([result_df, pd.DataFrame([data])], ignore_index=True)


In [10]:
result_df = pd.read_excel(f"{module}_{quantization}_{test_num}.xlsx")
result_df["temperature"] = result_df["temperature"].round(2)

In [11]:
from utils.huggingfaceUtil import generate_response_hf

In [12]:
from utils.huggingfaceUtil import generate_response_hf
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import time  # Import the time module

csv_filename = f"{module}_{quantization}_{test_num}.csv"

pbar = tqdm(total=len(result_df))

responses = []

similarities = []

for index, row in result_df.iterrows():
    start_time = time.time()  # Record the start time
    response = generate_response_hf(row)
    end_time = time.time()  # Record the end time
    elapsed_time = end_time - start_time  # Calculate the elapsed time
    updated_row = row.copy()
    response_str = response[0]["generated_text"]
    updated_row["full answer"] = response_str
    answer, label = extract_answer(response_str)
    updated_row["generated_response"] = answer
    updated_row["label"] = label
    updated_row["elapsed_time"] = elapsed_time  # Add elapsed time to the row
    # Add the timestamp to the updated_row
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    updated_row["timestamp"] = timestamp
    updated_dataframe = pd.DataFrame([updated_row])
    updated_dataframe.to_csv(csv_filename, index=False, mode="a", header=not index)
    pbar.update(1)

pbar.close()

  0%|          | 0/2100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


0.1


  0%|          | 1/2100 [00:20<11:46:35, 20.20s/it]

INFO:     127.0.0.1:56792 - "POST /generate-text HTTP/1.1" 200 OK
0.4


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/2100 [00:40<11:46:59, 20.22s/it]

INFO:     127.0.0.1:38050 - "POST /generate-text HTTP/1.1" 200 OK
0.7


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/2100 [00:59<11:31:39, 19.79s/it]

INFO:     127.0.0.1:52066 - "POST /generate-text HTTP/1.1" 200 OK
1.0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/2100 [01:18<11:14:31, 19.31s/it]

INFO:     127.0.0.1:42932 - "POST /generate-text HTTP/1.1" 200 OK
1.3


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/2100 [01:41<12:08:29, 20.86s/it]

INFO:     127.0.0.1:41430 - "POST /generate-text HTTP/1.1" 200 OK
1.6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

INFO:     127.0.0.1:39980 - "POST /generate-text HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1243729]


cuda


Process SpawnProcess-2:
Traceback (most recent call last):
  File "/usr/lib64/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib64/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/llama/Personal_Directories/srb/causalEnv/lib64/python3.9/site-packages/uvicorn/_subprocess.py", line 78, in subprocess_started
    target(sockets=sockets)
  File "/home/llama/Personal_Directories/srb/causalEnv/lib64/python3.9/site-packages/uvicorn/server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
  File "/usr/lib64/python3.9/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/lib64/python3.9/asyncio/base_events.py", line 647, in run_until_complete
    return future.result()
  File "/home/llama/Personal_Directories/srb/causalEnv/lib64/python3.9/site-packages/uvicorn/server.py", line 69, in serve
    await self._serve(sockets)
  File 

cuda


Loading checkpoint shards: 100%|██████████| 19/19 [00:08<00:00,  2.31it/s]
INFO:     Started server process [1253072]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
