# **Pipline Causal Analysis - Temperature 4bit**

In [2]:
import pandas as pd

# from datasets import load_from_disk
# from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# import torch
# import numpy as np
# import requests
# import pandas as pd
# from sklearn.metrics.pairwise import cosine_similarity
# from langchain import PromptTemplate
# from utils.ollamaUtil import generate_response
# from utils.evaluate import calculate_cosine_similarity
# from tqdm import tqdm

pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 20)
pd.set_option("max_colwidth", 1000)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Data Preparation - Take only 300 for the testing

In [2]:
local_dataset_path = "resource/data/databricks-dolly-15k"
dataset = load_from_disk(local_dataset_path)
df = dataset.to_pandas()
test_df = df[df["category"] == "closed_qa"]
test = test_df.head(20)

In [3]:
def remove_tags(input_string):
    start_tag = "<Answer>"
    end_tag = "</Answer>"

    if start_tag in input_string and end_tag in input_string:
        return input_string.split(start_tag, 1)[1].split(end_tag, 1)[0]
    else:
        return input_string

In [5]:
import torch


num_gpus = torch.cuda.device_count()

if num_gpus > 0:

    print(f"Found {num_gpus} GPU(s).")

    for gpu_index in range(num_gpus):

        torch.cuda.set_device(gpu_index)

        allocated_memory = torch.cuda.memory_allocated()

        max_memory = torch.cuda.get_device_properties(gpu_index).total_memory

        print(
            f"GPU {gpu_index}: Allocated Memory = {allocated_memory / 1024**3:.2f} GB, Max Memory = {max_memory / 1024**3:.2f} GB"
        )
else:
    print("No GPU available. Using CPU.")

torch.cuda.set_device(1)

Found 4 GPU(s).
GPU 0: Allocated Memory = 0.00 GB, Max Memory = 79.11 GB
GPU 1: Allocated Memory = 0.00 GB, Max Memory = 79.11 GB
GPU 2: Allocated Memory = 0.00 GB, Max Memory = 79.11 GB
GPU 3: Allocated Memory = 0.00 GB, Max Memory = 79.11 GB


In [5]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import torch
import os


def start_text_generation_api(api_params: dict, model_name: str, model_params: dict):
    app = FastAPI(**api_params)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, trust_remote_code=True, device_map=device
    )
    generation_config = GenerationConfig.from_pretrained(model_name, **model_params)
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        generation_config=generation_config,
    )

    class InputText(BaseModel):
        text: str

    @app.post("/generate-text")
    def generate_text(input_text: InputText):
        try:
            generated_text = text_pipeline(input_text.text, max_length=250)[0][
                "generated_text"
            ]
            return {"result": generated_text}
        except Exception as e:
            raise HTTPException(
                status_code=500, detail=f"Error generating text: {str(e)}"
            )

    uvicorn_cmd = f"uvicorn {__name__}:app --host {api_params['host']} --port {api_params['port']} --reload"
    print(
        f"FastAPI is ongoing, please use the following address : http://{api_params['host']}:{api_params['port']}"
    )
    os.system(uvicorn_cmd)


api_params = {
    "host": "127.0.0.2",
    "port": 8889,
}

model_name = "meta-llama/Llama-2-7b-hf"

model_params = {
    "max_new_tokens": 1024,
    "temperature": 0.0001,
    "top_p": 0.95,
    "do_sample": True,
    "repetition_penalty": 1.15,
}

start_text_generation_api(api_params, model_name, model_params)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


FastAPI is ongoing, please use the following address : http://127.0.0.2:8889


INFO:     Will watch for changes in these directories: ['/home/llama/Personal_Directories/srb/causalllm-main']
INFO:     Uvicorn running on http://127.0.0.2:8889 (Press CTRL+C to quit)
INFO:     Started reloader process [609937] using StatReload
ERROR:    Error loading ASGI app. Attribute "app" not found in module "__main__".


In [None]:
import requests

input_text = {"text": "Your input is here for the verification."}
response = requests.post("http://127.0.0.1:8000/generate-text", json=input_text)

if response.status_code == 200:
    result = response.json()["result"]
    print(f"Generated Text: {result}")
else:
    print(f"Error: {response.status_code}, {response.text}")

In [5]:
import torch
from torch import bfloat16


TOKEN = "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"
# device = "cuda" if torch.cuda.is_available() else "cpu"
# torch.cuda.set_device(2)

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

ModuleNotFoundError: No module named 'torch'

In [4]:
import torch
from torch import bfloat16

TOKEN = "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"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.set_device(3)

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

model_id = "meta-llama/Llama-2-70b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=TOKEN)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config,
    # trust_remote_code=True,
    device_map=device,
    use_auth_token=TOKEN,
)

# Generate
# inputs = tokenizer("Hello world", return_tensors="pt").to("cuda")
# generated_ids = model.generate(
#     **inputs,
#     do_sample=True,
#     top_p=0.95,
#     temperature=0.01,
#     max_length=250,
# )
# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

### Option 2
# Replace 'XXX' with the model variant of your choosing

ModuleNotFoundError: No module named 'torch'

In [4]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-13b-chat-hf",
    device_map="auto",
    # model_kwargs={"load_in_8bit": True}, # quantize to 8-bit
    # model_kwargs={"load_in_4bit": True}, # quantize to 4-bit
    # use_auth_token=TOKEN,
)

# Generate
pipe(
    "Hello world",
    do_sample=True,
    top_p=0.95,
    temperature=0.01,
    max_length=250,
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Hello world!\n\nThis is a test message.\n\nPlease ignore.\n\nThank you.'}]

In [2]:
tokenizer.save_pretrained("model/Llama-2-70b-chat-hf")
model.save_pretrained("model/Llama-2-70b-chat-hf")

In [2]:
# model.save_pretrained("model/Llama-2-7b-hf")
# tokenizer.save_pretrained("model/Llama-2-7b-hf")

tokenizer = AutoTokenizer.from_pretrained(
    "/home/llama/Personal_Directories/srb/causalllm-main/model/Llama-2-70b-chat-hf"
)
model = AutoModelForCausalLM.from_pretrained(
    "/home/llama/Personal_Directories/srb/causalllm-main/model/Llama-2-70b-chat-hf"
)

Loading checkpoint shards: 100%|██████████| 61/61 [00:39<00:00,  1.53it/s]


In [9]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    model_kwargs={"load_in_8bit": True},  # quantize to 8-bit
    model_kwargs={"load_in_4bit": True},  # quantize to 4-bit
    # use_auth_token=TOKEN,
)

pipe(
    "Hello world",
    do_sample=True,
    top_p=0.95,
    temperature=0.01,
    max_length=250,
)

[{'generated_text': 'Hello world!\n\nThis is a test message.\n\nPlease ignore.\n\nThank you.'}]

In [None]:
url = "http://localhost:11434/api/generate"
instruction = """Instruction: Please strictly answer the question, but format it using custom tags like <Answer>Your answer here<Answer>. If the question cannot be answered, just answer with "I don't know". """
template = """
Instruction: {instruction}
Context: {context}
Question: {question}
"""
prompt_template = PromptTemplate.from_template(template)
columns = [
    "temperature",
    "model",
    "model_tag",
    "question",
    "context",
    "ground_truth_response",
    "prompt",
    "payload",
]
result_df = pd.DataFrame(columns=columns)

In [None]:
# build the preprocessed csv files
for index, row in test.iterrows():
    question = row["instruction"]
    context = row["context"]
    response = row["response"]
    category = row["category"]

    prompt = prompt_template.format(
        instruction=instruction, context=context, question=question
    )

    for model in ["7b"]:
        for temperature in np.arange(0.1, 1.1, 0.3):
            model_tag = "llama2:" + model
            data = {
                "temperature": temperature,
                "model": "llama" + model,
                "model_tag": model_tag,
                "question": question,
                "context": context,
                "ground_truth_response": response,
                "prompt": str(prompt),
            }
            result_df = pd.concat([result_df, pd.DataFrame([data])], ignore_index=True)


result_df.to_excel("job_2_temperature_newtest_new.xlsx")
loaded_df = pd.read_excel("job_2_temperature_newtest_new.xlsx")

In [None]:
# Run the experiments
csv_filename = "job_2_temperature_newtest_new.csv"
pbar = tqdm(total=len(result_df))
responses = []
similarities = []
for index, row in result_df.iterrows():
    response = remove_tags(generate_response(row))
    similarity = calculate_cosine_similarity(response, row["ground_truth_response"])
    updated_row = row.copy()
    updated_row["generated_response"] = response
    updated_row["cosine_similarity"] = similarity
    updated_dataframe = pd.DataFrame([updated_row])
    updated_dataframe.to_csv(csv_filename, index=False, mode="a", header=not index)
    pbar.update(1)
pbar.close()

In [11]:
import pandas as pd

result_df = pd.read_csv("job_3_temperature_llama2-7b-result.csv")

In [12]:
result_df

Unnamed: 0.1,Unnamed: 0,temperature,model,model_tag,question,context,ground_truth_response,prompt,payload,generated_response
0,0,0.1,llama13b,llama2:13b,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,\nInstruction: Please strictly answer the ques...,,\nInstruction: Please strictly answer the ques...
1,1,0.4,llama13b,llama2:13b,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,\nInstruction: Please strictly answer the ques...,,\nInstruction: Please strictly answer the ques...
2,Unnamed: 0,temperature,model,model_tag,question,context,ground_truth_response,prompt,payload,generated_response
3,0,0.1,llama13b,llama2:13b,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,\nInstruction: Please strictly answer the ques...,,
4,1,0.4,llama13b,llama2:13b,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,\nInstruction: Please strictly answer the ques...,,
...,...,...,...,...,...,...,...,...,...,...
3873,1769,1.6,llama13b,llama2:13b,Which animal is associated with Chengdu?,"""Chengtu, is a sub-provincial city which serve...","Chengdu is associated with the giant panda, a ...",\nInstruction: Please strictly answer the ques...,,\nInstruction: Please strictly answer the ques...
3874,1770,1.9,llama13b,llama2:13b,Which animal is associated with Chengdu?,"""Chengtu, is a sub-provincial city which serve...","Chengdu is associated with the giant panda, a ...",\nInstruction: Please strictly answer the ques...,,\nInstruction: Please strictly answer the ques...
3875,1771,0.1,llama13b,llama2:13b,Given this paragraph about hockey what are dif...,Hockey is a term used to denote a family of va...,"Hockey can refer to Ice Hockey, Field Hockey, ...",\nInstruction: Please strictly answer the ques...,,\nInstruction: Please strictly answer the ques...
3876,1772,0.4,llama13b,llama2:13b,Given this paragraph about hockey what are dif...,Hockey is a term used to denote a family of va...,"Hockey can refer to Ice Hockey, Field Hockey, ...",\nInstruction: Please strictly answer the ques...,,\nInstruction: Please strictly answer the ques...


In [13]:
import psutil
import subprocess
import signal


def find_uvicorn_processes():
    uvicorn_processes = []
    for process in psutil.process_iter(["pid", "name", "cmdline"]):
        if "uvicorn" in process.info["name"].lower() or (
            "uvicorn" in process.info["cmdline"] if process.info["cmdline"] else False
        ):
            uvicorn_processes.append(process.info)

    return uvicorn_processes


def kill_process_by_pid(pid):
    try:
        process = psutil.Process(pid)
        process.terminate()  # Alternatively, use process.kill() for a forceful termination
        print(f"Process {pid} has been terminated.")
    except psutil.NoSuchProcess:
        print(f"No process found with PID {pid}.")


uvicorn_processes = find_uvicorn_processes()

if uvicorn_processes:
    print("The follwoing is ruuning: ")
    for process in uvicorn_processes:
        print(
            f"PID: {process['pid']}, Name: {process['name']}, Command: {' '.join(process['cmdline'])}"
        )
else:
    print("Uvicore unfound")

The follwoing is ruuning: 
PID: 1129745, Name: uvicorn, Command: /usr/bin/python3 /home/llama/.local/bin/uvicorn causal.text_generation_api:app --reload --host 127.0.0.1 --port 8899
PID: 2371149, Name: uvicorn, Command: /home/llama/Sandbox_IA/env/bin/python /home/llama/Sandbox_IA/env/bin/uvicorn app.main:app --reload --port 8001
