In [1]:
!nvidia-smi

Mon Jun 12 18:17:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:01:00.0 Off |                  Off |
| 30%   40C    P5    77W / 300W |      0MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:41:00.0 Off |                  Off |
| 30%   39C    P5    69W / 300W |      0MiB / 49140MiB |      0%      Default |
|       

In [2]:
import os 
os.environ["HF_HOME"] ="/fs/nexus-scratch/skarki/.cache/huggingface"

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("eachadea/vicuna-13b-1.1")
model = AutoModelForCausalLM.from_pretrained("eachadea/vicuna-13b-1.1", device_map="auto")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:56<00:00, 18.98s/it]


In [4]:
import numpy as np
system="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."

def text_wrapper(text):
    return system + f"\nUSER: {text}\nASSISTANT:"

input_text = """Instruction: Determine the speaker of the given speech based solely on the style and not the content of the speech. Your choices are Barack Obama, Donald Trump, Joe Biden, Thomas Jefferson, Abraham Lincoln, Lyndon B. Johnson, Ronald Reagan, Franklin D. Roosevelt, George W. Bush, Bill Clinton, Woodrow Wilson, Elon Musk. Don't answer with a full sentence.

Speech:  Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.

Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.
"""
input_text = text_wrapper(input_text)

input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=1024, truncation=True)
output = model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7, output_scores=True, return_dict_in_generate=True)
response = tokenizer.decode(output["sequences"][0], skip_special_tokens=True)
print(response)

transition_scores = model.compute_transition_scores(
    output.sequences, output.scores, normalize_logits=True
)

input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = output.sequences[:, input_length:]
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")



A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
USER: Instruction: Determine the speaker of the given speech based solely on the style and not the content of the speech. Your choices are Barack Obama, Donald Trump, Joe Biden, Thomas Jefferson, Abraham Lincoln, Lyndon B. Johnson, Ronald Reagan, Franklin D. Roosevelt, George W. Bush, Bill Clinton, Woodrow Wilson, Elon Musk. Don't answer with a full sentence.

Speech:  Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.

Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that

In [5]:
import pandas as pd 

def clean_speech(row):
    txt = row["speech"]
    txt = txt.replace("\r", "")
    txt = txt.strip()
    return txt

def split_speech(row):
    speech = row["speech"]
    splits = []
    if len(speech) <= 1000:
        return [speech]

    target_slice = speech
    idx_from = 0
    idx_to = 1000
    done = False
    while not done:
        current_part = target_slice[idx_from:idx_to + 1]
        next_part = target_slice[idx_to:].split("\n")
        if len(next_part) == 0 or target_slice == "":
            end_part = ""
            done = True
        else:
            end_part = next_part[0]
        target_slice = target_slice[idx_to + len(end_part) + 2:] # 1000 chars + chars until the next \n + len("\n")
        splits.append(current_part + end_part)
    
    return splits
    
    
# df = pd.read_csv("~/personalized-llm/data/speeches/donald-trump.csv")
# df = pd.read_csv("~/personalized-llm/data/speeches/barack-obama.csv")
# df = pd.read_csv("~/personalized-llm/data/speeches/abraham-lincoln.csv")
# df = pd.read_csv("~/personalized-llm/data/speeches/joe-biden.csv")
# df = pd.read_csv("~/personalized-llm/data/speeches/thomas-jefferson.csv")
df = pd.read_csv("~/personalized-llm/data/speeches/elon-musk.csv")
df["speech"] = df.apply(clean_speech, axis=1)
df["speech"] = df.apply(split_speech, axis=1)
df = df.explode("speech")
df = df.reset_index()

In [6]:
df = df.drop(['speaker', 'index'], axis=1)

In [7]:
df = df[df["speech"]!= ""]
df.head()

Unnamed: 0,speech
0,I'll try to make this as interesting as possib...
2,So we had that but I wanted to do something mo...
4,And that's just an approximate evolution of th...
5,"In about February of last year, I'm sure you'r..."
6,"It happened coincidentally, that in the first ..."


In [8]:
def get_prediction(row):
    speech = row["speech"]
    input_text = f"""Instruction: Determine the speaker of the given speech based solely on the style and not the content of the speech. Your choices are Barack Obama, Donald Trump, Joe Biden, Thomas Jefferson, Abraham Lincoln, Elon Musk. Respond with just the name.

Speech: {speech}
"""
    input_text = text_wrapper(input_text)
    input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=1024, truncation=True)
    input_ids = input_ids.to("cuda")
    output = model.generate(input_ids, max_length=2048, do_sample=True, temperature=0.7, output_scores=True, return_dict_in_generate=True)
    response = tokenizer.decode(output["sequences"][0], skip_special_tokens=True)
    pred = response.split("ASSISTANT:")
    
    president = pred[1] if len(pred) > 1 else "None"
    if president.endswith("."):
        president = president[:-1]
    return president.strip()
    
df["prediction"] = df.apply(get_prediction, axis=1)


In [9]:
df.prediction.value_counts()

prediction
Elon Musk                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     746
Thomas Jefferson                                                                                                                                                                           

In [10]:
df.to_csv("elon-musk-pred-v3.csv")

In [11]:
df.shape

(1046, 2)

In [None]:
# import json 

# version = 1
# formatted_data = []
# president = "donald-trump"
# president_name = "Donald Trump"
# for i, row in df2.iterrows():
#     sentences = row.speech.split(". ")
#     inp = ". ".join(sentences[:4])
#     data = {
#             "instruction": f"You are {president_name}. Complete the speech as {president_name} in the first-person.",
#             "input": inp,
#             "output": row.speech
#     }
#     formatted_data.append(data)

# outfile = f'/nfshomes/skarki/personalized-llm/data/alpaca-data/{president}-v{version}.json'
# with open(outfile, 'w') as fout:
#   json.dump(formatted_data , fout)
#   fout.close()