# Dependencies

In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install torch
!pip install accelerate

In [None]:
# Sanity check: report the NVIDIA GPUs visible to this runtime.
!nvidia-smi

# Loading the model

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face Hub identifier of the checkpoint used for both tokenizer and model.
# NOTE(review): this is the base checkpoint, while PROMPT_TEMPLATE uses the
# [INST]/<<SYS>> chat markup — presumably a "-chat-hf" variant was intended; confirm.
model_name = 'meta-llama/Llama-2-7b-hf'

# `tokenizer` and `model` are module-level globals used by the inference helpers.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Let the library decide device placement of the weights.
    device_map='auto',
    # NOTE(review): this allows code shipped in the checkpoint repository to be
    # executed locally; presumably not needed for this checkpoint — confirm and drop.
    trust_remote_code=True,
    # Load the weights in bfloat16.
    torch_dtype=torch.bfloat16
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.63s/it]


# Inference

In [5]:
# Generation settings; generate_output() forwards each of them to model.generate().
# Passed as `do_sample`.
DO_SAMPLE = True
# Passed as `max_new_tokens` (applies to every generate_output() call, i.e. per chunk).
MAX_NEW_TOKENS = 1024
# Passed as `temperature`.
TEMPERATURE = .3
# Passed as `top_k`.
TOP_K = 100

def result_from_output_sequence(output, n_shot = 0):
    """
    Extract the generated answer from a decoded sequence that still contains the prompt.

    Args:
        output: Decoded text containing the prompt followed by the completion.
        n_shot: Number of additional '[/INST]' markers to skip on top of the
            first one (everything up to the (1 + n_shot)-th marker is dropped).

    Returns:
        The text after the skipped markers, without leading spaces and without
        a trailing '</s>' token. An empty string is returned when the text
        contains fewer than 1 + n_shot markers.
    """

    # Un-prompt output: drop everything up to the (1 + n_shot)-th marker; any
    # later markers are kept as part of the result.
    delimiter = '[/INST]'
    output = delimiter.join(output.split(delimiter)[1 + n_shot:])

    # Remove white spaces in front of summary (spaces only, newlines are kept)
    output = output.lstrip(' ')

    # Remove the end-of-sequence token, but only when it is actually there:
    # a generation that stopped for another reason (e.g. the token budget ran
    # out) has no '</s>' and must not lose its last characters.
    eos_token = '</s>'
    if output.endswith(eos_token):
        output = output[:-len(eos_token)]

    return output

def generate_output(prompt, counter, chunks):
    """
    Run the global model on one prompt and return the decoded sequence.

    Args:
        prompt: Full prompt text for the current chunk.
        counter: Index of the current chunk; only used in the progress message.
        chunks: List of all chunks; only its length is used, in the progress message.

    Returns:
        The first generated sequence decoded with the global tokenizer and
        stripped of surrounding whitespace. Special tokens are not skipped
        during decoding.
    """
    # Send the inputs to the device the model reports instead of a hard-coded
    # 'cuda': the model is placed with device_map='auto', so its location is
    # decided at load time.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    print('---- Computing chunk', counter, 'of', len(chunks), 'total, size:', len(inputs['input_ids'][0]), 'tokens')
    output_ids = model.generate(
        **inputs,
        do_sample=DO_SAMPLE,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_k=TOP_K,
    )
    return tokenizer.decode(output_ids[0]).strip()

In [6]:
def token_len(text):
    """Return the number of token ids the global tokenizer produces for `text`."""
    encoded = tokenizer(text, return_tensors="pt")
    # The encoding is batched; the single input is the first row.
    first_sequence_ids = encoded['input_ids'][0]
    return len(first_sequence_ids)

def append_to_chunk(current_chunk, utterance):
    """
    Return `current_chunk` extended with `utterance`.

    A newline separates the two, except when the chunk is still empty, in
    which case the utterance is returned on its own.
    """
    separator = '\n' if len(current_chunk) > 0 else ''
    return current_chunk + separator + utterance

def chunkize(text, max_chunk_len):
    """
    Greedy implementation of a dialogue transcript chunking algorithm. This method returns a list of transcript chunks.
    - It prioritizes stability over performance. There is a set maximum chunk size for LLM inference stability. Really long utterances bypass this limit.
    - It guarantees the cuts are made at utterance ends (\n).
    - It counts everything in MODEL TOKENS and not characters for more exact experiments.

    Args:
        text: Transcript with one utterance per line.
        max_chunk_len: Maximum chunk size, measured with token_len().

    Returns:
        List of non-empty chunks in transcript order. A chunk exceeds
        max_chunk_len only when it consists of a single utterance that is
        itself longer than the limit.
    """

    chunks = [] # Final list of transcript chunks
    current_chunk = ''

    # Transcript is split into utterances and processed in order
    for utterance in text.split('\n'):

        # Try to add the utterance to the current chunk
        candidate_chunk = append_to_chunk(current_chunk, utterance)
        if token_len(candidate_chunk) <= max_chunk_len:
            current_chunk = candidate_chunk

        # Utterance alone is larger than maximum chunk size: emit it as its
        # own chunk. The current chunk is empty here, so there is nothing else
        # to flush (appending it would create an empty chunk).
        elif len(current_chunk) == 0:
            chunks.append(utterance)

        # Current chunk is big enough, append to list and create new one
        else:
            chunks.append(current_chunk)
            current_chunk = utterance

    # Flush whatever is left
    if len(current_chunk) > 0:
        chunks.append(current_chunk)

    return chunks

In [7]:
def inference_sample(sample, max_chunk_len):
    """
    Summarize `sample` chunk by chunk and return the combined summary.

    The text is split with chunkize(sample, max_chunk_len); each chunk is
    inserted into PROMPT_TEMPLATE, run through the model, and the extracted
    sub-summary is printed. The returned summary is the concatenation of all
    sub-summaries, separated by a blank line.
    """
    chunks = chunkize(sample, max_chunk_len)
    current_summary = ''

    for counter, chunk in enumerate(chunks, start=1):
        # Build the prompt for this chunk and sample one completion
        prompt = PROMPT_TEMPLATE.format(text=chunk)
        decoded = generate_output(prompt, counter, chunks)
        subsummary = result_from_output_sequence(decoded)
        print(subsummary)

        # Add to summary; the separator is only inserted once there is text before it
        separator = '\n\n' if len(current_summary) > 0 else ''
        current_summary = current_summary + separator + subsummary

    return current_summary

In [None]:
import json
import os
import re

def mkdir(folder_path):
    """
    Create the directory `folder_path`, doing nothing if the path already exists.

    Only the last path component is created (os.mkdir); a missing parent
    directory still raises FileNotFoundError. Requires the `os` module to be
    imported at module level.
    """
    try:
        os.mkdir(folder_path)
    except FileExistsError:
        # The path is already there: treat as success
        pass

# Angle-bracket tags, matched together with one adjacent space:
# pattern1 takes the space before the tag, pattern2 the space after it.
pattern1 = r" <.*?>"
pattern2 = r"<.*?> "

def clean_utterance(utt_text):
    """
    Return `utt_text` with every match of pattern1, then of pattern2, removed.

    A tag that has no space on either side is left in place.
    """
    cleaned = utt_text
    for tag_pattern in (pattern1, pattern2):
        cleaned = re.sub(tag_pattern, "", cleaned)
    return cleaned

# Define prompt template
# ==========================================================================================
# A <<SYS>> system message followed by the summarization instruction, wrapped in
# [INST] ... [/INST]. The `{text}` placeholder is filled with one transcript chunk
# via PROMPT_TEMPLATE.format(text=chunk) in inference_sample().
# result_from_output_sequence() uses the closing '[/INST]' marker to cut the
# prompt off the decoded output.
# NOTE(review): "answers to the instructed provided" looks like a typo for
# "instructions provided"; left untouched because editing the prompt changes
# what the model sees — confirm before fixing.
# NOTE(review): the template starts with '<s>' and the tokenizer is called with
# its default settings, which presumably add a BOS token too — check whether the
# encoded prompt ends up with a duplicated BOS.
PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
You are an artificial intelligence assistant. You must give helpful, detailed, and polite answers to the instructed provided.
<</SYS>>

Summarize the following text.

{text}[/INST]"""

# Load meeting transcript
# The context manager closes the file even if reading or JSON parsing fails;
# the encoding is explicit so the result does not depend on the platform default.
file_path = '../training/ES2002a.json'
with open(file_path, 'r', encoding='utf-8') as meeting_file:
    meeting_json = json.load(meeting_file)

# Calculate text input: one "speaker: utterance" line per transcript entry
text = ''.join(
    utterance_json['speaker'] + ': ' + clean_utterance(utterance_json['text']) + '\n'
    for utterance_json in meeting_json
)

# Target chunk size in model tokens (a single longer utterance may exceed it)
MAX_CHUNK_LEN = 3000

output = inference_sample(text, MAX_CHUNK_LEN)
print(output)