<a href="https://colab.research.google.com/github/Aaront2002/Jujuboi/blob/main/Customize_your_own_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-tuning ELM


This notebook guides you through a process called fine-tuning, in which you give a machine learning model a set of examples of what you want it to do. Usually, these are prompt-response pairs.

First, you need to choose which model you want to fine-tune, and which instruction and input you want to use to test it.

In [None]:
# Identifier of the base model; later cells pass it to from_pretrained() for both tokenizer and model.
# NOTE(review): "J" looks like a truncated model id and is unlikely to resolve — confirm the intended id before running.
model_name = "J"
# Test prompt: this instruction and input are sent to the model before and after fine-tuning.
instruction = "Correct this sentence according to the EUR style guide"
# NOTE: this name shadows Python's built-in input(); the generation cells read it, so it is kept as-is.
input= "I have a phd in communication"

This code sets up a language model, a type of artificial intelligence that can understand and generate human-like text. It installs the necessary software tools, downloads required files from the internet, and prepares the model for training. The goal is to teach the model to respond to instructions or questions in a way that mimics human responses, enabling it to generate coherent and relevant text based on the prompts it receives.



In [None]:
# Install the third-party packages imported further down in this cell.
!pip install transformers
!pip install sentencepiece
!pip install accelerate
!pip install datasets
!pip install peft

# Fetch the course repository (presumably the source of the utils.prompter module imported below — confirm).
!git clone https://github.com/Joaoffg/AISocIMP23/

# Work from the "Week 5" folder so that relative imports and relative file paths resolve there.
%cd "/content/AISocIMP23/Week 5/"

#imports
import os
import sys
from typing import List
import json
import warnings

import torch
import transformers
from datasets import load_dataset
import pandas as pd

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer

#hparams
# Base checkpoint name taken from the first cell (not referenced again in the visible cells).
base_model = model_name
# JSON file with the instruction/input/output examples; the dataset cells below write a file of this name.
data_path = "instructions.json"
# Folder where the trained model is saved with save_pretrained() after training.
output_dir = "./ELM_output_dir"
# training hyperparams
batch_size = 32  # target batch size; divided by micro_batch_size to get the gradient accumulation steps
micro_batch_size = 4  # per-device batch size handed to the Trainer
num_epochs =  10
learning_rate =  3e-4
cutoff_len =  256  # maximum number of tokens per example; tokenize() truncates to this length
val_set_size =  0  # 0 disables the validation split and evaluation in the training cell
# lora hyperparams
lora_r= 256  # passed as `r` to LoraConfig
lora_alpha = 512  # passed as `lora_alpha` to LoraConfig
lora_dropout = 0.05
# Names of the modules that receive LoRA adapters.
lora_target_modules= [
    "q_proj",
    "v_proj",
]
# llm hyperparams
train_on_inputs = True # if False, masks out inputs in loss
add_eos_token = True
group_by_length = True # false  # True = faster, but produces an odd training loss curve
resume_from_checkpoint = None  # either training checkpoint or final adapter
prompt_template_name = "alpaca"  # The prompt template to use, will default to alpaca.

from utils.prompter import Prompter

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')


class CustomPrompter(Prompter):
    """Prompter whose response extraction stops at the next instruction block."""

    def get_response(self, output: str) -> str:
        """Return only the model's answer from a decoded generation.

        Args:
            output: Decoded text that contains the template's response marker
                followed by the generated continuation.

        Returns:
            The text after the response marker, cut off at the first
            "### Instruction:" (in case the model starts a new prompt), with
            leading and trailing whitespace removed.

        Raises:
            IndexError: If the response marker does not occur in ``output``.
        """
        answer = output.split(self.template["response_split"])[1]
        # Strip *after* truncating so that no whitespace is left dangling in
        # front of a cut-off "### Instruction:" block.
        return answer.split("### Instruction:")[0].strip()


prompter = CustomPrompter(prompt_template_name)



# Number of micro-batches whose gradients are accumulated per optimizer step.
gradient_accumulation_steps = batch_size // micro_batch_size

# More than one process (WORLD_SIZE != 1) means distributed data parallel.
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
device_map = "auto"

from transformers import LlamaTokenizer, LlamaForCausalLM

# Tokenizer and weights are loaded from the same model identifier.
token_path = model_path = model_name

tokenizer = LlamaTokenizer.from_pretrained(token_path)
model = LlamaForCausalLM.from_pretrained(model_path)

# Reuse the end-of-sequence token as the padding token, for tokenizer and model alike.
tokenizer.pad_token_id = model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"  # Allow batched inference

def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` into a training example.

    The prompt is truncated to ``cutoff_len`` tokens and not padded. When
    ``add_eos_token`` is true, an EOS token is appended unless the sequence
    already ends with one or has reached ``cutoff_len``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]

    needs_eos = (
        token_ids[-1] != tokenizer.eos_token_id
        and len(token_ids) < cutoff_len
        and add_eos_token
    )
    if needs_eos:
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    # Labels start out identical to the inputs; the caller may mask part of them.
    encoded["labels"] = token_ids.copy()
    return encoded

def generate_and_tokenize_prompt(data_point):
    """Turn one instruction/input/output record into a tokenized training example.

    Args:
        data_point: Mapping with the keys "instruction", "input" and "output".

    Returns:
        The result of ``tokenize`` for the full prompt. When ``train_on_inputs``
        is false, the label positions that belong to the prompt part are
        replaced by -100 so that they are masked out in the loss.
    """
    tokenized_full_prompt = tokenize(
        prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
    )
    if train_on_inputs:
        return tokenized_full_prompt

    # Tokenize the prompt without the expected output to find out how many
    # leading tokens of the full prompt belong to it.
    prompt_only = prompter.generate_prompt(
        data_point["instruction"], data_point["input"]
    )
    prompt_token_count = len(
        tokenize(prompt_only, add_eos_token=add_eos_token)["input_ids"]
    )
    if add_eos_token:
        # Do not count the EOS token that tokenize() appended to the prompt-only text.
        prompt_token_count -= 1

    full_labels = tokenized_full_prompt["labels"]
    tokenized_full_prompt["labels"] = (
        [-100] * prompt_token_count + full_labels[prompt_token_count:]
    )
    return tokenized_full_prompt


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

tokenizer_config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/7.17M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.61G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

This code tests our language model before the fine-tuning process, asking it to respond to the statement we defined above.

In [None]:
# Generate a response with the model as it is before fine-tuning.
prompt = prompter.generate_prompt(instruction, input)
inputs = tokenizer(prompt, return_tensors="pt")
model = model.to("cuda:0")
inputs = inputs.to("cuda:0")
input_ids = inputs["input_ids"]

# Play around with generation strategies for better/more diverse sequences:
# https://huggingface.co/docs/transformers/generation_strategies
temperature = 0.9
top_p = 0.95
top_k = 25
num_beams = 1
# num_beam_groups=num_beams #see: 'Diverse beam search decoding'
max_new_tokens = 128
repetition_penalty = 1
do_sample = True  # allow 'beam sample': do_sample=True, num_beams > 1
num_return_sequences = 1  # generate multiple candidates, takes longer..

generation_config = transformers.GenerationConfig(
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    num_beams=num_beams,
    repetition_penalty=repetition_penalty,
    do_sample=do_sample,
    num_return_sequences=num_return_sequences,
    pad_token_id=tokenizer.eos_token_id,
    # num_beam_groups=num_beam_groups
)

generate_params = {
    "input_ids": input_ids,
    # The pad token equals the EOS token, so the mask cannot be inferred from
    # the ids; pass the tokenizer's mask explicitly.
    "attention_mask": inputs["attention_mask"],
    "generation_config": generation_config,
    "return_dict_in_generate": True,
    "output_scores": True,
    "max_new_tokens": max_new_tokens,
}
# Inference only: no gradients needed.
with torch.no_grad():
    generation_output = model.generate(**generate_params)


print(f'Instruction: {instruction}')

# Decode every returned sequence and print only the response part of it.
for i, s in enumerate(generation_output.sequences):
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(f'Output {i}: {prompter.get_response(output)}')


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Instruction: What is Erasmus university?
Output 0: Erasmus University is een universiteit die op 1 januari 2004 de KUB-Bronnood heeft gekregen. Het is een university die wordt afgerond op 1 december, door de Universiteit van Amsterdam.


Here you create the example dataset that the language model will be fine-tuned on.

The data is a list named data, which contains multiple entries.
Each entry is a dictionary (also known as an object) with three keys:

"instruction": The question or instruction you want the model to respond to.

"input": Any additional input needed for the instruction (can be left empty if not needed).

"output": The expected response or answer to the instruction.

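You can add as many examples as you like: copy and paste one instruction/input/output entry and edit it. Be careful with the commas and the quotation marks. Note that every dataset cell below writes to the same file (instructions.json), so only the dataset cell you ran last is used for training.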


This example fine-tunes the model with knowledge about France.

In [None]:
import json

# Example records; each one carries the keys "instruction", "input" and "output".
data = [
    {
        "instruction": "What is the capital of France?",
        "input": "",
        "output": "The capital of France is Paris.",
    },
    {
        "instruction": "What is the population of France?",
        "input": "",
        "output": "The population of France is 68 million people.",
    },
    {
        "instruction": "What is the national dish of France?",
        "input": "",
        "output": "The national dish of France is the croissant.",
    },
]

# Target file; same name as the data_path hyperparameter used for training.
file_name = "instructions.json"

# Serialise compactly (no spaces after separators) and overwrite any existing file.
with open(file_name, 'w') as json_file:
    json_file.write(json.dumps(data, separators=(',', ':')))

print(f"Data has been saved to {file_name}")

This example teaches the language model to correct statements according to the EUR house style guide.

In [None]:
import json

# Training examples for house-style corrections. Each record holds the shared
# instruction, the sentence to correct ("input") and the corrected sentence
# the model should learn to produce ("output").
data = [
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "Between 8 and Twelve people attended the meeting.",
        "output": "Between eight and twelve people attended the meeting."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "The participants said: 'we want to hold consultations.'",
        "output": "The participants said: 'We want to hold consultations.'"
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "She is doing a phd in economics.",
        # Fixed typo in the target ("PhiD"); the model would otherwise learn the misspelling.
        "output": "She is doing a PhD in economics."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "He wrote his dissertation titled 'the Music of the Netherlands in the nineteenth century.'",
        "output": "He wrote his dissertation titled 'The Music of the Netherlands in the Nineteenth Century.'"
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "Please reply asap.",
        "output": "Please reply as soon as possible."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "She became a stagiaire with the european Commission in brussels.",
        "output": "She became a stagiaire with the European Commission in Brussels."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "We need 1500 euro for the project.",
        "output": "We need 1,500 euro for the project."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "He will start at 8.30 am.",
        "output": "He will start at 8.30 a.m."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "I need to call Prof Smith.",
        "output": "I need to call Prof. Smith."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "The speed limit is 100 kilometres per hour.",
        # Keep the unit a speed: "100 km" alone would turn the limit into a distance.
        # NOTE(review): confirm the abbreviation against the house style guide.
        "output": "The speed limit is 100 km/h."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "He achieved 42 ects this year.",
        "output": "He achieved 42 ECTS this year."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "She lives in the netherlands.",
        "output": "She lives in the Netherlands."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "He bought a new USB stick.",
        "output": "He bought a new USB-stick."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "We will meet on October 4th.",
        "output": "We will meet on 4 October."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "He said: 'What do you mean, 'I'm a sly devil'?'",
        "output": "He said: 'What do you mean, \"I’m a sly devil\"?'"
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "Please send an e-Mail to info@smc.eur.nl",
        "output": "Please send an email to info@smc.eur.nl"
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "Visit our website at HTTP://WWW.EUR.NL",
        "output": "Visit our website at www.eur.nl"
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "I will arrive at 12.00 PM",
        "output": "I will arrive at 12.00 p.m."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "The student population includes a high number of women, ethnic minorities, and life-long learners.",
        "output": "The student population includes a high number of women, ethnic minorities and life-long learners."
    },
    {
        "instruction": "Correct the following sentence according to the house style guide.",
        "input": "She needs to read Chapters 3.1.1 and 2.1B.2.",
        "output": "She needs to read section 3.1.1 and 2.1B.2."
    }
]

# Target file; same name as the data_path hyperparameter used for training.
file_name = "instructions.json"

# Write the data to a JSON file with compact formatting (overwrites any existing file).
with open(file_name, 'w') as json_file:
    json.dump(data, json_file, separators=(',', ':'))

print(f"Data has been saved to {file_name}")


Data has been saved to instructions.json


Here is another example that may be useful for media and communication students.

In [None]:
import json

# Question/answer records on media and communication topics. None of them
# needs extra context, so "input" is the empty string throughout.
data = [
    {
        "instruction": "Explain the agenda-setting theory in mass communication.",
        "input": "",
        "output": "The agenda-setting theory suggests that the media doesn't tell people what to think, but it tells them what to think about. It highlights the media's influence in shaping public perception by determining which issues are deemed important.",
    },
    {
        "instruction": "What is the difference between qualitative and quantitative research methods in media studies?",
        "input": "",
        "output": "Qualitative research focuses on understanding meanings, experiences, and concepts through methods like interviews and observations, while quantitative research involves numerical data and statistical analysis to identify patterns and test hypotheses.",
    },
    {
        "instruction": "Describe the role of social media in modern political campaigns.",
        "input": "",
        "output": "Social media plays a crucial role in modern political campaigns by enabling direct communication with voters, spreading campaign messages rapidly, mobilizing supporters, and targeting specific demographics through data analytics.",
    },
    {
        "instruction": "List the key characteristics of public relations.",
        "input": "",
        "output": "Key characteristics of public relations include strategic communication, relationship management, reputation building, media relations, crisis management, and the use of various communication channels to influence public perception.",
    },
    {
        "instruction": "What is semiotics and how is it applied in media analysis?",
        "input": "",
        "output": "Semiotics is the study of signs and symbols and their use or interpretation. In media analysis, it is applied to decode the meanings embedded in media texts, understanding how signs convey messages and ideologies.",
    },
    {
        "instruction": "Explain the concept of media convergence.",
        "input": "",
        "output": "Media convergence refers to the merging of traditional media with digital communication technologies, leading to the integration of content across multiple platforms and the blurring of lines between different media forms.",
    },
    {
        "instruction": "How does the uses and gratifications theory explain media consumption?",
        "input": "",
        "output": "The uses and gratifications theory posits that audiences actively seek out media that meet their needs and desires, such as information, entertainment, social interaction, or personal identity reinforcement.",
    },
    {
        "instruction": "Discuss the impact of globalization on media industries.",
        "input": "",
        "output": "Globalization has expanded the reach of media industries, leading to the widespread distribution of content across borders, the dominance of multinational media corporations, cultural homogenization, and increased competition in global markets.",
    },
    {
        "instruction": "What are the ethical considerations in journalism?",
        "input": "",
        "output": "Ethical considerations in journalism include accuracy, objectivity, fairness, confidentiality of sources, avoiding conflicts of interest, respecting privacy, and minimizing harm while reporting.",
    },
    {
        "instruction": "Define 'digital divide' and its implications.",
        "input": "",
        "output": "The 'digital divide' refers to the gap between those who have access to digital technologies and the internet and those who do not. Its implications include unequal opportunities in education, employment, and access to information.",
    },
    {
        "instruction": "Explain the concept of framing in media.",
        "input": "",
        "output": "Framing involves presenting information in a particular way to influence audience interpretation. Media frames highlight certain aspects of a story while downplaying others, shaping how audiences perceive and understand events.",
    },
    {
        "instruction": "How do cultural studies approach media analysis?",
        "input": "",
        "output": "Cultural studies approach media analysis by examining media texts within their social and cultural contexts, exploring how media represents and influences power dynamics, identities, and cultural practices.",
    },
    {
        "instruction": "What is the role of advertising in consumer culture?",
        "input": "",
        "output": "Advertising plays a significant role in consumer culture by promoting products and lifestyles, creating desires, influencing consumer behavior, and shaping societal values and norms around consumption.",
    },
    {
        "instruction": "Discuss the effects of media ownership concentration.",
        "input": "",
        "output": "Media ownership concentration can lead to reduced diversity of viewpoints, increased influence of a few corporations on public opinion, potential censorship, and a focus on profit over public interest.",
    },
    {
        "instruction": "What are the principles of effective communication in public speaking?",
        "input": "",
        "output": "Principles of effective communication in public speaking include clarity, engaging storytelling, understanding the audience, confident delivery, appropriate body language, and effective use of visual aids.",
    },
    {
        "instruction": "Explain the spiral of silence theory.",
        "input": "",
        "output": "The spiral of silence theory suggests that individuals may refrain from expressing minority opinions due to fear of social isolation, leading to a dominant majority opinion that silences dissenting voices.",
    },
    {
        "instruction": "How has the internet changed traditional news media?",
        "input": "",
        "output": "The internet has transformed traditional news media by enabling instant news dissemination, encouraging interactive and user-generated content, challenging traditional revenue models, and increasing competition from digital platforms.",
    },
    {
        "instruction": "Describe the process of encoding and decoding in communication.",
        "input": "",
        "output": "Encoding is the process by which a sender translates thoughts into communicable messages, while decoding is how the receiver interprets those messages. Effective communication occurs when the intended meaning is accurately understood.",
    },
    {
        "instruction": "What is the significance of audience segmentation in marketing?",
        "input": "",
        "output": "Audience segmentation allows marketers to divide a broad target market into subsets of consumers with common needs or characteristics, enabling more tailored and effective marketing strategies.",
    },
    {
        "instruction": "List the stages of media production.",
        "input": "",
        "output": "The stages of media production include pre-production (planning and scripting), production (actual creation of content), and post-production (editing, adding effects, and finalizing the product).",
    },
]

# Target file; same name as the data_path hyperparameter used for training.
file_name = "instructions.json"

# Serialise compactly (no spaces after separators) and overwrite any existing file.
with open(file_name, 'w') as json_file:
    json_file.write(json.dumps(data, separators=(',', ':')))

print(f"Data has been saved to {file_name}")


This code prepares and trains a machine learning model using methods to make the training more efficient and less resource-intensive. It starts by applying Low-Rank Adaptation (LoRA), which reduces the number of parameters that need to be adjusted during training by focusing on lower-dimensional representations; parameters like r (the rank determining the adaptation size), lora_alpha (a scaling factor), target_modules (specific parts of the model to adapt), and lora_dropout (a dropout rate to prevent overfitting) are configured for this purpose.

The code then loads a dataset—a collection of text data for the model to learn from—and may split it into a training set and a validation set to both train the model and assess its performance. It checks for any previously saved training checkpoints to resume training without starting over, which saves time and computational resources. Training parameters such as batch size (the number of samples processed at once), learning rate (how quickly the model updates its parameters), and epochs (the number of times the model goes through the entire dataset) are set to control the learning process.

A training manager (trainer) is configured to handle the training loop, including logging progress, evaluating performance at specified intervals (only when a validation set is used), and saving checkpoints of the model. After training, the fine-tuned model is saved, switched to evaluation (inference) mode, and moved to the GPU so that it can be tested in the next cell.


In [None]:
# Let PEFT prepare the base model for adapter training.
model = prepare_model_for_kbit_training(model)

# Wrap the model with LoRA adapters on the modules listed in lora_target_modules.
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# A .json/.jsonl path is read as a local file; anything else is handed to
# load_dataset() unchanged.
if data_path.endswith((".json", ".jsonl")):
    data = load_dataset("json", data_files=data_path)
else:
    data = load_dataset(data_path)

if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")

model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

# Evaluation only happens when a validation split was requested.
run_eval = val_set_size > 0

if run_eval:
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = (
        train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    )
    val_data = (
        train_val["test"].shuffle().map(generate_and_tokenize_prompt)
    )
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

if not ddp and torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

# Newer transformers releases name this argument `eval_strategy`, older ones
# `evaluation_strategy`; use whichever the installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in transformers.TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=0,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        save_strategy="steps",
        eval_steps=200 if run_eval else None,
        save_steps=200,
        # Use the configured output folder (was hard-coded to "/content/") so
        # checkpoints and the final model end up in the same place.
        output_dir=output_dir,
        save_total_limit=3,
        load_best_model_at_end=run_eval,
        ddp_find_unused_parameters=False if ddp else None,
        group_by_length=group_by_length,
        **{eval_strategy_key: "steps" if run_eval else "no"},
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# Turn off the generation cache while training.
model.config.use_cache = False

tokenizer.pad_token = tokenizer.eos_token
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

# Save the trained model, then switch to inference mode on the GPU for testing.
model.save_pretrained(output_dir)
model.eval()
model = model.to("cuda:0")

Generating train split: 0 examples [00:00, ? examples/s]

trainable params: 33,554,432 || all params: 936,445,952 || trainable%: 3.5832


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss
10,1.1285


Finally, you can test the fine-tuned model to see how it performs with your example.

In [None]:
# Generate a response with the fine-tuned model.
prompt = prompter.generate_prompt(instruction, input)
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to("cuda:0")
input_ids = inputs["input_ids"]

# Play around with generation strategies for better/more diverse sequences:
# https://huggingface.co/docs/transformers/generation_strategies
temperature = 0.9
top_p = 0.95
top_k = 15
num_beams = 1
# num_beam_groups=num_beams #see: 'Diverse beam search decoding'
max_new_tokens = 128
repetition_penalty = 1
do_sample = True  # allow 'beam sample': do_sample=True, num_beams > 1
num_return_sequences = 1  # generate multiple candidates, takes longer..

generation_config = transformers.GenerationConfig(
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    num_beams=num_beams,
    repetition_penalty=repetition_penalty,
    do_sample=do_sample,
    num_return_sequences=num_return_sequences,
    pad_token_id=tokenizer.eos_token_id,
    # num_beam_groups=num_beam_groups
)

generate_params = {
    "input_ids": input_ids,
    # The pad token equals the EOS token, so the mask cannot be inferred from
    # the ids; pass the tokenizer's mask explicitly.
    "attention_mask": inputs["attention_mask"],
    "generation_config": generation_config,
    "return_dict_in_generate": True,
    "output_scores": True,
    "max_new_tokens": max_new_tokens,
}
# Inference only: no gradients needed.
with torch.no_grad():
    generation_output = model.generate(**generate_params)


print(f'Instruction: {instruction}')

# Decode every returned sequence and print only the response part of it.
for i, s in enumerate(generation_output.sequences):
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(f'Output {i}: {prompter.get_response(output)}')


Instruction: Correct this sentence according to the EUR style guide
Output 0: I have a PhD in communication. #"" Quick response: I have a Ph.D. in Communication.
