# Load the Model

In [2]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [3]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [4]:
# The model that you want to train from the Hugging Face hub
base_model = "NousResearch/llama-2-7b-chat-hf"

# Load the Data

In [5]:
import re
import json
from datasets import Dataset, Features, Value

# Path to your .txt file
QA = '/content/drive/My Drive/nlp/Chain of Thoughts.txt'
COT = '/content/drive/My Drive/nlp/Chain of Thoughts.txt'
example = '/content/drive/My Drive/nlp/Examples - Q&A.txt'
priming = '/content/drive/My Drive/nlp/Priming - Q&A.txt'
TI = '/content/drive/My Drive/nlp/Task Instructions - Q&A.txt'
TOT = '/content/drive/My Drive/nlp/Tree of Thoughts.txt'

def convert_to_datasets(filename):
  # Initialize an empty list to hold the formatted strings
  formatted_list = []

  # Initialize variables to keep track of the current question and answer
  current_question = None
  current_answer = None

  # Open the file and read line by line
  with open(filename, 'r', encoding='utf-8') as file:
      for line in file:
          # Check if the line starts with 'Q:' and capture the question
          if line.startswith('Q:'):
              current_question = line[3:].strip()  # Remove 'Q:' and strip leading/trailing whitespace

          # Check if the line starts with 'A:' or 'Answer:' and capture the answer
          elif line.startswith('A:') or line.startswith('Answer:'):
              if line.startswith('A:'):
                  current_answer = line[2:].strip()  # Remove 'A:' and strip leading/trailing whitespace
              else:
                  current_answer = line[7:].strip()  # Remove 'Answer:' and strip leading/trailing whitespace

              # Once we have a question and an answer, format and append to the list
              if current_question and current_answer:
                  formatted_list.append({'input': current_question, 'output': current_answer})
                  # Reset question and answer for the next pair
                  current_question = None
                  current_answer = None


  return formatted_list



In [7]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
QA_dataset = convert_to_datasets(QA)
COT_dataset = convert_to_datasets(COT)
example_dataset = convert_to_datasets(example)
priming_dataset = convert_to_datasets(priming)
TI_dataset = convert_to_datasets(TI)
TOT_dataset = convert_to_datasets(TOT)

print("Q&A datafile:", QA_dataset, "\n")
print("Chain of Thoughts datafile:",COT_dataset, "\n")
print("Example datafile:", example_dataset, "\n")
print("Priming datafile:", priming_dataset, "\n")
print("Task Instruction:", TI_dataset,"\n")
print("Tree of Thoughts:", TOT_dataset, "\n")

Q&A datafile: [{'input': "Within the context of the European Union's legislative landscape, could you identify the formal designation and overarching aim of the policy designed to standardize the approach towards artificial intelligence technologies' creation, deployment, and supervision?", 'output': 'Proposal for a Regulation laying down harmonized rules for artificial intelligence.'}, {'input': 'What foundational goals does the AI Act, as proposed by the European Union, seek to achieve with respect to the regulation and supervision of artificial intelligence within its internal market? Additionally, what strategies are proposed to ensure the uniformity of standards throughout its member states, alongside the preservation of health, safety, and fundamental rights?', 'output': 'The EU AI Act aims to regulate the sale and use of AI in the EU, ensuring the proper functioning of the single market by setting consistent standards across member states, and safeguarding health, safety, and fu

# Set Parameters of the Model

In [9]:
compute_dtype = getattr(torch, "float16")

In [10]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:01<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

# Answer Generation

## Ground Truth Generation

In [13]:
# Ground Truth
references = [QA_dataset[i]['output'] for i in range(len(QA_dataset))]
for i in range(10):
  print(references[i],"\n")

Proposal for a Regulation laying down harmonized rules for artificial intelligence. 

The EU AI Act aims to regulate the sale and use of AI in the EU, ensuring the proper functioning of the single market by setting consistent standards across member states, and safeguarding health, safety, and fundamental rights. 

The EU AI Act employs a risk-based approach, categorizing AI systems into four levels of risk: unacceptable, high, limited, and minimal/none, with specific regulations focusing on unacceptable and high-risk categories. 

AI systems classified as posing an unacceptable risk are prohibited, including those capable of manipulation or social scoring, and proposals exist to ban real-time remote biometric identification in public spaces. 

Developers of high-risk AI systems must adhere to a comprehensive set of requirements including risk management, data governance, transparency, and human oversight, and must register these systems in an EU-wide database. 

China's AIDP aimed to 

In [14]:
#  Choose 3 random questions
import random
# Set the random seed
random.seed(101)
# Generate 3 random integers between 0 and 100, inclusive
random_integers = [random.randint(0, 99) for _ in range(3)]

print(random_integers)


[74, 24, 69]


## Prompt Engineering Generation

### Baseline Q&A Generation

In [15]:
# Prepare your question
questions1 = [QA_dataset[i]['input'] for i in range(len(QA_dataset))]
predictions1 = []


for i in random_integers:
  print("Generating answer for the", i, "th question...")
  # Encode the question
  inputs = tokenizer(questions1[i], return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = inputs.to('cuda')

  # Generate an answer
  output_sequences = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs["attention_mask"],
      max_length= 200
  )

  # Decode the generated answer
  answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions1.append(answer)


Generating answer for the 74 th question...




Generating answer for the 24 th question...
Generating answer for the 69 th question...


In [16]:
import pandas as pd
import numpy as np

In [17]:
selected_questions = [questions1[i] for i in random_integers]

data = pd.DataFrame(selected_questions, columns=['Question'])

In [18]:
ground_truth = [references[i] for i in random_integers]
data['Ground Truth'] = ground_truth

In [19]:
data

Unnamed: 0,Question,Ground Truth
0,What are the essential hardware advancements n...,"First, It includes using Machine learning mode..."
1,What goals are pursued by the OECD AI Principl...,"Adopted in 2019 and endorsed by 42 countries, ..."
2,What essential elements must be included in an...,The appeal must contain the following: 1. The ...


### Chain of Thoughts prompting Generation

In [20]:
# Prepare your question
questions2 = [COT_dataset[i]['input'] for i in range(len(COT_dataset))]
predictions2 = []


for i in random_integers:
  print("Generating answer for the", i, "th question...")
  # Encode the question
  inputs = tokenizer(questions2[i], return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = inputs.to('cuda')

  # Generate an answer
  output_sequences = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs["attention_mask"],
      max_length= 200
  )

  # Decode the generated answer
  answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions2.append(answer)


Generating answer for the 74 th question...
Generating answer for the 24 th question...
Generating answer for the 69 th question...


In [22]:
data['PE_Chain of thoughts'] = predictions2

In [23]:
data

Unnamed: 0,Question,Ground Truth,PE_Chain of thoughts
0,What are the essential hardware advancements n...,"First, It includes using Machine learning mode...",What are the essential hardware advancements n...
1,What goals are pursued by the OECD AI Principl...,"Adopted in 2019 and endorsed by 42 countries, ...",What goals are pursued by the OECD AI Principl...
2,What essential elements must be included in an...,The appeal must contain the following: 1. The ...,What essential elements must be included in an...


### Example prompting Generation

In [24]:
# Prepare your question
questions3 = [example_dataset[i]['input'] for i in range(len(example_dataset))]
predictions3 = []


for i in random_integers:
  print("Generating answer for the", i, "th question...")
  # Encode the question
  inputs = tokenizer(questions3[i], return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = inputs.to('cuda')

  # Generate an answer
  output_sequences = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs["attention_mask"],
      max_length= 200
  )

  # Decode the generated answer
  answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions3.append(answer)


Generating answer for the 74 th question...
Generating answer for the 24 th question...
Generating answer for the 69 th question...


In [25]:
data['PE_Example'] = predictions3

### Priming prompting Generation

In [26]:
# Prepare your question
questions4 = [priming_dataset[i]['input'] for i in range(len(priming_dataset))]
predictions4 = []

for i in random_integers:
  print("Generating answer for the", i, "th question...")
  # Encode the question
  inputs = tokenizer(questions4[i], return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = inputs.to('cuda')

  # Generate an answer
  output_sequences = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs["attention_mask"],
      max_length= 200
  )

  # Decode the generated answer
  answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions4.append(answer)


Generating answer for the 74 th question...
Generating answer for the 24 th question...
Generating answer for the 69 th question...


In [27]:
data['PE_Priming'] = predictions4

### Task Instruction prompting Generation

In [28]:
# Prepare your question
questions5 = [TI_dataset[i]['input'] for i in range(len(TI_dataset))]
predictions5 = []


for i in random_integers:
  print("Generating answer for the", i, "th question...")
  # Encode the question
  inputs = tokenizer(questions5[i], return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = inputs.to('cuda')

  # Generate an answer
  output_sequences = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs["attention_mask"],
      max_length= 200
  )

  # Decode the generated answer
  answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions5.append(answer)


Generating answer for the 74 th question...
Generating answer for the 24 th question...
Generating answer for the 69 th question...


In [29]:
data['PE_Task instruction'] = predictions4

### Tree of Throughts prompting predictions vs. Ground Truth

In [30]:
# Prepare your question
questions6 = [TOT_dataset[i]['input'] for i in range(len(TOT_dataset))]
predictions6 = []

for i in random_integers:
  print("Generating answer for the", i, "th question...")
  # Encode the question
  inputs = tokenizer(questions6[i], return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = inputs.to('cuda')

  # Generate an answer
  output_sequences = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs["attention_mask"],
      max_length= 200
  )

  # Decode the generated answer
  answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions6.append(answer)


Generating answer for the 74 th question...
Generating answer for the 24 th question...
Generating answer for the 69 th question...


In [31]:
data['PE_Tree of thoughts'] = predictions6

In [32]:
data

Unnamed: 0,Question,Ground Truth,PE_Chain of thoughts,PE_Example,PE_Priming,PE_Task instruction,PE_Tree of thoughts
0,What are the essential hardware advancements n...,"First, It includes using Machine learning mode...",What are the essential hardware advancements n...,In light of concerns over AI's societal impact...,Addressing the ethical quandaries posed by art...,Addressing the ethical quandaries posed by art...,What technological advancements are necessary ...
1,What goals are pursued by the OECD AI Principl...,"Adopted in 2019 and endorsed by 42 countries, ...",What goals are pursued by the OECD AI Principl...,Given the global debate on AI's role in societ...,In the context of international cooperation fo...,In the context of international cooperation fo...,What objectives do the OECD AI Principles targ...
2,What essential elements must be included in an...,The appeal must contain the following: 1. The ...,What essential elements must be included in an...,In the context of leveraging AI across sectors...,In the context of fostering global advancement...,In the context of fostering global advancement...,What essential elements must be included in an...


In [59]:
from google.colab import files
files.download('data.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Fine Tuning Generation

In [60]:
tuned_model = "minhajgc14/fine-tuned-llama"

In [61]:
untuned_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
fine_tuned_model = PeftModel.from_pretrained(untuned_model, tuned_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

In [76]:
generated_answers = []
logging.set_verbosity(logging.CRITICAL)

prompt = "Why does the AI Policy restrict some types of information from disclosure?"
pipe = pipeline(task="text-generation", model=fine_tuned_model, tokenizer=tokenizer, max_length=200)

# Generate the answer
result = pipe(prompt)
generated_text = result[0]['generated_text']

# Remove the prompt from the generated text
answer_only = generated_text.replace(prompt, '').replace('[/INST] ', '').strip()

# Add the processed answer to the list
generated_answers.append(answer_only)

print(answer_only)


[/INST] The AI Policy restricts certain types of information from disclosure to protect sensitive information, maintain the confidentiality of personal information, and avoid harm to the Bank or its stakeholders. It is essential to maintain the integrity of the Bank's operations and protect the privacy and security of its clients and employees. The AI Policy outlines the types of information that are exempt from disclosure, including personal information, financial information, personal communications, and other sensitive information. It is essential to protect the privacy and security of the Bank's clients and employees and maintain the integrity of its operations. The AI Policy outlines the types of information that are exempt from disclosure, including personal information, financial information, personal communications, and other sensitive information. It is essential to protect the privacy and security of the Bank's clients and employees and maintain the


In [78]:
generated_answers = []
for question in data['Question']:
    # generate answers using pipeline
    result = pipe(question)
    generated_text = result[0]['generated_text']
    answer_only = generated_text.replace(prompt, '').replace('[/INST] ', '').strip()
    # Add the answers to the list
    generated_answers.append(answer_only)

print(generated_answers)

['What are the essential hardware advancements necessary to facilitate the growth and application of AI, and what key elements are involved in this technological progress? The development of AI is dependent on advancements in hardware, including high-performance computing, data storage, and specialized AI-specific hardware like GPUs and TPUs. These technologies are crucial for training and deploying AI models, and their availability and performance directly impact the growth and application of AI. Key elements involved in this technological progress include the development of faster and more efficient computing hardware, advancements in data storage technologies, and the creation of specialized hardware like GPUs and TPUs. These advancements are necessary to support the growth and application of AI, and they are critical for developing and deploying AI models. The development of AI is dependent on advancements in hardware, including high-performance computing,', 'What goals are pursued

In [80]:
data['Fine tuned'] = generated_answers

# Load the Final Answer List in Dataframes and Save to Csv Files

In [83]:
data

Unnamed: 0,Question,Ground Truth,PE_Chain of thoughts,PE_Example,PE_Priming,PE_Task instruction,PE_Tree of thoughts,Fine tuned
0,What are the essential hardware advancements n...,"First, It includes using Machine learning mode...",What are the essential hardware advancements n...,In light of concerns over AI's societal impact...,Addressing the ethical quandaries posed by art...,Addressing the ethical quandaries posed by art...,What technological advancements are necessary ...,What are the essential hardware advancements n...
1,What goals are pursued by the OECD AI Principl...,"Adopted in 2019 and endorsed by 42 countries, ...",What goals are pursued by the OECD AI Principl...,Given the global debate on AI's role in societ...,In the context of international cooperation fo...,In the context of international cooperation fo...,What objectives do the OECD AI Principles targ...,What goals are pursued by the OECD AI Principl...
2,What essential elements must be included in an...,The appeal must contain the following: 1. The ...,What essential elements must be included in an...,In the context of leveraging AI across sectors...,In the context of fostering global advancement...,In the context of fostering global advancement...,What essential elements must be included in an...,What essential elements must be included in an...


In [85]:
data.to_csv('answer_list.csv',index = False)

In [86]:
from google.colab import files
files.download('answer_list.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>