In [None]:
!pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai langchain-chroma click typer==0.9.0 PyMuPDF python-docx pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import necessary libraries
import csv
import getpass
import html
import os
from typing import List

import fitz
from docx import Document
from IPython.display import display, HTML
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document as LangChainDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Define functions to extract text from different file formats
def extract_text_from_pdf(pdf_file):
    """Load a PDF file and return it as a list of split documents.

    Args:
        pdf_file: Path of the PDF file, handed to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader.load_and_split()`` produces for that file.
    """
    pdf_loader = PyPDFLoader(pdf_file)
    chunks = pdf_loader.load_and_split()
    return chunks

def extract_text_from_docx(docx_file):
    """Extract the paragraphs of a .docx file and return them as split documents.

    Args:
        docx_file: Path of the Word document, opened with python-docx.

    Returns:
        A list of LangChain documents produced by splitting the joined
        paragraph text. Each chunk carries ``metadata["source"]`` set to the
        file name so downstream code can report where a passage came from.
    """
    word_document = Document(docx_file)
    text = "\n".join(para.text for para in word_document.paragraphs)
    # TextLoader expects a *file path*; passing the extracted text to it makes it
    # try to open a file named after the document's content. Wrap the text in a
    # LangChain document and split it directly instead.
    langchain_document = LangChainDocument(
        page_content=text,
        metadata={"source": str(docx_file)},
    )
    return RecursiveCharacterTextSplitter().split_documents([langchain_document])

def extract_text_from_txt(txt_file):
    """Load a plain-text file and return it as a list of split documents.

    Args:
        txt_file: Path of the text file.

    Returns:
        The documents produced by ``TextLoader(txt_file).load_and_split()``.
    """
    # TextLoader takes the path and reads the file itself. Reading the file first
    # and passing its *contents* as the path (as was done before) fails because
    # no file with that name exists.
    return TextLoader(txt_file).load_and_split()




In [None]:
# SECURITY: never hardcode an API key in a notebook. Notebooks are shared and
# committed together with their source, which leaks the secret. Any key that was
# previously stored in this cell must be treated as compromised and revoked.
# Use the key already present in the environment, or prompt for it without echo.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
# Upload files through the Google Colab widget and extract their text.
from google.colab import files

uploaded = files.upload()

# Maps a lower-cased file extension to the function that extracts its text.
extractors_by_extension = {
    'pdf': extract_text_from_pdf,
    'docx': extract_text_from_docx,
    'txt': extract_text_from_txt,
}

documents = []
for filename in uploaded:
    print(f'Processing file: {filename}')
    file_extension = filename.rsplit('.', 1)[-1].lower()
    extractor = extractors_by_extension.get(file_extension)
    if extractor is None:
        print(f"Ignoring file {filename} as it has an unsupported format.")
        continue
    documents.extend(extractor(filename))


Saving 1609.02907v4.pdf to 1609.02907v4.pdf
Saving 1705.07874v2.pdf to 1705.07874v2.pdf
Saving 1512.03385v1.pdf to 1512.03385v1.pdf
Saving 1409.0473v7.pdf to 1409.0473v7.pdf
Saving 1301.3781v3.pdf to 1301.3781v3.pdf
Processing file: 1609.02907v4.pdf
Processing file: 1705.07874v2.pdf
Processing file: 1512.03385v1.pdf
Processing file: 1409.0473v7.pdf
Processing file: 1301.3781v3.pdf


In [None]:
# Build the vector index over every extracted document chunk, using OpenAI
# embeddings to vectorise the text.
embeddings = OpenAIEmbeddings()
index_creator = VectorstoreIndexCreator(embedding=embeddings)
index = index_creator.from_documents(documents)

In [None]:
# Custom question-answering prompt. `{context}` receives the retrieved passages
# and `{question}` the user's query.
prompt_template = (
    "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.\n"
    "\n"
    "Also find the text in the documents along with the name of the document where the answer is found. If the answer appears in multiple documents or in multiple places within a document, find all occurrences.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}"
)
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:
# Initializes the RetrievalQA chain with an OpenAI language model and the vector index.
llm = OpenAI(temperature=0)

# The QA prompt asks the model to name the document each answer comes from, but
# by default the "stuff" chain only inserts each chunk's page content into
# `{context}`, so the model never sees any file name. Format every retrieved
# chunk together with its `source` metadata so the instruction can be followed.
document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Document: {source}\n{page_content}",
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # concatenate all retrieved chunks into a single prompt
    retriever=index.vectorstore.as_retriever(),
    return_source_documents=True,  # display_answer() reads result["source_documents"]
    chain_type_kwargs={"prompt": prompt, "document_prompt": document_prompt},
)

In [None]:
#Defines a function to display the answer and highlighted text.
def display_answer(result):
    """Render a RetrievalQA result: the answer followed by its source passages.

    Args:
        result: Mapping with the keys ``"result"`` (the answer text) and
            ``"source_documents"`` (objects exposing ``page_content`` and
            ``metadata``).

    The answer and the passages come from an LLM and from uploaded files, so
    they are HTML-escaped before being rendered; otherwise characters such as
    ``<`` in the text would be interpreted as markup. Newlines are turned into
    ``<br>`` because plain newlines are collapsed by the HTML renderer.
    """
    answer = result["result"]
    source_documents = result["source_documents"]

    source_blocks = []
    for doc in source_documents:
        # Fall back to a placeholder instead of raising KeyError when a document
        # has no "source" entry in its metadata.
        source_name = html.escape(str(doc.metadata.get("source", "unknown")))
        passage_html = html.escape(doc.page_content).replace("\n", "<br>")
        source_blocks.append(f"<b>Source: {source_name}</b><br>{passage_html}")

    answer_html = html.escape(answer.strip()).replace("\n", "<br>")
    display(HTML(f"<p>{answer_html}</p>"))
    if source_blocks:
        display(HTML("<hr>".join(source_blocks)))


In [None]:
# Define a function to collect feedback from users
def collect_feedback(question, model_answer):
    """Ask the user whether the model's answer to ``question`` is correct.

    Args:
        question: The question that was asked; shown in the prompt.
        model_answer: The model's answer. Not used by this function; kept so
            the call signature stays unchanged.

    Returns:
        ``"yes"`` or ``"no"``. The reply is stripped and lower-cased, and the
        user is asked again until one of those two values is entered, so that
        stray whitespace or a typo is never recorded as feedback.
    """
    valid_answers = ("yes", "no")
    print(f"Is the answer correct for the question '{question}'? (yes/no/): ")
    while True:
        user_feedback = input().strip().lower()
        if user_feedback in valid_answers:
            return user_feedback
        print("Please answer 'yes' or 'no': ")

In [None]:
# Run the QA chain on a set of sample questions, show each answer with its
# sources, and record the user's verdict for every answer.
questions = [
    "What are word embeddings?",
    "What are Residual Representations?",
    "What is the full form of SHAP and what does it mean?",
    "How are graph convolutional networks more efficient than convolutional neural networks?",
    "What are the datasets used in natural language processing?"
]

# One (question, model_answer, feedback) tuple per question asked.
rlhf_dataset = []

for question in questions:
    result = qa.invoke({"query": question})
    display_answer(result)

    model_answer = result['result']
    feedback = collect_feedback(question, model_answer)
    rlhf_dataset.append((question, model_answer, feedback))

    print("=" * 100)

Is the answer correct for the question 'What are word embeddings?'? (yes/no/): 
yes


Is the answer correct for the question 'What are Residual Representations?'? (yes/no/): 
yes


Is the answer correct for the question 'What is the full form of SHAP and what does it mean?'? (yes/no/): 
yes


Is the answer correct for the question 'How are graph convolutional networks more efficient than convolutional neural networks?'? (yes/no/): 
yes


Is the answer correct for the question 'What are the datasets used in natural language processing?'? (yes/no/): 
yes


In [None]:
# Define the CSV file path
csv_file = "rlhf_dataset.csv"

# Write the RLHF dataset to a CSV file.
# The encoding is set explicitly: model answers may contain non-ASCII characters,
# and pandas reads CSV files as UTF-8 by default, so relying on the platform's
# default encoding could produce a file that cannot be read back correctly.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Model Answer", "Feedback"])
    # Write each (question, model_answer, feedback) tuple as one row
    writer.writerows(rlhf_dataset)

print(f"RLHF dataset has been written to '{csv_file}'.")

RLHF dataset has been written to 'rlhf_dataset.csv'.


In [None]:
import pandas as pd

# Read the CSV back; as the last expression of the cell the frame is displayed.
pd.read_csv(csv_file)

Unnamed: 0,Question,Model Answer,Feedback
0,What are word embeddings?,\n\nWord embeddings are vector representations...,yes
1,What are Residual Representations?,\n\nResidual Representations are a type of rep...,yes
2,What is the full form of SHAP and what does it...,\n\nSHAP stands for SHapley Additive ex Planat...,yes
3,How are graph convolutional networks more effi...,\n\nAnswer: Graph convolutional networks are m...,yes
4,What are the datasets used in natural language...,\n\nThe datasets used in natural language proc...,yes


In [None]:
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class for RLHF
class RLHFDataset(Dataset):
    """Dataset over a CSV file with 'Question', 'Model Answer' and 'Feedback' columns."""

    def __init__(self, csv_file):
        """Read ``csv_file`` into a pandas DataFrame kept on ``self.data``."""
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(question, model_answer, feedback)`` tuple of row ``idx``."""
        # Positional row lookup, done once and reused for all three columns.
        row = self.data.iloc[idx]
        return row['Question'], row['Model Answer'], row['Feedback']




# CSV file holding the (question, model answer, feedback) rows.
csv_file = "rlhf_dataset.csv"

# Number of samples per batch yielded by the dataloader.
batch_size = 1

# Load the CSV into the dataset wrapper and iterate over it in shuffled order.
rlhf_dataset = RLHFDataset(csv_file)
train_dataloader = DataLoader(
    dataset=rlhf_dataset,
    batch_size=batch_size,
    shuffle=True,
)


In [None]:
!pip install instruct-goose

Collecting instruct-goose
  Downloading instruct_goose-0.0.7-py3-none-any.whl (12 kB)
Collecting datasets (from instruct-goose)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb (from instruct-goose)
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting gymnasium (from instruct-goose)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting einops (from instruct-goose)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [3

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

import torch
from torch.utils.data import DataLoader, random_split
from torch import optim

from instruct_goose import Agent, RewardModel, RLHFTrainer, RLHFConfig, create_reference_model

In [None]:
# Checkpoint shared by the policy model, the reward model and the tokenizer.
MODEL_NAME = "gpt2"  # for demonstration purposes

model_base = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
reward_model = RewardModel(MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
eos_token_id = tokenizer.eos_token_id
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Wrap the base causal LM in instruct_goose's Agent; this wrapped model is what the
# RLHF trainer and the optimizer operate on in the cells below.
# NOTE(review): Agent presumably adds a value head on top of the LM - confirm
# against the instruct_goose documentation.
model = Agent(model_base)
# Build the reference model that is handed to RLHFTrainer alongside `model`.
# NOTE(review): presumably a frozen copy used as the PPO/KL baseline - confirm.
ref_model = create_reference_model(model)

In [None]:
# Sampling settings for text generation.
max_new_tokens = 20
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=max_new_tokens,
)

N_EPOCH = 1 # for demonstration purposes

# Trainer built from the policy model, the reference model and the default config.
config = RLHFConfig()
trainer = RLHFTrainer(model, ref_model, config)

# NOTE(review): the training cell re-creates `optimizer` as Adam before the loop,
# so this SGD instance appears to be superseded - confirm which one is intended.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
import torch

# Define the number of epochs
N_EPOCH = 1

# Initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop: one PPO-style update per batch of (question, answer, feedback).
for epoch in range(N_EPOCH):
    for batch in train_dataloader:
        # Distinct names so the notebook-level `questions` list is not overwritten.
        batch_questions, batch_answers, batch_feedbacks = batch

        # Tokenize the questions (the queries)
        inputs = tokenizer(list(batch_questions), padding=True, truncation=True, return_tensors="pt")

        # Tokenize the model answers (the responses)
        responses = tokenizer(list(batch_answers), padding=True, truncation=True, return_tensors="pt")

        # Convert feedback to reward signals: +1 for "yes", -1 for anything else.
        # str() guards against non-string values (an empty CSV cell is read back
        # by pandas as NaN, which has no .lower()).
        rewards = torch.tensor(
            [1 if str(fb).strip().lower() == 'yes' else -1 for fb in batch_feedbacks],
            dtype=torch.float32,
        )

        # Calculate PPO loss. Use the tokenizer's own attention mask for the
        # responses: an all-ones mask is only correct when nothing is padded and
        # would treat padding tokens as real text once batch_size > 1.
        loss = trainer.compute_loss(
            query_ids=inputs["input_ids"],
            query_attention_mask=inputs["attention_mask"],
            response_ids=responses["input_ids"],
            response_attention_mask=responses["attention_mask"],
            rewards=rewards
        )

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch+1}/{N_EPOCH}], Loss: {loss.item():.4f}")


Epoch [1/1], Loss: 1.8928
Epoch [1/1], Loss: -0.0613
Epoch [1/1], Loss: 2.2478
Epoch [1/1], Loss: -1978.0355
Epoch [1/1], Loss: 67.9628
