In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import seaborn
import pandas as pd

In [None]:
#!pip install faiss-cpu numpy langchain-openai langchain-community sentence_transformers typing

In [None]:
import os
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer, InputExample, losses
from typing import List
import re
from datasets import Dataset
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_dataset
import evaluate
from sklearn.model_selection import train_test_split

2024-12-13 14:09:40.962339: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
finetuning_dataset = pd.read_csv('rag_finetuning_questions_large.csv')

In [None]:
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [None]:
embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
questions = finetuning_dataset["Questions"]
answers = finetuning_dataset["Answers"]

In [None]:
questions = [str(q) for q in finetuning_dataset["Questions"] if pd.notna(q)]
answers = [str(a) for a in finetuning_dataset["Answers"] if pd.notna(a)]

In [None]:
formatted_texts = []
for i in range(len(questions)):
        text = f"Question: {questions[i].strip()} Answer: {answers[i].strip()}"
        formatted_texts.append({"text": text})

In [None]:
formatted_texts = pd.DataFrame(formatted_texts)

In [None]:
formatted_texts['labels'] = formatted_texts['text']

In [None]:
formatted_texts

Unnamed: 0,text,labels
0,Question: Q: What are the key differences betw...,Question: Q: What are the key differences betw...
1,Question: Q: How does crop rotation benefit so...,Question: Q: How does crop rotation benefit so...
2,Question: Q: Explain the Green Revolution's im...,Question: Q: Explain the Green Revolution's im...
3,Question: Q: What are the primary differences ...,Question: Q: What are the primary differences ...
4,Question: Q: How does the Doppler effect help ...,Question: Q: How does the Doppler effect help ...
...,...,...
145,Question: What is the significance of factorin...,Question: What is the significance of factorin...
146,Question: How does feedback regulation maintai...,Question: How does feedback regulation maintai...
147,Question: What is the principle behind electro...,Question: What is the principle behind electro...
148,Question: How do hash tables optimize data ret...,Question: How do hash tables optimize data ret...


In [None]:
#finetuning_dataset = Dataset.from_pandas(formatted_texts)

In [None]:
#print(finetuning_dataset)

In [None]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
text_tokenized = tokenizer(
    formatted_texts['text'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=700,
    return_tensors='pt'
)

In [None]:
text_tokenized['labels'] = text_tokenized['input_ids'].clone()

In [None]:
finetuning_dataset = Dataset.from_dict({
    'input_ids': text_tokenized['input_ids'],
    'attention_mask': text_tokenized['attention_mask'],
    'labels': text_tokenized['labels']
})

In [None]:
# text_tokenized = finetuning_dataset.map(
#      lambda x: tokenizer(finetuning_dataset['text'], truncation=True, padding="max_length", max_length=600, return_tensors=None),
#      lambda x: tokenizer(finetuning_dataset['labels'], truncation=True, padding="max_length", max_length=600, return_tensors=None),
#      batched=True,
#      remove_columns=finetuning_dataset.column_names
# )

In [None]:
print(finetuning_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 150
})


In [None]:
finetuning_train_test = finetuning_dataset.train_test_split(test_size = 0.3)

In [None]:
finetuning_train_test

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 105
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 45
    })
})

In [None]:
finetune_train = finetuning_train_test['train']
finetune_test = finetuning_train_test['test']

In [None]:
training_args = TrainingArguments(
    output_dir="./rag_trainer_3",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    eval_strategy="epoch",
    logging_steps=100,
    prediction_loss_only=True,
    overwrite_output_dir = True
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=finetune_train,
    eval_dataset=finetune_test
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,0.211066
2,No log,0.191953


TrainOutput(global_step=39, training_loss=0.3907907192523663, metrics={'train_runtime': 4456.9049, 'train_samples_per_second': 0.071, 'train_steps_per_second': 0.009, 'total_flos': 454732959744000.0, 'train_loss': 0.3907907192523663, 'epoch': 2.888888888888889})

In [None]:
model = model.from_pretrained('./rag_trainer_3/checkpoint-7')

In [None]:
simpletext_auto_documents = []
for st_file in os.listdir('./simpletext_auto'):
    text = open(f'/Users/davidlaszczkowski/Documents/4940_Grad_Project/simpletext_auto/{st_file}', "r")
    read_text = text.read()
    simpletext_auto_documents.append(read_text)

In [None]:
chunked_documents = []
for doc in simpletext_auto_documents:
    sentences = re.split(r'(?<=[.!?]) +', doc)
    i = 0
    while i < (len(sentences)):
        sentence_group = " ".join(sentences[i:i+5])
        chunked_documents.append(sentence_group)
        i += 1

In [None]:
embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
documents = chunked_documents

In [None]:
document_embeddings = embeddings_model.encode(documents, convert_to_tensor=True).cpu().numpy()
document_embeddings = np.array(document_embeddings).astype('float32')

In [None]:
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)

In [None]:
class SimpleRetriever:
    def __init__(self, index, documents: List[str]):
        self.index = index
        self.documents = documents
        self.embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    def retrieve(self, query: str, top_k: int = 3) -> List[str]:
        query_embedding = self.embeddings_model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        return [self.documents[i] for i in indices[0]]

In [None]:
class LLMWrapper:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def invoke(self, prompt: str, input_len) -> str:
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", max_length=input_len, truncation=True)
            prompt_length = inputs.input_ids.shape[1]
            outputs = self.model.generate(
                **inputs,
                max_new_tokens = 100,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            generated_tokens = outputs[0][prompt_length:]
            decoded_output = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
            return decoded_output

        except IndexError as e:
            print(f"IndexError occurred: {str(e)}")
            print("This error typically occurs when input sequence length exceeds model's position embedding limit")
            return "Error generating response - input may be too long"
        except Exception as e:
            print(f"Unexpected error occurred: {str(e)}")
            return "Error generating response"

In [None]:
class SimpleRAG:
    def __init__(self, llm, retriever):
        self.llm = llm
        self.retriever = retriever

#     def generate(self, query: str) -> str:
#         retrieved_docs = self.retriever.retrieve(query)
#         augmented_query = f"{retrieved_docs} Question: {query}\nAnswer:"
#         response = self.llm.invoke(augmented_query)
#         return response
    def generate(self, query: str) -> str:
        docs = self.retriever.retrieve(query)
        joined_docs = " ".join(docs)
        #7000 for Pythia, 3000 for gpt-2
        shortened_docs = joined_docs[:3000]

        prompt = f"""Use the following information to help answer the question,
    but respond in your own words without quoting the sources directly: {shortened_docs}.
    Make sure your answer is true according to the provided information.
    Think carefully about your answer and make it concise but fully answer the question.
    Question: {query}
    """


        response = self.llm.invoke(prompt = prompt, input_len = len(prompt))
        return response
# Make sure your answer is true according to the provided information.
# Think carefully about your answer and make it concise but fully answer the question.

In [None]:
retriever = SimpleRetriever(index, documents)
llm = LLMWrapper(model, tokenizer)  # Using your existing model and tokenizer
rag = SimpleRAG(llm, retriever)

In [None]:
query = "What is the primary benefit of precision agriculture?"
response = rag.generate(query)
print(response)

Answer: Precision agriculture uses precise seed production to produce precision genotypes with improved growth and performance traits. It produces seeds with greater seed vigour but lower seed moisture, and is cheaper than other technologies when compared to in-field seed production.


In [None]:
#stop

NameError: name 'stop' is not defined

In [None]:
def run_query(query):
    print(query)
    print('\n')
    response = rag.generate(query)
    print(response)
    print('\n')
    print('-----------------------------------')

In [None]:
question_set = ["What is the primary benefit of precision agriculture?",
"What is a black hole?",
"What is the role of ribosomes in a cell?",
"What is the significance of the periodic table in chemistry?",
"What is the function of an algorithm in computer science?",
"What causes volcanic eruptions?",
"What is the difference between civil and mechanical engineering?",
"What is the concept of 'alloying' in materials science?",
"What is the Pythagorean theorem?",
"What is the function of white blood cells in the immune system?",
"How does crop rotation benefit soil health?",
"What is the Hubble Space Telescope used for?",
"What is the function of mitochondria in cells?",
"What is an ionic bond?",
"What is machine learning?",
"What are the three main types of rocks in the rock cycle?",
"What is the principle behind hydraulic systems?",
"What is the purpose of heat treatment in materials science?",
"What is a derivative in calculus?",
"What is the difference between a virus and a bacterium?",
"What is sustainable farming?",
"What is the concept of the 'Big Bang'?",
"What is photosynthesis?",
"What is the role of catalysts in chemical reactions?",
"What is the difference between a compiler and an interpreter?",
"What is the difference between weather and climate?",
"What is an electrical circuit?",
"What is the concept of nanotechnology?",
"What is the difference between an exothermic and endothermic reaction?",
"What is the difference between supervised and unsupervised learning in machine learning?",
"What causes earthquakes?",
"What is the difference between AC and DC in electrical engineering?",
"What is the role of polymers in materials science?",
"What is a matrix and how is it used in mathematics?",
"What is the function of the liver in the human body?"]

In [None]:
for question in question_set:
    run_query(question)

In [None]:
# def tokenize_qa(examples):
# #     q_encodings = tokenizer(
# #         examples['q'],
# #         truncation=True,
# #         padding='max_length',
# #         max_length=64,
# #         return_tensors=None
# #     )

#     a_encodings = tokenizer(
#         examples['a'],
#         truncation=True,
#         padding='max_length',
#         max_length=64,
#         return_tensors=None
#     )
#     return {
# #         'input_ids': q_encodings['input_ids'],
# #         'attention_mask': q_encodings['attention_mask']
#         'input_ids': a_encodings['input_ids'],
#         'attention_mask': a_encodings['attention_mask']
#     }

In [None]:
# tokenized_dataset = finetuning_dataset.map(
#     tokenize_qa,
#     batched=True,
#     remove_columns=finetuning_dataset.column_names
# )

In [None]:
# q_tokenized = finetuning_dataset.map(
#     lambda x: tokenizer(x['q'], truncation=True, padding="max_length", max_length=64, return_tensors=None),
#     batched=True
# )
# a_tokenized = finetuning_dataset.map(
#     lambda x: tokenizer(x['a'], truncation=True, padding="max_length", max_length=64, return_tensors=None),
#     batched=True
# )