### We will build, test, and fine-tune all the models in this notebook

#### To use this notebook, create a virtual environment and run `pip install -r requirements.txt` before running the cells

# Table of Contents

### 1. [Data scraping](#example)
### 2. [VectorStore](#VectorStore)
### 3. [Using the Model](#Model)
### 4. [Fine-tuning](#Fine-tuning)




In [8]:
#Import everything
import langchain
from openai import OpenAI
import os
import faiss
import openai
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
import pickle
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import json
import pandas as pd
import regex as re

In [4]:
# Paste your OpenAI API key between the quotes below, or leave the string empty
# to keep a key that is already exported in the environment.
# Do not commit a real key together with the notebook.
openai_api_key = ""

# Only write to the environment when a key was actually supplied: assigning the
# empty placeholder would erase a valid OPENAI_API_KEY set outside the notebook.
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key

## VectorStore

Building a vectorstore from scratch

In [7]:
# Read the source CSV with the LangChain CSV loader (swap in your own file here).
loader = CSVLoader(file_path="FinalOutput.csv")
data = loader.load()

# Split the loaded documents on newlines into chunks of up to 1000 characters,
# with a 200-character overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(data)

# Embed the chunks with OpenAI embeddings and build a new FAISS store from them.
embeddings = OpenAIEmbeddings()
VectorStore_openAI = FAISS.from_documents(docs, embeddings)

# Persist the new store locally under the name 'faiss_store_demo'.
VectorStore_openAI.save_local("faiss_store_demo")

Loading the VectorStore

In [None]:
# Reload the store saved as "faiss_store_demo" with the embeddings object
# created when the store was built.
# allow_dangerous_deserialization=True is needed because the saved store is
# unpickled on load; only enable it for stores you created yourself.
database = FAISS.load_local(
    "faiss_store_demo",
    embeddings,
    allow_dangerous_deserialization=True,
)

Adding new data to an existing VectorStore

In [None]:
# Merge two saved LangChain FAISS stores into a new one.
# Merging through the LangChain wrapper (instead of copying raw vectors between
# faiss indexes) also carries over the stored documents and their id mapping,
# so the merged store can be reopened with FAISS.load_local.
# NOTE(review): assumes both folders were written by FAISS.save_local
# (index.faiss + index.pkl) -- confirm for your data.
# allow_dangerous_deserialization=True unpickles the saved stores; only use it
# on stores you created yourself.
merge_embeddings = OpenAIEmbeddings()
base_store = FAISS.load_local(
    "./faiss_store",
    merge_embeddings,
    allow_dangerous_deserialization=True,
)
extra_store = FAISS.load_local(
    "./faiss_store2",
    merge_embeddings,
    allow_dangerous_deserialization=True,
)

# Vectors of different dimensionality cannot live in one index, so stop here
# instead of merely printing the comparison.
if base_store.index.d != extra_store.index.d:
    raise ValueError(
        "Cannot merge vector stores with different dimensions: "
        f"{base_store.index.d} != {extra_store.index.d}"
    )

# Add everything from the second store to the first one (in memory).
base_store.merge_from(extra_store)

# Write the merged store (index.faiss and index.pkl) to its own folder.
base_store.save_local("./faiss_store_merged")


## Fine-tuning

In [None]:
client = OpenAI()

# Upload the training data first. The context manager closes the file handle
# as soon as the upload call returns.
with open("transformed_interactions.jsonl", "rb") as training_data:
    training_file = client.files.create(
        file=training_data,
        purpose="fine-tune",
    )
# training_file.id is the id assigned by OpenAI, e.g. 'file-ZsAaCWc0D5ceTsqEgr3ZdZbv'

# Create the fine-tuning job from the file uploaded above and select the base
# model (this could also be an already fine-tuned model).
fine_tune_job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo",
)
# fine_tune_job.id is the job id from OpenAI, e.g. 'ftjob-BIOaKlGG3oALplrJCFarIu5N'

# Retrieve the job status; as the last expression of the cell it is displayed.
client.fine_tuning.jobs.retrieve(fine_tune_job.id)

## Model

In [10]:
from langchain import chat_models
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


Setting up the langchain model

In [13]:
# Vector store the chain retrieves context from.
# allow_dangerous_deserialization=True lets load_local read the pickled part of
# the saved store; only use it on stores you created yourself.
VectorStore = FAISS.load_local(
    "VectorStores/faiss_store_merged2",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
retriever = VectorStore.as_retriever()

# Chat model used to answer the questions.
llm = chat_models.ChatOpenAI(model="gpt-4o")

# Chat-history memory: kept under "chat_history" as message objects, recording
# the chain's "answer" output.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

# Assemble the conversational retrieval chain; get_chat_history passes the
# stored history through unchanged.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
    get_chat_history=lambda h: h,
)

Calling the model

In [15]:
import warnings

# Hide DeprecationWarning messages so they do not clutter the cell outputs below.
warnings.simplefilter("ignore", DeprecationWarning)

In [16]:
# Ask the chain a question; return_only_outputs=True limits the result to the
# chain's outputs (the answer).
# invoke() replaces calling the chain object directly, which LangChain has deprecated.
qa_chain.invoke({"question": "What is LCD?"}, return_only_outputs=True)

{'answer': 'In the context of the course information provided, "LCD" stands for "Liberal & Civic Discourse."'}