### We will build, test, and fine-tune all the models in this notebook

#### To use this notebook, create a virtual env and run pip install -r 'requirements.txt' before running the cells

# Table of Contents

### 1. [Datascrapping](#example)
### 2. [VectorStore](#VectorStore)
### 3. [Using the Model](#Model)
### 4. [Fine-tuning](#Fine-tuning)




In [8]:
#Import everything
import langchain
from openai import OpenAI
import os
import faiss
import openai
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
import pickle
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import json
import pandas as pd
import regex as re

In [4]:
#change this to ur OPENAI key here
os.environ["OPENAI_API_KEY"]=""

## VectorStore

Building a vectorstore from scratch

In [7]:
#Load your data first
loader = CSVLoader(file_path="FinalOutput.csv") #here's an example of a csv file
data=loader.load()

#chunking the data 
text_splitter = CharacterTextSplitter(separator='\n',
                                      chunk_size=1000,
                                      chunk_overlap=200)

docs = text_splitter.split_documents(data)

#initialize the openai embeddings system
embeddings = OpenAIEmbeddings()

#Creating a new vector store from the documents 
VectorStore_openAI = FAISS.from_documents(docs, embeddings)

#Saving the new vectorstore to be called 'faiss_store_demo'
VectorStore_openAI.save_local("faiss_store_demo")

Loading the VectorStore

In [None]:
database = FAISS.load_local("faiss_store_demo", embeddings)

Adding new data to an existing VectorStore

In [None]:
index1 = faiss.read_index("./faiss_store/index.faiss")
index2 = faiss.read_index("./faiss_store2/index.faiss")
print(index1.d == index2.d)
new_index = faiss.IndexFlatL2(index1.d)  # Example for a flat L2 index

# Add vectors from both indexes to the new index
# This step varies based on index type and might require extracting vectors differently
# Here, it's assumed you can directly add vectors
new_index.add(index1.reconstruct_n(0, index1.ntotal))
new_index.add(index2.reconstruct_n(0, index2.ntotal))
faiss.write_index(new_index,'./faiss_store_merged/index')


In [123]:
#Load your data first
loader = CSVLoader(file_path="/Users/ArcherLi/Desktop/Webstac_FL24.csv") #here's an example of a csv file
data=loader.load()

#chunking the data 
text_splitter = CharacterTextSplitter(separator='\n',
                                      chunk_size=1000,
                                      chunk_overlap=200)

docs = text_splitter.split_documents(data)

#initialize the openai embeddings system
embeddings = OpenAIEmbeddings()

#Creating a new vector store from the documents 
VectorStore_openAI = FAISS.from_documents(docs, embeddings)

#Saving the new vectorstore to be called 'faiss_store_demo'
VectorStore_openAI.save_local("./VectorStores/Webstac_FL24")


Created a chunk of size 1300, which is longer than the specified 1000
Created a chunk of size 1063, which is longer than the specified 1000
Created a chunk of size 1527, which is longer than the specified 1000
Created a chunk of size 1121, which is longer than the specified 1000
Created a chunk of size 1669, which is longer than the specified 1000
Created a chunk of size 1320, which is longer than the specified 1000
Created a chunk of size 1255, which is longer than the specified 1000
Created a chunk of size 1255, which is longer than the specified 1000
Created a chunk of size 1255, which is longer than the specified 1000
Created a chunk of size 1277, which is longer than the specified 1000
Created a chunk of size 1292, which is longer than the specified 1000
Created a chunk of size 1326, which is longer than the specified 1000
Created a chunk of size 1418, which is longer than the specified 1000
Created a chunk of size 1763, which is longer than the specified 1000
Created a chunk of s

## Fine-tuning

In [None]:
client = OpenAI()

#Create a training file first from the input
client.files.create(
  file=open("transformed_interactions.jsonl", "rb"),
  purpose="fine-tune"
)
'''
output: a training_file id from OpenAI, e.g.: 'file-ZsAaCWc0D5ceTsqEgr3ZdZbv'
'''

#Use the given training file id to create the finetuning job, and select the base model(could be a finetuned model)
client.fine_tuning.jobs.create(
  training_file='file-ZsAaCWc0D5ceTsqEgr3ZdZbv', 
  model="gpt-3.5-turbo"
)
'''
output: a job id form OpenAI, e.g.:'ftjob-BIOaKlGG3oALplrJCFarIu5N'
'''

#retrieve the job status
client.fine_tuning.jobs.retrieve('ftjob-BIOaKlGG3oALplrJCFarIu5N')

## Model

In [10]:
from langchain import chat_models
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


Setting up the langchain model

In [128]:
#Specifying the vector store to retrieve info from
VectorStore = FAISS.load_local("VectorStores/Webstac_FL24", OpenAIEmbeddings(),allow_dangerous_deserialization=True)
retriever = VectorStore.as_retriever()

#Specifying the model
llm  = chat_models.ChatOpenAI(model='gpt-4o',response_format={
    "type": "json_schema",
    "json_schema": {
  "name": "class_schedule",
  "schema": {
    "type": "object",
    "properties": {
      "schedule": {
        "type": "array",
        "description": "A complete table representation of the weekly class schedule. Monday thru Friday, 10AM to 6PM, in correct chronological order. If no event is scheduled, leave the time slot as 'nothing scheduled'",
        "items": {
          "type": "object",
          "properties": {
            "day": {
              "type": "string",
              "description": "The day of the week for the class.",
              "enum": [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday"
              ]
            },
            "time_slot": {
              "type": "string",
              "description": "The actual time slot for the class based on the stored info, could be 50 minutes, 80 minutes. or 150 minutes long. e.g. 10:00AM to 10:50AM",
              #              "description": "The time slot for the class, could be 50 minutes, 80 minutes. or 150 minutes long. e.g. 10:00AM to 10:50AM",
              #              "description": "The time slot for the class as an hour range.",
              # "enum": [
              #   "9:00 AM - 10:00 AM",
              #   "10:00 AM - 11:00 AM",
              #   "11:00 AM - 12:00 PM",
              #   "1:00 PM - 2:00 PM",
              #   "2:00 PM - 3:00 PM",
              #   "3:00 PM - 4:00 PM"
              # ]
            },
            "class_name": {
              "type": "string",
              "description": "The actual name of the class scheduled in this slot, based on information provided. e.g. Data Structures and Algorithms"
            },
            "class_code": {
              "type": "string",
              "description": "The actual code assigned to this class. It starts with the abbreviation of the department and then a number. for example, CSE 131."
            }
          },
          "required": [
            "day",
            "time_slot",
            "class_code",
            "class_name"
          ],
          "additionalProperties": False
        }
      }
    },
    "required": [
      "schedule"
    ],
    "additionalProperties": False
  },
  "strict": False
}})
#Enabling chat history memory
memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True, output_key='answer')
#model
qa_chain = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory,get_chat_history=lambda h : h)

                    response_format was transferred to model_kwargs.
                    Please confirm that response_format is what you intended.


Calling the model

In [15]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [130]:
qa_chain({"question":"I am a first year CS engineering student. I need to take 3 CS classes and 1 humanities class, and a writing class this semester, make me a weekly schedule. "},return_only_outputs=True)

{'answer': '{"schedule":[{"day":"Monday","time_slot":"10:00AM to 10:50AM","class_name":"Introduction to Computer Science","class_code":"CSE 131"},{"day":"Monday","time_slot":"11:00AM to 11:50AM","class_name":"Writing 101","class_code":"ENG 101"},{"day":"Monday","time_slot":"1:00PM to 2:20PM","class_name":"Data Structures and Algorithms","class_code":"CSE 247"},{"day":"Monday","time_slot":"2:30PM to 3:50PM","class_name":"Introduction to Philosophy","class_code":"PHIL 100"},{"day":"Tuesday","time_slot":"10:00AM to 11:20AM","class_name":"Discrete Mathematics","class_code":"CSE 240"},{"day":"Tuesday","time_slot":"1:00PM to 1:50PM","class_name":"Writing 101","class_code":"ENG 101"},{"day":"Wednesday","time_slot":"10:00AM to 10:50AM","class_name":"Introduction to Computer Science","class_code":"CSE 131"},{"day":"Wednesday","time_slot":"11:00AM to 11:50AM","class_name":"Writing 101","class_code":"ENG 101"},{"day":"Wednesday","time_slot":"1:00PM to 2:20PM","class_name":"Data Structures and Alg

In [129]:
qa_chain({"question":"MWF are too cramped with no classes on tuesday thursday, change it so that i have 3 classes on MWF, and 2 classes on Tues Thurs"},return_only_outputs=True)

KeyboardInterrupt: 