In [None]:
%load_ext dotenv
%dotenv

### data loader

In [2]:
# note this might be sensitive to version
# pip install langchain==0.1.6 
# pip install langchain-community==0.0.19
from langchain.document_loaders.csv_loader import CSVLoader

In [3]:
loader = CSVLoader(file_path='./data.csv')
data = loader.load()

In [5]:
data[0].page_content

"\ufeffdate: 1/1/2024\nweather: sunny\nnotes: new year's day\nmood: happy"

In [6]:
# directly read data from URL
from langchain.document_loaders import UnstructuredURLLoader

In [7]:
from langchain.document_loaders import PyPDFLoader

In [8]:
loader = PyPDFLoader("JobShopSchedullingArticle.pdf")

In [9]:
documents = loader.load()

In [11]:
# for pdf, one doc is normally corresponding to 1 page
len(documents)

10

In [15]:
documents

[Document(page_content='Mathematical models for job-shop scheduling problems with routing\nand process plan ﬂexibility\nCemal Özgüvena,*, Lale Özbakırb, Yasemin Yavuza\naErciyes University, Faculty of Economics and Administrative Sciences, Department of Business, 38039 Kayseri, Turkey\nbErciyes University, Faculty of Engineering, Department of Industrial Engineering, 38039 Kayseri, Turkey\narticle info\nArticle history:\nReceived 31 October 2008Received in revised form 25 August 2009Accepted 1 September 2009Available online 4 September 2009\nKeywords:\nJob-shop schedulingRouting ﬂexibilityProcess plan ﬂexibilityMixed-integer programmingabstract\nAs a result of rapid developments in production technologies in recent years, ﬂexible job-\nshop scheduling problems have become increasingly signiﬁcant. This paper deals with twoNP-hard optimization problems: ﬂexible job-shop scheduling problems (FJSPs) that\nencompass routing and sequencing sub-problems, and the FJSPs with process plan ﬂexibi

### data transformer
- for example, split long doc into smaller segments to avoid prompt length limits
- in langchain, there are many built-in models to split/combine/filter docs

In [16]:
# a general purpose splitter
# split a large doc into small ones and then combine into med-sized chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # size of each segment
    chunk_size = 100,
    # note that each segement would have overlap with others to ensure coherent
    chunk_overlap = 20,
    length_function = len
)

In [17]:
pages = loader.load_and_split(text_splitter=text_splitter)

In [19]:
# recall that if we use .load() method, this number was 10
len(pages) 

627

In [21]:
documents[0]

Document(page_content='Mathematical models for job-shop scheduling problems with routing\nand process plan ﬂexibility\nCemal Özgüvena,*, Lale Özbakırb, Yasemin Yavuza\naErciyes University, Faculty of Economics and Administrative Sciences, Department of Business, 38039 Kayseri, Turkey\nbErciyes University, Faculty of Engineering, Department of Industrial Engineering, 38039 Kayseri, Turkey\narticle info\nArticle history:\nReceived 31 October 2008Received in revised form 25 August 2009Accepted 1 September 2009Available online 4 September 2009\nKeywords:\nJob-shop schedulingRouting ﬂexibilityProcess plan ﬂexibilityMixed-integer programmingabstract\nAs a result of rapid developments in production technologies in recent years, ﬂexible job-\nshop scheduling problems have become increasingly signiﬁcant. This paper deals with twoNP-hard optimization problems: ﬂexible job-shop scheduling problems (FJSPs) that\nencompass routing and sequencing sub-problems, and the FJSPs with process plan ﬂexibil

In [20]:
pages[0]

Document(page_content='Mathematical models for job-shop scheduling problems with routing\nand process plan ﬂexibility', metadata={'source': 'JobShopSchedullingArticle.pdf', 'page': 0})

### data embedding
- commonly used ones: openAI, cohere, Hugging face

In [22]:
from langchain.embeddings import OpenAIEmbeddings
# pip install tiktoken

embedding_model = OpenAIEmbeddings()

  warn_deprecated(


In [23]:
embeddings = embedding_model.embed_documents([pages[0].page_content])

In [25]:
len(embeddings), len(embeddings[0])
# len(embedding is 1 b/c we only embed 1 doc)

(1, 1536)

In [26]:
embeddings

[[-0.010542845645942591,
  -0.005306310167584167,
  -0.010856828953490624,
  0.0069913512172564,
  -0.01701089228194154,
  0.012098805383192406,
  -0.03153783170447951,
  0.004008514452163093,
  -0.024253430517765403,
  -0.03949206431982513,
  0.0009716024775428242,
  0.0013352992616339342,
  -0.0032479780230139713,
  0.009335756560853682,
  0.006255236162166938,
  0.01059168792840061,
  0.02001117225367998,
  0.003771282837102075,
  -0.004608570260246261,
  0.015615413398588329,
  -0.01751326445643127,
  0.019885579675718846,
  0.0279654048690256,
  -0.04169692214761613,
  0.003549750527116849,
  0.015043266534872856,
  0.02804913449609649,
  -0.016648067623258087,
  0.01655038492098725,
  -0.015503774827149742,
  0.005840080952225482,
  -0.01927156986111313,
  -0.0209740536517692,
  -0.002768282156861305,
  0.0025188402040164327,
  0.0038375680933745784,
  0.014317617217505954,
  -0.004741140772791268,
  0.010186998456181715,
  -0.007298356590220558,
  0.014317617217505954,
  0.01210

### vector storage
- stored the embedded vectors

In [27]:
from langchain.vectorstores import Chroma

In [28]:
db = Chroma.from_documents(pages, OpenAIEmbeddings())

### retriever
- given an input query, return the related documents 

In [33]:
query = 'objective function'

In [34]:
docs = db.similarity_search(query)
len(docs)

4

In [35]:
docs[0]

Document(page_content='The proposed mathematical model is deﬁned as follows:Objective function: Minimize C\nmax', metadata={'page': 2, 'source': 'JobShopSchedullingArticle.pdf'})