In [1]:
from langchain_community.document_loaders import DirectoryLoader

In [6]:
# Collect every .txt file under content/ (the ** glob descends into
# sub-directories), with the loader's multithreading switched on.
loader = DirectoryLoader(
    "content",
    glob="**/*.txt",
    use_multithreading=True,
)

In [7]:
# Run the loader configured above; the recorded run reports 12 loaded items.
docs = loader.load()

In [8]:
# Sanity check: how many documents the loader returned.
len(docs)

12

In [9]:
# Read one file of the corpus into a single string; it is long enough to be
# worth splitting into chunks.
# NOTE(review): the name `state_of_the_union` looks like a holdover from a
# tutorial; the recorded model output at the end of this notebook is a
# research-paper abstract. The name is kept because later cells reference it.
# The explicit encoding makes the read independent of the platform default
# (assumes the corpus files are UTF-8 - confirm).
with open("content/content-0.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter settings: chunk_size=1000 with chunk_overlap=300, both measured
# with the built-in len(). These are moderate values, not a tiny demo size.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=300,
    chunk_size=1000,
)

In [20]:
# Split the single string read from content-0.txt into chunk documents
# using the splitter configured earlier.
texts = text_splitter.create_documents([state_of_the_union])
# Number of chunks produced (35 in the recorded run).
len(texts)

35

In [29]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Embed each chunk in `texts` with OpenAIEmbeddings and store the result in a
# Chroma vector store. Only the chunks derived from content-0.txt are indexed
# here; the `docs` returned by the directory loader are not passed in.
# NOTE(review): TextLoader and CharacterTextSplitter are imported in this cell
# but are not used by it.
db = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(),
)

In [50]:
from langchain_openai import ChatOpenAI

# Chat model built with the library defaults: no model name or sampling
# settings are passed here.
# NOTE(review): for verbatim extraction, consider pinning the model and
# temperature explicitly so reruns are comparable - confirm desired values.
llm = ChatOpenAI()

In [59]:
from langchain_core.prompts import ChatPromptTemplate
# Three-message chat prompt: the raw text is injected through {data_string},
# a second system message states the task, and the user turn carries
# {question}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You have a long string of data: {data_string}"),
        ("system", "Answer the following question based on the data:"),
        ("user", "{question}"),
    ]
)

In [60]:
from langchain_core.output_parsers import StrOutputParser

# Final stage of the chain; the recorded result of the chain is a plain string.
output_parser = StrOutputParser()

In [61]:
# Compose the pipeline: prompt template -> chat model -> output parser.
chain = prompt | llm | output_parser

In [62]:
# Ask the model to return the abstract verbatim, passing the full text of
# content-0.txt as the prompt's {data_string}.
# The question previously contained the typo "chaing"; it is spelled out
# correctly so the instruction sent to the model is unambiguous.
chain.invoke(
    {
        "data_string": state_of_the_union,
        "question": "extract the abstract without changing anything",
    }
)

'The purpose of this study is to present an exhaustive analysis on research paper recommender systems which have become very popular and gained a lot of research attention. Though the major focus is on developing new recommendation algorithms, other research dimensions are left untouched. Renown recommendation classes include content-based approaches, collaborative filtering, link-based algorithms, co-occurrence based approaches, global relevance and hybrid methods. These approaches mainly differ in background knowledge and modes of user behavior analysis. For instance, content-based filtering uses paper descriptions which are mostly word-based features. Collaborative filtering makes predictions based on peers’ interests. Link-based algorithms utilize academic associations that exist between different entities in academia. Co-occurrence based techniques incorporate event occurrences to locate related papers. Global relevance adopts a ‘one-for-all’ policy for recommending popular articl