#### Data Ingestion - Document Loaders

In [1]:
# Text loader: wrap the local speech file so it can be loaded as documents.

from langchain_community.document_loaders import TextLoader

# Pin the encoding so decoding does not depend on the platform's default
# locale (which differs between operating systems).
loader = TextLoader('speech.txt', encoding='utf-8')

In [2]:
# Read the file through the loader; the recorded output shows a list of
# Document objects whose metadata carries the 'source' path.
text_docs = loader.load()
# Bare last expression so the notebook displays the loaded documents.
text_docs

[Document(metadata={'source': 'speech.txt'}, page_content='Lorem ipsum dolor sit amet. Quo rerum nostrum a deleniti Quis aut culpa recusandae sed laudantium consequatur ut \nfugit asperiores aut illo omnis! Qui neque architecto est nostrum Quis eos doloremque dignissimos in repellat \nquidem sit dolorem adipisci ut vitae magni. Et minima omnis aut fugit adipisci a consequuntur illo vel laborum \ndicta est culpa aspernatur in enim libero vel neque maiores. Id eveniet ducimus aut illum amet est dignissimos \ndolorem qui dignissimos iure et tenetur magnam 33 eveniet internos. Est distinctio rerum ea omnis placeat hic\ndolor culpa sit pariatur dolor et soluta tempora est voluptatem voluptatibus est provident quisquam. Eum voluptates\nculpa in voluptates expedita et autem Quis qui totam accusantium aut nisi odio rem consequatur dicta. \nEt neque internos eos nesciunt saepe nam culpa corrupti. Vel molestias assumenda id perferendis sapiente et \nplaceat nihil. Et culpa odit ut corrupti optio

In [3]:
# PDF loader: read a local PDF file with PyPDFLoader.

from langchain_community.document_loaders import PyPDFLoader

pdf_loader = PyPDFLoader(file_path='sample_pdf.pdf')
# The recorded output shows Document entries carrying 'source' and 'page' metadata.
pdf_doc = pdf_loader.load()
pdf_doc

[Document(metadata={'source': 'sample_pdf.pdf', 'page': 0}, page_content=' \nDr. Gunjan Chugh, Assistant Professor, MAIT \n \nUNIT 4 \nConvolutional Neural Networks: Convolutional Neural Networks, Building \nblocks of CNN, Transfer Learning , Pooling Layers , Convolutional Neural \nNetwork Architectures. Well known case studies: LeNet, AlexNet, VGG-16, \nResNet, Inception Net. Applications in Vision, Speech, and Audio-Video. \n \nINTRODUCTION TO CNN \nA convolutional neural network (CNN), is a network architecture for deep learning which learns \ndirectly from data.  CNNs are particularly useful for finding patterns in images to recognize \nobjects. They can also be quite effective for classifying non -image data such as audio, time \nseries, and signal data. \nCNNs are a class of Deep Neural Networks that can recognize and classify particular \nfeatures from images and are widely used for analyzing visual images. Their \napplications range from image and video recognition, image class

In [4]:
# Web based loader
import os

# Identify our HTTP requests. The recorded run logged
# "USER_AGENT environment variable not set" because nothing had set it.
# This must happen before the loader import below; setdefault keeps any
# value that has already been exported in the environment.
os.environ.setdefault('USER_AGENT', 'langchain-document-loader-notebook')

from langchain_community.document_loaders import WebBaseLoader

web_loader = WebBaseLoader(
    web_paths=('https://lilianweng.github.io/posts/2023-06-23-agent/',),
)
# Bare last expression: fetch the page and display the resulting documents.
web_loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final resu

In [5]:
import bs4

# Restrict parsing to elements whose class is 'post-title'; the recorded
# output contains only the post's title text.
title_only = bs4.SoupStrainer(class_='post-title')
web_loader2 = WebBaseLoader(
    web_paths=('https://lilianweng.github.io/posts/2023-06-23-agent/',),
    bs_kwargs={'parse_only': title_only},
)
web_loader2.load()

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n      LLM Powered Autonomous Agents\n    ')]

In [8]:
from langchain_community.document_loaders import ArxivLoader

# Query arXiv for '1706.03762'; the recorded result is the paper
# "Attention Is All You Need".
arxiv_loader = ArxivLoader(query='1706.03762', load_max_docs=2)
docs = arxiv_loader.load()
docs

[Document(metadata={'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntr

In [12]:
from langchain_community.document_loaders import WikipediaLoader

# Fetch at most one Wikipedia article matching the query.
wiki_source = WikipediaLoader(query='Generative AI', load_max_docs=1)
# NOTE: despite its name, `wiki_loader` holds the loaded documents (the
# result of .load()), not the loader object itself.
wiki_loader = wiki_source.load()
wiki_loader

[Document(metadata={'title': 'Generative artificial intelligence', 'summary': 'Generative artificial intelligence (generative AI, GenAI, or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data. These models learn the underlying patterns and structures of their training data and use them to produce new data based on the input, which often comes in the form of natural language prompts.  \nImprovements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini, and LLaMA; text-to-image artificial intelligence image generation systems such as Stable Diffusion, Midjourney, and DALL-E; and text-to-video AI generators such as Sora. Companies such as OpenAI, Anthropic, Microsoft, Google, and Baidu as well as numerous smaller firms have developed generative AI models.\nGenerat

#### Recursively splitting text by characters

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the arXiv documents using chunk_size=500 with a 50 overlap between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
final = text_splitter.split_documents(docs)

# Show the first two chunks, separated by a dashed line.
first_chunk, second_chunk = final[0], final[1]
print(first_chunk, '------------', second_chunk, sep='\n')

page_content='Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu' metadata={'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple netwo

In [15]:
# Re-display the documents loaded from 'speech.txt' before splitting them.
text_docs

[Document(metadata={'source': 'speech.txt'}, page_content='Lorem ipsum dolor sit amet. Quo rerum nostrum a deleniti Quis aut culpa recusandae sed laudantium consequatur ut \nfugit asperiores aut illo omnis! Qui neque architecto est nostrum Quis eos doloremque dignissimos in repellat \nquidem sit dolorem adipisci ut vitae magni. Et minima omnis aut fugit adipisci a consequuntur illo vel laborum \ndicta est culpa aspernatur in enim libero vel neque maiores. Id eveniet ducimus aut illum amet est dignissimos \ndolorem qui dignissimos iure et tenetur magnam 33 eveniet internos. Est distinctio rerum ea omnis placeat hic\ndolor culpa sit pariatur dolor et soluta tempora est voluptatem voluptatibus est provident quisquam. Eum voluptates\nculpa in voluptates expedita et autem Quis qui totam accusantium aut nisi odio rem consequatur dicta. \nEt neque internos eos nesciunt saepe nam culpa corrupti. Vel molestias assumenda id perferendis sapiente et \nplaceat nihil. Et culpa odit ut corrupti optio

In [16]:
# Read the raw speech text into a plain string (no Document wrapper).
# The encoding is explicit so the result does not depend on the platform's
# default locale. There is deliberately no `speech = ''` pre-assignment: if
# the read fails, later cells should not silently work on an empty string.
with open('speech.txt', encoding='utf-8') as f:
    speech = f.read()

# Bare last expression so the notebook displays the text.
speech

'Lorem ipsum dolor sit amet. Quo rerum nostrum a deleniti Quis aut culpa recusandae sed laudantium consequatur ut \nfugit asperiores aut illo omnis! Qui neque architecto est nostrum Quis eos doloremque dignissimos in repellat \nquidem sit dolorem adipisci ut vitae magni. Et minima omnis aut fugit adipisci a consequuntur illo vel laborum \ndicta est culpa aspernatur in enim libero vel neque maiores. Id eveniet ducimus aut illum amet est dignissimos \ndolorem qui dignissimos iure et tenetur magnam 33 eveniet internos. Est distinctio rerum ea omnis placeat hic\ndolor culpa sit pariatur dolor et soluta tempora est voluptatem voluptatibus est provident quisquam. Eum voluptates\nculpa in voluptates expedita et autem Quis qui totam accusantium aut nisi odio rem consequatur dicta. \nEt neque internos eos nesciunt saepe nam culpa corrupti. Vel molestias assumenda id perferendis sapiente et \nplaceat nihil. Et culpa odit ut corrupti optio a suscipit accusantium id aliquam nostrum sit consequatur

In [17]:
# Rebind `text_splitter` to a much smaller configuration (chunk_size=75,
# chunk_overlap=40); the recorded output shows neighbouring chunks sharing
# trailing/leading words.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=40, chunk_size=75)

# create_documents takes a list of raw strings rather than Document objects.
raw_texts = [speech]
text = text_splitter.create_documents(raw_texts)
text

[Document(metadata={}, page_content='Lorem ipsum dolor sit amet. Quo rerum nostrum a deleniti Quis aut culpa'),
 Document(metadata={}, page_content='rerum nostrum a deleniti Quis aut culpa recusandae sed laudantium'),
 Document(metadata={}, page_content='aut culpa recusandae sed laudantium consequatur ut'),
 Document(metadata={}, page_content='fugit asperiores aut illo omnis! Qui neque architecto est nostrum Quis eos'),
 Document(metadata={}, page_content='neque architecto est nostrum Quis eos doloremque dignissimos in repellat'),
 Document(metadata={}, page_content='quidem sit dolorem adipisci ut vitae magni. Et minima omnis aut fugit'),
 Document(metadata={}, page_content='vitae magni. Et minima omnis aut fugit adipisci a consequuntur illo vel'),
 Document(metadata={}, page_content='fugit adipisci a consequuntur illo vel laborum'),
 Document(metadata={}, page_content='dicta est culpa aspernatur in enim libero vel neque maiores. Id eveniet'),
 Document(metadata={}, page_content='liber

In [19]:
from langchain_text_splitters import CharacterTextSplitter

# NOTE(review): the recorded run logged "Created a chunk of size 1572, which
# is longer than the specified 50" (and similar) for this configuration, so
# the pieces produced by the '\n\n' separator exceed chunk_size here.
character_splitter = CharacterTextSplitter(
    separator='\n\n',
    chunk_size=50,
    chunk_overlap=20,
)
char_splitted = character_splitter.split_documents(text_docs)
char_splitted

Created a chunk of size 1572, which is longer than the specified 50
Created a chunk of size 1116, which is longer than the specified 50
Created a chunk of size 1319, which is longer than the specified 50
Created a chunk of size 1318, which is longer than the specified 50


[Document(metadata={'source': 'speech.txt'}, page_content='Lorem ipsum dolor sit amet. Quo rerum nostrum a deleniti Quis aut culpa recusandae sed laudantium consequatur ut \nfugit asperiores aut illo omnis! Qui neque architecto est nostrum Quis eos doloremque dignissimos in repellat \nquidem sit dolorem adipisci ut vitae magni. Et minima omnis aut fugit adipisci a consequuntur illo vel laborum \ndicta est culpa aspernatur in enim libero vel neque maiores. Id eveniet ducimus aut illum amet est dignissimos \ndolorem qui dignissimos iure et tenetur magnam 33 eveniet internos. Est distinctio rerum ea omnis placeat hic\ndolor culpa sit pariatur dolor et soluta tempora est voluptatem voluptatibus est provident quisquam. Eum voluptates\nculpa in voluptates expedita et autem Quis qui totam accusantium aut nisi odio rem consequatur dicta. \nEt neque internos eos nesciunt saepe nam culpa corrupti. Vel molestias assumenda id perferendis sapiente et \nplaceat nihil. Et culpa odit ut corrupti optio

##### HTML header text splitter


In [20]:
# Sample HTML document used to demonstrate header-based splitting.
# Every <p> element is explicitly closed so the markup is well-formed and
# does not rely on the parser's recovery rules.
html_str = '''
<!DOCTYPE html>
<html>
<body>
    <h1> Hello Everyone</h1>
    <p>Welcome to Langchain tutorial</p>
    <div>
        <h3> THis is a python tutorial notebook on langchain</h3>
        <p> Hope this will be informative for you all</p>
    </div>
</body>
</html>
'''

In [21]:
# Pair each heading tag (h1 through h4) with the metadata key under which
# its text is recorded ('Header 1' through 'Header 4').
headers_to_split_on = [(f'h{level}', f'Header {level}') for level in range(1, 5)]

In [23]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# Split the sample HTML at the configured heading tags; in the recorded
# output each resulting Document's metadata lists the headers it sits under.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splitted = html_splitter.split_text(html_str)
html_header_splitted

[Document(metadata={'Header 1': 'Hello Everyone'}, page_content='Welcome to Langchain tutorial  \nTHis is a python tutorial notebook on langchain'),
 Document(metadata={'Header 1': 'Hello Everyone', 'Header 3': 'THis is a python tutorial notebook on langchain'}, page_content='Hope this will be informative for you all')]