# Document Loaders In LangChain

In [1]:
from langchain.document_loaders import TextLoader

In [2]:
# Read the local file 'nvda_news_1.txt' through LangChain's plain-text loader
loader = TextLoader(file_path="nvda_news_1.txt")
data = loader.load()

# Echo the result so the loaded content and its format can be inspected
data




In [3]:
# CSVLoader is LangChain's loader for CSV files
from langchain.document_loaders.csv_loader import CSVLoader

# Load 'movies.csv'; the 'title' column becomes each document's `source` metadata
loader = CSVLoader(file_path="movies.csv", source_column="title")
data = loader.load()

# Number of entries that were loaded
len(data)


9

In [4]:
# Inspect the metadata attached to the first loaded entry
data[0].metadata

{'source': 'K.G.F: Chapter 2', 'row': 0}

In [6]:
from langchain.document_loaders import UnstructuredURLLoader

In [7]:
# UnstructuredURLLoader is already imported by the previous cell, so it is not imported again here.
# Build a loader over the list of article URLs whose content should be fetched.
loader = UnstructuredURLLoader(
    urls=[
        "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
        "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html"
    ]
)


In [8]:
# Load the pages behind the configured URLs and count the resulting entries
data = loader.load()
len(data)

2

In [9]:
# Preview the first 100 characters of the first entry's text
data[0].page_content[0:100]

'English\n\nHindi\n\nGujarati\n\nSpecials\n\nTrending Stocks\n\nVedanta\xa0INE205A01025, VEDL, 500295\n\nSuzlon Ener'

In [10]:
# Inspect the metadata of the first entry loaded from the URLs
data[0].metadata

{'source': 'https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html'}

# Text Splitters

In [11]:
# Sample passage about the film Interstellar, taken from Wikipedia.
# It contains both single newlines and blank lines, and serves as the input for the splitting examples that follow.

text = """Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. 
Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.

Interstellar premiered in Los Angeles on October 26, 2014. In the United States, it was first released on film stock, expanding to venues using digital projectors. The film received generally positive reviews from critics and grossed over $677 million worldwide ($715 million after subsequent re-releases), making it the tenth-highest-grossing film of 2014. 
It has been praised by astronomers for its scientific accuracy and portrayal of theoretical astrophysics.[5][6][7] Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades."""

In [12]:
# Suppose the LLM's input limit is 100. The simplest approach is to keep only the beginning of the text.
# Note: this slice keeps the first 100 characters (not tokens), so it can cut a word in half.

text[0:100]

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher N'

In [13]:
# We want complete words, and we want to cover the entire text, so split it into words with Python's split function

words = text.split(" ")
len(words)

264

In [14]:
# Naive word-based chunker: keep appending words (each followed by a space) to the
# current chunk and start a new chunk once it has grown past 200 characters.
# Because the length is checked only after a word is added, every full chunk ends
# slightly over 200 characters.
chunks = []

s = ""
for word in words:
    s += word + " "
    if len(s) > 200:
        chunks.append(s)
        s = ""

# Flush the leftover words. Skip this when the final word closed a chunk exactly,
# otherwise an empty string would be appended as a bogus last chunk.
if s:
    chunks.append(s)

In [15]:
# Show the first two chunks produced by the word-accumulation loop
chunks[:2]

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt ',
 'Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in ']

In [16]:
# Import the CharacterTextSplitter class from the langchain package, which splits text on a single separator
from langchain.text_splitter import CharacterTextSplitter

# Split only on newlines, targeting chunks of at most 200 characters with no overlap between chunks.
# NOTE: chunk_size is a target, not a hard cap. A line longer than 200 characters is kept whole here,
# so split_text reports "Created a chunk of size N, which is longer than the specified 200" for such lines.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=0
)


In [17]:
# Split the sample text with the newline-based splitter and count the resulting chunks
chunks = splitter.split_text(text)
len(chunks)

Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 358, which is longer than the specified 200


9

In [18]:
# Print the character count of every chunk, one per line
for size in map(len, chunks):
    print(size)

105
120
210
181
197
207
128
357
253


In [19]:
# Display the raw string so the "\n" and "\n\n" separators are visible
text

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n\nBrothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. \nKip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. \nCinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place 

In [20]:
# RecursiveCharacterTextSplitter splits text using several separators, tried in order
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the splitter:
#   - chunks of up to 200 characters, measured with len()
#   - no characters shared between consecutive chunks
#   - split on blank lines ("\n\n") first, then single newlines ("\n"), then spaces (" ")
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    length_function=len,
    separators=["\n\n", "\n", " "],
)


In [21]:
# Split the sample text with the recursive splitter
chunks = r_splitter.split_text(text)

# Print the character count of every resulting chunk, one per line
for size in map(len, chunks):
    print(size)

105
120
199
10
181
197
198
8
128
191
165
198
54


In [22]:
# Walk through the recursive split by hand: cut on "\n\n" (the first separator given to r_splitter)
# and keep the first paragraph
first_split = text.split("\n\n")[0]
first_split

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.'

In [23]:
# Length of the first paragraph, to compare against the 200-character chunk_size
len(first_split)

439

In [24]:
# Apply the next separator, "\n", to break the first paragraph into individual lines
second_split = first_split.split("\n")
second_split

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. ',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. ',
 'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.']

In [25]:
# Print how many characters each line of the first paragraph contains
for n_chars in map(len, second_split):
    print(n_chars)

106
121
210


In [26]:
# Look at the third line of the first paragraph
second_split[2]

'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.'