# Text Loaders and UnstructuredUrlLoaders

* Loading using loader classes in langchain

In [17]:
import pypdf
from langchain.document_loaders import PyPDFLoader


In [18]:
loader = PyPDFLoader('pd.pdf')
data = loader.load()

In [23]:
data[0]

Document(metadata={'source': 'pd.pdf', 'page': 0}, page_content=" AI Engineer Internship We are excited to announce a 3-month remote internship opportunity for an AI Engineer at Aspireit. This isn’t just any internship—it’s a unique chance to work directly alongside our founder on cutting-edge AI-driven products, placing you at the forefront of the industry’s shift towards AI innovation. During this internship, you will be a key member of our team, contributing to the development of three innovative AI products. You’ll gain invaluable experience by working closely with our AI engineers, integrating AI models on the backend, and bringing them to life on the front-end. This hands-on involvement will give you the practical exposure needed to build next-generation products and prototypes. Important Note: You are encouraged to use ChatGPT or any other LLM for code generation. As a forward-thinking company, we are focused on efficiency and innovation, and we'd like to assess how quickly you 

In [28]:
from langchain.document_loaders.csv_loader import CSVLoader

loader = CSVLoader("movies.csv", source_column='title')
data = loader.load()
len(data)

9

In [29]:
type(data[0])

langchain_core.documents.base.Document

In [31]:
data[1].metadata

{'source': 'Doctor Strange in the Multiverse of Madness', 'row': 1}

In [33]:
!pip3 install unstructured libmagic python-magic python-magic-bin

Collecting unstructured
  Using cached unstructured-0.15.14-py3-none-any.whl.metadata (29 kB)
Collecting libmagic
  Using cached libmagic-1.0.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting python-magic
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
[31mERROR: Could not find a version that satisfies the requirement python-magic-bin (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for python-magic-bin[0m[31m
[0m

In [34]:
from langchain.document_loaders import UnstructuredURLLoader

In [37]:
loader = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/adani-enterprises-stock-in-focus-as-company-launches-first-tranche-of-qip-to-raise-500-million-12839267.html",
    "https://www.moneycontrol.com/news/business/capex-for-airports-expressway-green-energy-projects-how-adani-enterprises-plans-to-use-its-qip-proceeds-12839253.html"
])

In [38]:
data = loader.load()
len(data)

2

In [39]:
data[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/adani-enterprises-stock-in-focus-as-company-launches-first-tranche-of-qip-to-raise-500-million-12839267.html'}, page_content="English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsAdani Enterprises stock in focus as company 

# Text Splitter

* CharacterTextSplitter / RecursiveTextSplitter
* Due to token size limit
* then merge for uniformity and data distribution in each chunk

In [41]:
text = """   
    New York Dolls is the debut album by the American hard rock band New York Dolls (pictured). It was released in 1973 by Mercury Records. 
    In the years leading up to the album, the Dolls had developed a local fanbase by playing regularly in lower Manhattan after forming in 1971. 
    Most music producers and record companies were reluctant to work with them because of their onstage cross-dressing and blatant vulgarity. 
    The album – a mix of carefree rock and roll, influences from Brill Building pop, and campy sensibilities – explores themes of urban youth, 
    teen alienation, adolescent romance, and authenticity, as rendered in lead singer David Johansen's colloquial and ambiguous lyrics.
    New York Dolls was met with widespread critical acclaim but sold poorly and polarized listeners. Despite its commercial failure,
     New York Dolls was an influential precursor to the 1970s punk rock movement. 
    It has been named in various publications as one of the best debut records in rock music
"""

In [43]:
text[:200]

'   \n    New York Dolls is the debut album by the American hard rock band New York Dolls (pictured). It was released in 1973 by Mercury Records. \n    In the years leading up to the album, the Dolls had'

In [46]:
chunks = []

s=""
for word in text:
    s += word+""
    if len(s) > 200:
        chunks.append(s)
        s=""

chunks.append(s)


In [48]:
chunks[0]

'   \n    New York Dolls is the debut album by the American hard rock band New York Dolls (pictured). It was released in 1973 by Mercury Records. \n    In the years leading up to the album, the Dolls had '

In [50]:
from langchain.text_splitter import CharacterTextSplitter

## split on each new line character
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=0
)

chunks = splitter.split_text(text)

In [51]:
len(chunks)

7

In [53]:
for chunk in chunks:
    print(len(chunk))

135
140
137
138
131
128
170


In [66]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=300,
    chunk_overlap=0
)

chunks = r_splitter.split_text(text)
len(chunks)

4

In [67]:
for chunk in chunks:
    print(len(chunk))

281
281
264
170


In [68]:
chunks=text.split("\n\n")

In [69]:
for chunk in chunks:
    print(len(chunk))

1023


# Vector DB

In [70]:
import pandas as pd

pd.set_option('display.max_colwidth', 100)

In [71]:
df = pd.read_csv("sample_text.csv")
df.shape

(8, 2)

In [72]:
df.head()

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event


In [73]:
# text column to embeddings.
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-mpnet-base-v2")
vectors = encoder.encode(df.text)
vectors.shape

(8, 768)

In [74]:
dim = vectors.shape[1]
dim

768

In [76]:
import faiss

index = faiss.IndexFlatL2(dim)
index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x31ec96090> >

In [77]:
index.add(vectors)

In [83]:
search_query = "I want to buy a polo t-shirt"
vec = encoder.encode(search_query)

In [87]:
svec = vec.reshape(1, -1)

In [94]:
distances, I = index.search(svec, 2)

In [95]:
df.loc[I[0]]

Unnamed: 0,text,category
3,Vibrant color jeans for male are becoming a trend,Fashion
2,These are the latest fashion trends for this week,Fashion


In [96]:
search_query

'I want to buy a polo t-shirt'

# RetrievalQA With Sources Chain

* Relevant chunks to create a LLM prompt.
* Give answer based on : chunk2 and chunk8 in one single prompt.(Stuff method)

**Stuff method**: simplest of all just append all required chunks and create a combined chunk which is then given with input query.

**Drawback:** What if the combined chunk exceeds the token limit of LLM?
if we get n relevant chunks which are filtered based on similarity index, 
we cant add all the chunks in one prompt as it will cross token limit size.

**Map Reduce Method** Each chunk is passed through individual llm's which inturn generates filtered content of each 
chunk by extracting only relevant part to answer the question(tries to reduce the chunk size).(n llm calls)
These filtered chunk are then combined to form summary chunk which is then prompted to final LLM to get answer.

In [137]:
import os 
from dotenv import load_dotenv
import streamlit as st
import dill as pickle
import time
import langchain
from langchain_community.llms import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [104]:
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('API_KEY')

In [105]:
# Initializing LLM with required params
llm = OpenAI(temperature=0.9, max_tokens=500)

loaders = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/adani-enterprises-stock-in-focus-as-company-launches-first-tranche-of-qip-to-raise-500-million-12839267.html",
    "https://www.moneycontrol.com/news/business/capex-for-airports-expressway-green-energy-projects-how-adani-enterprises-plans-to-use-its-qip-proceeds-12839253.html"
])

data = loaders.load()
len(data)

2

In [111]:
data[0].page_content

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/adani-enterprises-stock-in-focus-as-company-launches-first-tranche-of-qip-to-raise-500-million-12839267.html'}, page_content="English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsAdani Enterprises stock in focus as company 

In [121]:
# Divide into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

docs = splitter.split_documents(data) 
len(docs)

21

In [132]:
for doc in docs:
    print(len(doc.page_content))

973
954
948
683
847
993
990
989
989
219
971
664
969
842
742
962
924
989
919
998
867


In [146]:
embeddings = OpenAIEmbeddings()

vectorindex_openai=FAISS.from_documents(docs, embeddings)

In [142]:
# Stroing vector index created in local env
import dill as pickle  # Use dill instead of pickle for more flexibility
import ssl

# Function to handle pickling with exclusion of SSLContext or other non-picklable attributes
def clean_for_pickling(obj):
    """
    Remove non-picklable attributes from the object, such as SSLContext,
    threading locks, etc.
    """
    state = obj.__dict__.copy()

    # Remove the SSLContext if it exists
    for key, value in state.items():
        if isinstance(value, ssl.SSLContext):
            print(f"Excluding {key} (SSLContext) from being pickled.")
            state[key] = None  # Or set it to None, or reinitialize it later if needed

    # Add other exclusions as necessary
    # e.g., threading locks or other objects you can't pickle
    return state

# Assuming vectorindex_openai is a custom object
class CustomVectorIndex:
    def __init__(self):
        self.ssl_context = ssl.create_default_context()  # Example SSLContext attribute
        # Add other attributes as necessary

    def __getstate__(self):
        # Return a cleaned state without non-picklable attributes
        return clean_for_pickling(self)

    def __setstate__(self, state):
        # Restore the SSLContext or other non-picklable attributes
        self.__dict__.update(state)
        self.ssl_context = ssl.create_default_context()  # Re-create the SSLContext

# Create an instance of your class (for demonstration)
vectorindex_openai = CustomVectorIndex()

# Improved pickling process with error handling
file_path = "vector_index.pkl"
try:
    with open(file_path, "wb") as f:
        pickle.dump(vectorindex_openai, f)
    print(f"Object successfully pickled to {file_path}")
except (TypeError, pickle.PicklingError) as e:
    print(f"Error while pickling: {e}")


Excluding ssl_context (SSLContext) from being pickled.
Object successfully pickled to vector_index.pkl


In [143]:
## loading vector db
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)

In [147]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorindex_openai.as_retriever())
chain



In [148]:
query = "what is the total fundraising of adani group?"

langchain.debug = True

chain.invoke({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the total fundraising of adani group?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company's board had approved a total fundraising of Rs 16,600 crore or approximately $2 billion earlier this year in May. Note that while the QIP launched on October 9 has a base size of $500 million, Adani Enterprises could raise a larger amount using a green shoe option, sources told Moneycontrol, adding that any additional fundraising will depend on investor demand.\n\nStory continues below Advertisement\n\nRemove Ad\n\nThe offering already has a strong investor demand, 

{'answer': ' The total fundraising of Adani Group is approximately Rs 16,600 crore or $2 billion. However, the company plans to raise between Rs 30,000 and Rs 40,000 crore in total from retail investors over the next 3-4 years. \n',
 'sources': 'https://www.moneycontrol.com/news/business/markets/adani-enterprises-stock-in-focus-as-company-launches-first-tranche-of-qip-to-raise-500-million-12839267.html'}