In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

**Document Used for the Notebook**

This notebook uses the blog article [The Illustrated Transformer](http://jalammar.github.io/illustrated-transformer/) by Jay Alammar, a popular, visually rich guide that explains the core concepts behind the Transformer architecture, which powers most modern NLP models.

The article covers key ideas such as:

* Self-attention mechanisms
* Encoder-decoder structures
* Positional encoding
* The evolution of Transformers in deep learning


###### Installing and Importing Libraries

In [None]:
!pip install langchain-community langchain

In [None]:
!pip install -U langchain langchain-openai langchain-community


In [None]:
!pip install -U langchain langchain-community huggingface_hub


In [None]:
from langchain_community.llms import HuggingFaceHub


In [None]:
!pip install transformers faiss-cpu sentence-transformers

In [None]:
# Write the currently installed package versions to requirement.txt.
# NOTE(review): this runs before the later cell that reinstalls langchain<0.2.0,
# so the snapshot will not match the final environment.
!pip freeze > requirement.txt

In [None]:
!pip install -U langchain langchain-community beautifulsoup4 requests


In [None]:
from langchain_community.document_loaders import WebBaseLoader


In [None]:
!pip -q uninstall -y langchain langchain-community langchain-openai
!pip -q install "langchain<0.2.0" beautifulsoup4 requests


Step 1: Loading

Specify a DocumentLoader to load in unstructured data as Documents.


A WebBaseLoader is used to load all text from HTML webpages into a document format that we can use for NLP tasks

Loading the Web Document with LangChain

We use LangChain’s WebBaseLoader to load the blog content directly from the URL. This:

i. Fetches and parses the webpage

ii. Strips out HTML tags

iii. Returns clean, readable text stored in the data variable

This allows us to work with real-world web content without manual preprocessing.




In [None]:
from langchain.document_loaders import WebBaseLoader

# Source article to load; the loader returns its content in `data`.
ARTICLE_URL = "http://jalammar.github.io/illustrated-transformer/"

loader = WebBaseLoader(ARTICLE_URL)
data = loader.load()

Step 2: Splitting

Split the Document into chunks for embedding and vector storage.



*   Vector Store: One of the most common ways to store and search over unstructured data is to embed it and store the resulting embedding vectors, and then at query time to embed the unstructured query and retrieve the embedding vectors that are 'most similar' to the embedded query.
*   Text Embedding:  It is the process of converting text into a numerical representation, typically a vector (a list of numbers). Each word or subword in the text is mapped to a vector in such a way that similar words or phrases have similar vector representations.



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Each chunk holds up to 500 characters; no overlap is introduced between chunks.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
all_splits = text_splitter.split_documents(data)


Step 3: Storing

Embed the contents of each document, then store the embedding and the document in a vector store; the embedding is used to index the document.

In [None]:
# Import FAISS from Langchain Vectorstore
from langchain.vectorstores import FAISS

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Sentence-embedding model used to turn text chunks into vectors.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Run the embedding model on CPU.
model_kwargs = dict(device='cpu')
# Leave the embeddings un-normalized.
encode_kwargs = dict(normalize_embeddings=False)

hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
 # Creating a vector store
# Embed every chunk with the Hugging Face embeddings (`hf`) and store them in FAISS.
vectorstore = FAISS.from_documents(all_splits, hf)

Step 4: Retrieval

In [None]:
# Look up the chunks whose embeddings are most similar to the embedded query,
# then display them as the cell output.
question = "What are transformers?"
docs = vectorstore.similarity_search(query=question)
docs

Step 5: Generation

In [None]:
# Question passed to the QA chain in the generation step below.
question = "What are transformers?"

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Checkpoint used for answer generation; tokenizer and weights share the same name.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Text-to-text generation pipeline built from the loaded model and tokenizer.
hf_pipeline = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

# Expose the pipeline through LangChain's LLM wrapper so chains can call it.
llm = HuggingFacePipeline(pipeline=hf_pipeline)

In [None]:
from langchain.chains import RetrievalQA

# Retrieval QA chain: the FAISS retriever supplies chunks and `llm` generates
# the answer.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())

# Calling the chain object directly is deprecated since LangChain 0.1;
# invoke() is the supported entry point and returns the same result dict.
qa_chain.invoke({"query": question})

In [None]:
# Ask a second question through the same QA chain.
question = " What is attention mechanism?"
# invoke() replaces the deprecated direct call on the chain object.
qa_chain.invoke({"query": question})

Step 6: Chat

Conversation Summary Memory summarizes the conversation as it happens and stores the current summary in memory. This memory can then be used to inject the summary of the conversation so far into a prompt/chain. This memory is most useful for longer conversations, where keeping the past message history in the prompt verbatim would take up too many tokens.



In [None]:
from langchain.memory import ConversationSummaryMemory


In [None]:
# Summary memory: `llm` is passed in as the summarizer, and the running summary
# is exposed under the "chat_history" key as message objects.
memory = ConversationSummaryMemory(llm=llm, memory_key="chat_history", return_messages=True)

Conversational Retrieval Chain

This is a type of chain for having a conversation based on retrieved documents. This chain takes in chat history (a list of messages) and new questions, and then returns an answer to that question. The algorithm for this chain consists of three parts:

Use the chat history and the new question to create a “standalone question”. This is done so that this question can be passed into the retrieval step to fetch relevant documents. If only the new question was passed in, then the relevant context may be lacking. If the whole conversation was passed into retrieval, there may be unnecessary information there that would distract from retrieval.

This new standalone question is passed to the retriever, and relevant documents are returned.

The retrieved documents are passed to an LLM along with either the new question (default behavior) or the original question and chat history to generate a final response.

In [None]:
from langchain.chains import ConversationalRetrievalChain

# Conversational chain combining the LLM, the FAISS retriever, and the summary
# memory; verbose=True is passed through to the chain.
retriever = vectorstore.as_retriever()
chat = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory, verbose=True)

In [None]:
# invoke() replaces the deprecated direct call; the chain's input key is "question".
chat.invoke({"question": "Explain self-attention"})


In [None]:
# invoke() replaces the deprecated direct call; the chain's input key is "question".
chat.invoke({"question": "What is a gentler approach to transformers?"})

In [None]:
# invoke() replaces the deprecated direct call; the chain's input key is "question".
chat.invoke({"question": "Where were transformers proposed?"})


In [None]:
# invoke() replaces the deprecated direct call; the chain's input key is "question".
chat.invoke({"question": "What are the different layers in a typical Transformer model?"})


In [None]:
# invoke() replaces the deprecated direct call; the chain's input key is "question".
chat.invoke({"question": "If the vocabulary is 10,000 words, what would the width of the logits vector?"})


In [None]:
# invoke() replaces the deprecated direct call; the chain's input key is "question".
chat.invoke({"question": "Explain the training process of a Transformer network in detail"})


### Exploratory Data Analysis

In [None]:
# 1. Dataset Summary
def _word_count(document):
    """Return the number of whitespace-separated words in a document's page_content."""
    return len(document.page_content.split())


print(f"Total documents: {len(data)}")

# One word count per loaded document.
document_lengths = [_word_count(document) for document in data]

# Mean, minimum and maximum word counts across the loaded documents.
print(f"Average document length: {sum(document_lengths)/len(document_lengths)} words")
print(f"Minimum document length: {min(document_lengths)} words")
print(f"Maximum document length: {max(document_lengths)} words")

In [None]:
# 2. Text Sample Inspection
sample_size = 5
print(f"Displaying {sample_size} sample documents:")

# Show at most `sample_size` documents, truncating each to its first 500 characters.
for i, document in enumerate(data[:sample_size]):
    print(f"\nSample {i+1}:")
    print(document.page_content[:500])

In [None]:
# 3. Tokenization
from transformers import AutoTokenizer

# Tokenizer for the FLAN-T5 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")


def _token_count(document):
    """Return how many tokens the tokenizer produces for a document's page_content."""
    return len(tokenizer.tokenize(document.page_content))


# Token count for every loaded document.
tokenized_lengths = [_token_count(document) for document in data]
print(f"Average token length: {sum(tokenized_lengths)/len(tokenized_lengths)} tokens")
print(f"Minimum token length: {min(tokenized_lengths)} tokens")
print(f"Maximum token length: {max(tokenized_lengths)} tokens")


In [None]:
# 4. Document Distribution Analysis
import matplotlib.pyplot as plt

# Side-by-side histograms: word counts on the left, token counts on the right.
fig, (ax_words, ax_tokens) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: distribution of document lengths in words.
ax_words.hist(document_lengths, bins=20, color='skyblue', edgecolor='black')
ax_words.set_title("Document Length Distribution (in Words)")
ax_words.set_xlabel("Word Count")
ax_words.set_ylabel("Frequency")

# Right panel: distribution of document lengths in tokens.
ax_tokens.hist(tokenized_lengths, bins=20, color='lightgreen', edgecolor='black')
ax_tokens.set_title("Token Length Distribution")
ax_tokens.set_xlabel("Token Count")
ax_tokens.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

In [None]:
!pip -q install nbconvert
!jupyter nbconvert --ClearOutputPreprocessor.enabled=True --LangChain Document Q&A.ipynb

In [35]:
# List the current working directory (including hidden entries) with sizes.
!ls -lah


total 36K
drwxr-xr-x 1 root root 4.0K Jan 24 12:11 .
drwxr-xr-x 1 root root 4.0K Jan 24 10:40 ..
drwxr-xr-x 4 root root 4.0K Jan 16 14:24 .config
drwx------ 5 root root 4.0K Jan 24 12:11 drive
-rw-r--r-- 1 root root  13K Jan 24 10:47 requirement.txt
drwxr-xr-x 1 root root 4.0K Jan 16 14:24 sample_data


In [36]:
!find /content -name "*.ipynb" -maxdepth 5


/content/drive/MyDrive/Colab Notebooks/LangChain Document Q&A.ipynb
/content/drive/MyDrive/Colab Notebooks/PythonForDataScience_intro-1.ipynb
/content/drive/MyDrive/Colab Notebooks/Hands_on_Notebook_NumPy_v1.ipynb
/content/drive/MyDrive/Colab Notebooks/Session_Notebook_MovieLens_Case_Study+%283%29 (1).ipynb
/content/drive/MyDrive/Colab Notebooks/Solution_Notebook_Cred_Pay_Case_Study+%282%29.ipynb
/content/drive/MyDrive/Colab Notebooks/PythonVisualization (1).ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled0.ipynb
/content/drive/MyDrive/Colab Notebooks/Session_Notebook_MovieLens_Case_Study+%283%29.ipynb
/content/drive/MyDrive/Colab Notebooks/PythonVisualization.ipynb
/content/drive/MyDrive/Colab Notebooks/Uber_Case_Study_%281%29_%281%29 (2).ipynb
/content/drive/MyDrive/Colab Notebooks/Uber_Case_Study_%281%29_%281%29 (1).ipynb
/content/drive/MyDrive/Colab Notebooks/Uber_Case_Study_%281%29_%281%29.ipynb
/content/drive/MyDrive/Colab Notebooks/Copy of PYF_Project_LearnerNotebook_LowCod

In [37]:
# Mount Google Drive again; when it is already mounted, only a notice is printed.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
