In [17]:
# Import functions / modules
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from IPython.display import display, Markdown
from dotenv import load_dotenv
import os

In [18]:
# Load environment variables from a file called .env.
# override=True lets values from .env replace variables already set in the process.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Sanity-check the key.
# Surrounding whitespace is tested BEFORE the prefix: a key with a leading space or tab
# also fails startswith("sk-proj-"), and would otherwise be misreported as a wrong prefix.
if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

API key found and looks good so far!


In [None]:
# !pip install networkx

## Excel

In [11]:
# !pip install "unstructured[all-docs]"

# Create an UnstructuredExcelLoader for the reviews workbook.
# mode="elements" tells the loader to break the file into its individual components
# (such as tables, text, etc.) instead of returning it as one piece.
loader = UnstructuredExcelLoader("./resources/data/Reviews.xlsx", mode="elements")
docs = loader.load()

# Inspect the first loaded element (content plus metadata)
docs[0]

Document(metadata={'source': './resources/data/Reviews.xlsx', 'file_directory': './resources/data', 'filename': 'Reviews.xlsx', 'last_modified': '2025-08-08T13:41:19', 'page_name': 'Udemy_Reviews_Export_2024-08-22', 'page_number': 1, 'text_as_html': '<table><tr><td>Course Name</td><td>Student Name</td><td>Timestamp</td><td>Rating</td><td>Comment</td></tr><tr><td>Master Python for Data Analysis and Business Analytics 2024</td><td>Gaurav Mehra</td><td>2024-08-21 06:46:55+00:00</td><td>4</td><td/></tr><tr><td>Master Python for Data Analysis and Business Analytics 2024</td><td>Harigovind S</td><td>2024-08-21 04:35:13+00:00</td><td>5</td><td/></tr><tr><td>Data Literacy and Business Analytics for Business Leaders</td><td>Celine Jayme</td><td>2024-08-21 01:42:37+00:00</td><td>4</td><td/></tr><tr><td>Decision Making with Problem Solving &amp; Critical Thinking</td><td>Donovan Smith</td><td>2024-08-20 20:02:59+00:00</td><td>4</td><td/></tr><tr><td>Econometrics and Statistics for Business in R &

### Split the documents into chunks

In [13]:
# Cut the loaded documents into pieces of at most 2000 with 200 of overlap
# between neighbouring pieces, so text near a boundary appears in both.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

# Peek at the first five chunks only; the full list is long
chunks[:5]

[Document(metadata={'source': './resources/data/Reviews.xlsx', 'file_directory': './resources/data', 'filename': 'Reviews.xlsx', 'last_modified': '2025-08-08T13:41:19', 'page_name': 'Udemy_Reviews_Export_2024-08-22', 'page_number': 1, 'text_as_html': '<table><tr><td>Course Name</td><td>Student Name</td><td>Timestamp</td><td>Rating</td><td>Comment</td></tr><tr><td>Master Python for Data Analysis and Business Analytics 2024</td><td>Gaurav Mehra</td><td>2024-08-21 06:46:55+00:00</td><td>4</td><td/></tr><tr><td>Master Python for Data Analysis and Business Analytics 2024</td><td>Harigovind S</td><td>2024-08-21 04:35:13+00:00</td><td>5</td><td/></tr><tr><td>Data Literacy and Business Analytics for Business Leaders</td><td>Celine Jayme</td><td>2024-08-21 01:42:37+00:00</td><td>4</td><td/></tr><tr><td>Decision Making with Problem Solving &amp; Critical Thinking</td><td>Donovan Smith</td><td>2024-08-20 20:02:59+00:00</td><td>4</td><td/></tr><tr><td>Econometrics and Statistics for Business in R 

### Embeddings with OpenAI embedding model

In [19]:
# Embedding client: turns text into vectors using the OpenAI model named below.
embeddings = OpenAIEmbeddings(
    openai_api_key = api_key,
    model = "text-embedding-3-large"
)

# Create the FAISS index.
# FAISS (Facebook AI Similarity Search) is a library for efficient similarity search
# and clustering of dense vectors.
# 'from_documents' takes the split document chunks, computes their vector
# representations with the provided embedding model, and stores them in the index.
# The resulting 'db_faiss' object is what later cells query for relevant chunks.
db_faiss = FAISS.from_documents(chunks, embeddings)
db_faiss

<langchain_community.vectorstores.faiss.FAISS at 0x163c90150>

### Try the Retrieval System

In [20]:
query = "Give me my worst reviews"

# Retrieve the 5 chunks closest to the query, each paired with its score.
# NOTE: LangChain's FAISS store defaults to Euclidean (L2) distance, not cosine
# distance, so the score is a distance: LOWER means a closer match.
docs_faiss = db_faiss.similarity_search_with_score(query, k = 5)
docs_faiss

[(Document(id='c56e32a3-558c-477c-b45a-b34c8614d44c', metadata={'source': './resources/data/Reviews.xlsx', 'file_directory': './resources/data', 'filename': 'Reviews.xlsx', 'last_modified': '2025-08-08T13:41:19', 'page_name': 'Udemy_Reviews_Export_2024-08-22', 'page_number': 1, 'text_as_html': '<table><tr><td>Course Name</td><td>Student Name</td><td>Timestamp</td><td>Rating</td><td>Comment</td></tr><tr><td>Master Python for Data Analysis and Business Analytics 2024</td><td>Gaurav Mehra</td><td>2024-08-21 06:46:55+00:00</td><td>4</td><td/></tr><tr><td>Master Python for Data Analysis and Business Analytics 2024</td><td>Harigovind S</td><td>2024-08-21 04:35:13+00:00</td><td>5</td><td/></tr><tr><td>Data Literacy and Business Analytics for Business Leaders</td><td>Celine Jayme</td><td>2024-08-21 01:42:37+00:00</td><td>4</td><td/></tr><tr><td>Decision Making with Problem Solving &amp; Critical Thinking</td><td>Donovan Smith</td><td>2024-08-20 20:02:59+00:00</td><td>4</td><td/></tr><tr><td>Ec

In [21]:
# Build the context for the generation step: the text of every retrieved chunk,
# separated by blank lines.
# Each entry of docs_faiss is a (Document, score) pair; the score is unpacked into
# _score and deliberately ignored because only the chunk text goes into the prompt.
context_text = "\n\n".join(doc.page_content for doc, _score in docs_faiss)
context_text

"2021-12-20 08:30:29+00:00 5 Master Time Series Analysis and Forecasting with Python 2024 Leo TiÅ¡ljariÄ‡ 2021-12-18 16:56:12+00:00 5 Master Time Series Analysis and Forecasting with Python 2024 Nikhitha 2021-12-18 10:51:14+00:00 4.5 Econometrics and Statistics for Business in R & Python Konstantinos Tsiamasiotis 2021-12-18 04:59:38+00:00 5 Econometrics and Statistics for Business in R & Python Angeliki Kafritsa 2021-12-17 08:37:51+00:00 5 Econometrics and Statistics for Business in R & Python Rosie Liu 2021-12-16 22:21:46+00:00 4 Econometrics and Statistics for Business in R & Python Renuka Gusain 2021-12-16 11:10:09+00:00 4.5 Master Time Series Analysis and Forecasting with Python 2024 Yhasreen Abrahim 2021-12-16 10:33:42+00:00 3 content too simple and basic time series XGBoost for Business: Machine Learning Course in Python & R Jacob Lackey 2021-12-16 04:00:47+00:00 3 Too many spelling errors and grammatical mistakes on the powerpoint slides Econometrics and Statistics for Business 

In [22]:
# Create a simple prompt for a RAG system: the retrieved context first, then the
# user's question, then an instruction to admit when the answer is unknown.
prompt = f"""
based on this context {context_text}
please answer this question {query}
if you don't know the answer just say you don't know.
"""

In [23]:
# Create the chat model client through LangChain
model = ChatOpenAI(
    openai_api_key = api_key,
    model = "gpt-4o-mini",
    temperature = 0 # Temperature 0 keeps responses as repeatable as possible (not strictly guaranteed identical)
)
# Send the prompt (context + question) and keep the returned message object
response_text = model.invoke(prompt)

In [24]:
# Render the text of the model's reply (its .content) as Markdown
display(Markdown(response_text.content))

Based on the provided context, here are the worst reviews:

1. **Master Time Series Analysis and Forecasting with Python 2024** - Rating: 3
   - Reviewer: Namita Menon
   - Date: 2022-01-20 04:55:45+00:00

2. **content too simple and basic time series** - Rating: 3
   - Reviewer: Jacob Lackey
   - Date: 2021-12-16 10:33:42+00:00

3. **Too many spelling errors and grammatical mistakes on the powerpoint slides** - Rating: 3
   - Reviewer: Anonymized User
   - Date: 2021-12-16 04:00:47+00:00

4. **Forecasting Models & Time Series Analysis for Business in R** - Rating: 2.5
   - Reviewer: Thomas Tharakan
   - Date: 2022-01-03 09:26:53+00:00

These reviews reflect the lowest ratings given in the context provided.

### Preparing the unstructured data

In [25]:
def prepare_excel(file_path, chunk_size=2000, chunk_overlap=200,
                  embedding_model="text-embedding-3-large"):
    """Build a FAISS vector store from an Excel workbook.

    The workbook is loaded with UnstructuredExcelLoader in "elements" mode,
    split into overlapping chunks, embedded with OpenAI and indexed in FAISS.

    Args:
        file_path: Path to the Excel file.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        embedding_model: Name of the OpenAI embedding model to use.

    Returns:
        The FAISS vector store created from the chunks.

    Raises:
        ValueError: If no chunks could be produced from the file.

    Note:
        The OpenAI key is read from the notebook-level ``api_key`` variable,
        so the key-loading cell must have been run first.
    """
    # Load the workbook as individual elements
    loader = UnstructuredExcelLoader(file_path, mode = "elements")
    docs = loader.load()

    # Split the elements into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)

    # Fail with a clear message instead of an obscure error from index creation
    if not chunks:
        raise ValueError(f"No text chunks could be produced from {file_path!r}")

    # Prepare the embeddings
    embeddings = OpenAIEmbeddings(
        openai_api_key = api_key,
        model = embedding_model
    )

    # Embed every chunk and store the vectors in a FAISS index
    db_faiss = FAISS.from_documents(chunks, embeddings)

    return db_faiss

### Prepare a function to retrieve and generate (RAG)

In [26]:
def ask(df, query, k):
  """Answer ``query`` from the ``k`` chunks of ``df`` closest to it and show the reply.

  ``df`` is a vector store exposing ``similarity_search_with_score`` (the object
  returned by the ``prepare_*`` functions). The retrieved chunk texts are placed in
  a prompt together with the question, the prompt is sent to the chat model, and the
  reply is rendered as Markdown. The value of the ``display`` call is returned.
  """
  # Retrieval: each hit is a (Document, score) pair; only the text is used
  hits = df.similarity_search_with_score(query, k = k)
  chunk_texts = [document.page_content for document, _ in hits]
  context = "\n\n".join(chunk_texts)

  # Prompt: context, then the question, then the "say you don't know" instruction
  prompt = f"""
  based on this context {context}
  please answer this question {query}
  if the information is not in the context, say that you don't have information.
  """

  # Generation: temperature 0 is used for the chat model
  llm = ChatOpenAI(openai_api_key = api_key, model = "gpt-4o-mini", temperature = 0)
  reply = llm.invoke(prompt)

  # Render the reply text as Markdown
  rendered = Markdown(reply.content)
  return display(rendered)

In [27]:
# Build the vector store for the reviews workbook.
# Despite the df_ prefix, this holds the FAISS store returned by prepare_excel.
df_excel = prepare_excel("./resources/data/Reviews.xlsx")

In [28]:
# Define the query (this reassigns the `query` variable used in the earlier retrieval cells)
query = """
Analyse the comments for the Econometrics course and tell me the top 3 improvements
"""

In [29]:
# Ask the question, retrieving the 20 closest chunks as context
ask(df_excel, query, 20)

Based on the comments for the Econometrics course, the top three suggested improvements are:

1. **Clearer Communication of Assignment Deadlines**: Several users mentioned that clearer communication regarding assignment deadlines could enhance the overall learning experience.

2. **More In-Depth Exploration of Functions/Methods**: Some comments indicated that the course could benefit from a deeper exploration of the functions and methods used, as this would help students modify them for different needs more easily.

3. **Use of Real-Life Data and Case Studies**: There were suggestions to incorporate more real-life data and case studies, as well as to use "bad/dirty" data to teach students how to clean it effectively. This would provide a more practical learning experience.

These improvements could help enhance the course's effectiveness and applicability for students.

### Working with Word Documents

In [None]:
# Install libraries
# !pip install python-docx
# !pip install --upgrade nltk



### Adjust Chunk Size:

> For shorter or context-sensitive documents, reduce the chunk size to 300-400 characters.

### Different Embedding Models:

> Choose a smaller model like "text-embedding-3-small" for faster processing

In [31]:
# Import nltk library
import nltk
nltk.download('punkt')  # Download the 'punkt' tokenizer model for sentence and word tokenization (skipped by nltk when already up to date)

# Import the document loader for unstructured Word documents
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mdashikadnan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
# Load the Word document as individual elements (e.g., paragraphs, tables, etc.)
# NOTE: this reassigns `loader` and `docs`, which earlier held the Excel loader and documents.
loader = UnstructuredWordDocumentLoader(
    "./resources/data/Declaration of independence.docx",  # Path to the Word document
    mode="elements"  # Parse the document into its structural elements
)

# Use the loader to extract the document's content into a list of Document objects
docs = loader.load()

# Display the first 5 elements to inspect the structure and content
docs[:5]

[Document(metadata={'source': './resources/data/Declaration of independence.docx', 'category_depth': 0, 'file_directory': './resources/data', 'filename': 'Declaration of independence.docx', 'last_modified': '2025-08-09T01:11:44', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'UncategorizedText', 'element_id': 'bb562112a268f9b61e327bb112658e40'}, page_content='In Congress, July 4, 1776'),
 Document(metadata={'source': './resources/data/Declaration of independence.docx', 'category_depth': 0, 'file_directory': './resources/data', 'filename': 'Declaration of independence.docx', 'last_modified': '2025-08-09T01:11:44', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'NarrativeText', 'element_id': '37dfdfb7ee7a49a6bd0e5a59bf4d33a4'}, page_content="The unanimous Declaration of the thirteen united States of Americ

In [33]:
# Create a function to prepare the word document
def prepare_word(file_path, chunk_size=500, chunk_overlap=50,
                 embedding_model="text-embedding-3-large"):
    """Build a FAISS vector store from a Word document.

    The document is loaded with UnstructuredWordDocumentLoader in "elements"
    mode, split into overlapping chunks, embedded with OpenAI and indexed in
    FAISS.

    Args:
        file_path: Path to the Word document.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        embedding_model: Name of the OpenAI embedding model to use.

    Returns:
        The FAISS vector store created from the chunks.

    Raises:
        ValueError: If no chunks could be produced from the file.

    Note:
        The OpenAI key is read from the notebook-level ``api_key`` variable,
        so the key-loading cell must have been run first.
    """
    # Load the document as individual elements
    loader = UnstructuredWordDocumentLoader(
        file_path,
        mode = "elements"
    )
    docs = loader.load()

    # Split the elements into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)

    # Fail with a clear message instead of an obscure error from index creation
    if not chunks:
        raise ValueError(f"No text chunks could be produced from {file_path!r}")

    # Create embeddings using OpenAI
    embeddings = OpenAIEmbeddings(
        openai_api_key = api_key,
        model = embedding_model
    )

    # Embed every chunk and store the vectors in a FAISS index
    df_faiss = FAISS.from_documents(chunks, embeddings)

    # Return the FAISS index for further use
    return df_faiss

In [34]:
# Define a couple of queries:
# query1 asks about the document itself; query2 is unrelated to it, presumably to
# check that the system says it has no information rather than inventing an answer.
query1 = "Tell me about the document by giving the 3 main points"
query2 = "Get me the best chocolate cake recipe"

In [35]:
# Build the vector store for the Word document, then answer query1
# using the 5 closest chunks as context
db_doc = prepare_word("./resources/data/Declaration of independence.docx")
ask(db_doc, query1, k = 5)

The document you provided appears to be an excerpt from the Declaration of Independence of the United States. Here are three main points derived from the context:

1. **Right to Alter or Abolish Government**: The document asserts that when a government becomes destructive to the rights of the people, it is their right and duty to alter or abolish it and establish a new government that ensures their safety and happiness.

2. **Unalienable Rights**: It emphasizes that all men are created equal and are endowed with certain unalienable rights, including life, liberty, and the pursuit of happiness. Governments are established to secure these rights, deriving their powers from the consent of the governed.

3. **Justification for Separation**: The document states that when it becomes necessary for one people to dissolve the political connections with another, they must declare the reasons for their separation, demonstrating respect for the opinions of mankind and the principles of natural law.

If you need more specific information or details, please let me know!

In [36]:
# Test with query 2 (the unrelated question) against the same Word document store
ask(db_doc, query2, k = 5)

I don't have information on chocolate cake recipes based on the provided context.

### Powerpoint

* chunk_size = 200: PowerPoint slides contain short text segments, so a 200-character chunk keeps each chunk focused on a single idea.

* chunk_overlap = 20: a small overlap preserves context between neighbouring chunks, which matters when an idea is spread across slides.

* Adjusting chunk size: for decks with more detailed text, a chunk size of 300 or more may work better. The `prepare_ppt` function below uses chunk_size = 500 and chunk_overlap = 50, i.e. the larger setting.

In [37]:
# Install a library
# !pip install python-pptx

In [38]:
# Import the PPT class
from langchain_community.document_loaders import UnstructuredPowerPointLoader

In [39]:
# Build a function to prepare powerpoint
def prepare_ppt(file_path, chunk_size=500, chunk_overlap=50,
                embedding_model="text-embedding-3-large"):
    """Build a FAISS vector store from a PowerPoint presentation.

    The deck is loaded with UnstructuredPowerPointLoader in "elements" mode,
    split into overlapping chunks, embedded with OpenAI and indexed in FAISS.

    Args:
        file_path: Path to the PowerPoint file.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
            The default (500) is the value this notebook has always used; the
            notes in this section suggest 200 for decks with short text
            segments.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. The default
            (50) is the value this notebook has always used; the notes in
            this section suggest 20 alongside a 200 chunk size.
        embedding_model: Name of the OpenAI embedding model to use.

    Returns:
        The FAISS vector store created from the chunks.

    Raises:
        ValueError: If no chunks could be produced from the file.

    Note:
        The OpenAI key is read from the notebook-level ``api_key`` variable,
        so the key-loading cell must have been run first.
    """
    # Load the presentation as individual elements
    loader = UnstructuredPowerPointLoader(file_path, mode = "elements")
    docs = loader.load()

    # Split the elements into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)

    # Fail with a clear message instead of an obscure error from index creation
    if not chunks:
        raise ValueError(f"No text chunks could be produced from {file_path!r}")

    # Create embeddings using OpenAI
    embeddings = OpenAIEmbeddings(
        openai_api_key = api_key,
        model = embedding_model
    )

    # Embed every chunk and store the vectors in a FAISS index
    df_faiss = FAISS.from_documents(chunks, embeddings)

    # Return the FAISS index for further use
    return df_faiss

In [40]:
# Build the vector store for the pitch-deck presentation
db_pp = prepare_ppt("./resources/data/Bitte pitch deck EN.pptx")

In [41]:
# Define a couple of queries to test the presentation.
# These reassign query1/query2 from the Word section, so run the cells in order.
query1 = "What is Bitte's competitive advantage?"
# NOTE(review): "indepence" looks like a typo for "independence" - confirm before changing,
# since the text is sent to the model as-is.
query2 = "What is indepence"

In [42]:
# Ask the first question against the presentation store, using the 5 closest chunks
ask(db_pp, query1, k = 5)

Bitte's competitive advantage lies in its ability to provide a richer and more interactive dining experience through its digital menu. This includes features such as presenting not only photographs and nutritional information but also offering suggestions for dish combinations, which can promote increased sales. Additionally, Bitte offers customizable layouts that align with the restaurant's brand image, enhancing the overall customer experience. However, specific details about payment commissions or transaction advantages are not provided in the context.

In [43]:
# Ask the second question against the presentation store (k = 5, passed positionally)
ask(db_pp, query2, 5)

The context provided does not contain information about "independence." Therefore, I don't have information on that topic.

## EPUB

EPUB files contain text, metadata, and images. We'll show how to load, chunk, and embed them for easy retrieval.

```
Document Loading -> Text Splitting -> Embedding -> Vector Stores
```

`pypandoc` (a Python wrapper around pandoc) performs the EPUB conversion, so a pandoc installation must be available.

In [None]:
# Install all the required libraries
# !pip install pypandoc pandoc

In [44]:
# Import the libraries
from langchain_community.document_loaders import UnstructuredEPubLoader
import pypandoc

# Pandoc is required by pypandoc to perform file format conversions (such as EPUB to text).
# Download and install it only when pypandoc cannot find an existing pandoc, so that
# re-running this cell does not fetch and unpack the installer every time.
try:
    pypandoc.get_pandoc_version()  # raises OSError when no pandoc is available
except OSError:
    pypandoc.download_pandoc()

x .
x ./usr
x ./usr/local
x ./usr/local/bin
x ./usr/local/bin/pandoc
x ./usr/local/bin/._pandoc
x ./usr/local/bin/pandoc-lua
x ./usr/local/bin/pandoc-server
x ./usr/local/._bin
x ./usr/local/share
x ./usr/local/share/man
x ./usr/local/share/man/man1
x ./usr/local/share/man/man1/pandoc-lua.1
x ./usr/local/share/man/man1/._pandoc-lua.1
x ./usr/local/share/man/man1/pandoc-server.1
x ./usr/local/share/man/man1/._pandoc-server.1
x ./usr/local/share/man/man1/pandoc.1
x ./usr/local/share/man/man1/._pandoc.1
x ./usr/local/share/man/._man1
x ./usr/local/share/._man
x ./usr/local/._share
x ./usr/._local
x ./._usr


In [45]:
# Build function prepared epub docs
def prepare_epub(file_path, chunk_size=200, chunk_overlap=20,
                 embedding_model="text-embedding-3-large"):
    """Build a FAISS vector store from an EPUB file.

    The book is loaded with UnstructuredEPubLoader in "elements" mode, split
    into overlapping chunks, embedded with OpenAI and indexed in FAISS.

    Args:
        file_path: Path to the EPUB file.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        embedding_model: Name of the OpenAI embedding model to use.

    Returns:
        The FAISS vector store created from the chunks.

    Raises:
        ValueError: If no chunks could be produced from the file.

    Note:
        The OpenAI key is read from the notebook-level ``api_key`` variable,
        so the key-loading cell must have been run first.
    """
    # Load the book as individual elements
    loader = UnstructuredEPubLoader(file_path, mode = "elements")
    docs = loader.load()

    # Split the elements into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)

    # Fail with a clear message instead of an obscure error from index creation
    if not chunks:
        raise ValueError(f"No text chunks could be produced from {file_path!r}")

    # Create embeddings using OpenAI
    embeddings = OpenAIEmbeddings(
        openai_api_key = api_key,
        model = embedding_model
    )

    # Embed every chunk and store the vectors in a FAISS index
    db_faiss = FAISS.from_documents(chunks, embeddings)

    return db_faiss

In [46]:
# Build the vector store for the EPUB.
# The file name contains a typographic apostrophe (’), not a plain ' - keep it exact.
db_epub = prepare_epub("./resources/data/Alice’s Adventures in Wonderland.epub")

In [47]:
# Prepare a couple of queries (again reassigning query1/query2 from the previous section).
# query2 mixes in a subject from the pitch deck, presumably to check that the
# EPUB store does not produce an answer for it.
query1 = "What is the main point of the story?"
query2 = "Does Alice like Bitte's digital menus?"

In [48]:
# Answer query 1 from the EPUB store, using the 5 closest chunks as context
ask(db_epub, query1, 5)

The main point of the story, as suggested by the Duchess, is that everything has a moral, and in this case, the moral presented is that "’tis love, ’tis love, that makes the world go round." This implies that love is a fundamental and driving force in life.

In [None]:
# Answer query 2 from the EPUB store, using the 5 closest chunks as context
ask(db_epub, query2, 5)