In [2]:
#Convert doc to markdown
from docx import Document

def docx_to_markdown(docx_path, markdown_path):
    """Convert the paragraphs of a .docx file into a simple Markdown file.

    Each paragraph is mapped by its (lower-cased) style name:

    * a style containing "heading N" becomes a Markdown heading of level N
      (clamped to the 1-6 range Markdown supports),
    * a style containing "list" becomes a "- " bullet item,
    * anything else is written as plain text,
    * empty paragraphs are kept as empty entries.

    Entries are joined with a blank line between them and written to
    ``markdown_path`` as UTF-8. Only ``doc.paragraphs`` is read, so content
    held elsewhere in the document (e.g. tables) is not converted.

    Args:
        docx_path: Path of the Word document to read.
        markdown_path: Path of the Markdown file to create or overwrite.
    """
    # Local import so this cell does not depend on a later cell's `import re`.
    import re

    heading_pattern = re.compile(r"heading (\d+)")

    doc = Document(docx_path)
    md_lines = []

    for para in doc.paragraphs:
        text = para.text.strip()

        if not text:
            md_lines.append("")  # blank line
            continue

        # Detect headers by style
        style = para.style.name.lower()
        heading = heading_pattern.search(style)
        if heading:
            # Previously only levels 1-3 were recognised and deeper headings
            # were silently written as body text.
            level = max(1, min(int(heading.group(1)), 6))
            md_lines.append(f"{'#' * level} {text}")
        elif "list" in style:
            md_lines.append(f"- {text}")
        else:
            md_lines.append(text)

    # Save as .md file
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(md_lines))

    print(f"✅ Converted {docx_path} → {markdown_path}")

# Example usage: convert the cleaned Word document into output.md, which the
# chapter-heading cleanup cell reads next.
docx_to_markdown("cleaned_output.docx", "output.md")


✅ Converted cleaned_output.docx → output.md


In [16]:
import re

# Load the Markdown content
with open("output.md", "r", encoding="utf-8") as file:
    content = file.read()

# Remove every line that starts with "Chapter <number>:" (case-insensitive),
# together with its line ending. The colon is required.
# Only horizontal whitespace ([ \t]) is allowed inside the heading: \s also
# matches newlines, so with \s a bare "Chapter 3:" line let the match continue
# onto the following lines and delete the next paragraph as well.
cleaned_content = re.sub(r'(?im)^chapter[ \t]+\d+[ \t]*:[ \t]*.*(?:\r?\n)?', '', content)

# Save the cleaned content
with open("output_cleaned.md", "w", encoding="utf-8") as file:
    file.write(cleaned_content)

print("All chapter headings removed successfully.")


All chapter headings removed successfully.


In [17]:
# Load the Markdown text (the chunking helper below is commented out)
def load_markdown(file_path):
    """Return the full contents of the UTF-8 text file at `file_path` as a string."""
    with open(file_path, mode="r", encoding="utf-8") as md_file:
        contents = md_file.read()
    return contents

# def split_markdown(text, chunk_size=300, overlap=50):
#     chunks = []
#     words = text.split()
#     for i in range(0, len(words), chunk_size - overlap):
#         chunk = " ".join(words[i:i + chunk_size])
#         chunks.append(chunk)
#     return chunks

# md_text = load_markdown("output.md")
# chunks = split_markdown(md_text)


In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import markdown2
from bs4 import BeautifulSoup

# Load and clean markdown
# Reads the file produced by the chapter-heading cleanup cell.
with open("output_cleaned.md", "r", encoding="utf-8") as f:
    markdown_text = f.read()

# Render the Markdown to HTML, then keep only the text content of that HTML,
# so Markdown markup (e.g. "#", "-") does not end up inside the chunks.
html = markdown2.markdown(markdown_text)
plain_text = BeautifulSoup(html, "html.parser").get_text()

# Chunk the plain text
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm. The output saved with this
# cell shows chunks far longer than 200 characters, so it may come from a run
# with different settings; re-run to verify.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50
)
# `chunks` is a notebook-level name: the embedding/indexing cell reads it.
chunks = text_splitter.split_text(plain_text)

# Print a few chunks
# Only the first five are shown, numbered from 1.
for i, chunk in enumerate(chunks[:5]):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")



--- Chunk 1 ---
BOOK #1
AI Shaping Tomorrow's World
An AI-Driven World
By
Charles Antony
Table of Contents
The Dawn of the AI Era

--- Chunk 2 ---
Table of Contents
The Dawn of the AI Era
The morning sun rises over Silicon Valley, casting long shadows across gleaming corporate campuses where some of the world's most advanced artificial intelligence systems are being developed. Inside these buildings, teams of researchers and engineers are pushing the boundaries of what machines can do, creating systems that can see, hear, speak, and reason in ways that would have seemed impossible just a decade ago.

--- Chunk 3 ---
This scene represents more than just technological progress, it symbolizes a fundamental shift in human history. We are witnessing the emergence of a new era, one in which artificial intelligence is reshaping the very fabric of our society. Unlike previous technological revolutions that primarily transformed physical labor, the AI revolution is unprecedented in its ability

In [20]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load the model and tokenizer
# Both are fetched by the same model name so the tokenizer matches the model.
# `tokenizer` and `model` are notebook-level names used by get_embedding().
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Switch to evaluation mode; the model is only used for inference here.
model.eval()

# Embedding function
def get_embedding(text):
    """Embed `text` using the notebook-level `tokenizer` and `model`.

    The text is tokenized (with truncation and padding), passed through the
    model without gradient tracking, and the last-hidden-state token vectors
    are averaged with the attention mask as weights, so masked positions do
    not contribute to the mean.

    Returns:
        The pooled embedding as a NumPy array with size-1 dimensions squeezed out.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        model_output = model(**encoded)

    hidden_states = model_output.last_hidden_state
    # Broadcast the per-token mask across the hidden dimension.
    mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size())
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the token count so the division below can never be by zero.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    pooled = masked_sum / token_count
    return pooled.squeeze().numpy()


In [21]:
#Creating indexes
import faiss
import numpy as np
import pickle

# Step 1: Create embeddings for all chunks
# get_embedding() is called once per chunk; the results are then stacked into
# one float32 array (assumes every call returns a 1-D vector of the same
# length — TODO confirm).
embeddings = [get_embedding(chunk) for chunk in chunks]
embeddings = np.array(embeddings).astype("float32")

# Step 2: Create FAISS index
# The index dimension is the embedding width (second axis of the array).
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Add embeddings to the index
# Vectors are added in chunk order, so position i in the index maps to chunks[i].
index.add(embeddings)

# Step 4: Save the index and chunks
# The chunk texts are saved next to the index so search hits can be mapped
# back to text after a kernel restart.
faiss.write_index(index, "chunk_index.faiss")
with open("chunk_texts.pkl", "wb") as f:
    pickle.dump(chunks, f)


In [24]:
# Load index and chunks
import os
# Set before faiss is imported below.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several libraries each bundle OpenMP — confirm it
# is still required in this environment.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import faiss
import pickle

# Read back the index file written by the indexing cell.
index = faiss.read_index("chunk_index.faiss")

# pickle.load can execute arbitrary code from the file; only load a
# chunk_texts.pkl that this notebook produced itself.
with open("chunk_texts.pkl", "rb") as f:
    chunks = pickle.load(f)

# Re-define embedding function (in case notebook restarted)
# This repeats the earlier model-loading cell so the query below can run on a
# fresh kernel. The model name must stay identical to the one used when the
# index was built, otherwise query vectors and indexed vectors are not comparable.
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Switch to evaluation mode; the model is only used for inference here.
model.eval()

def get_embedding(text):
    """Embed `text` using the notebook-level `tokenizer` and `model`.

    Tokenizes the text (with truncation and padding), runs the model without
    gradient tracking, and mean-pools the last hidden state over the tokens
    selected by the attention mask.

    Returns:
        The pooled embedding as a NumPy array with size-1 dimensions squeezed out.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        model_output = model(**encoded)
    hidden_states = model_output.last_hidden_state
    # Broadcast the per-token mask across the hidden dimension.
    mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size())
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the token count so the division below can never be by zero.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    pooled = masked_sum / token_count
    return pooled.squeeze().numpy()

# Query
# The query is embedded with the same get_embedding() used for the chunks and
# reshaped to a single-row float32 matrix, the input shape index.search() is
# given here.
query = "What critical factors can determine a startup's sucess"
query_vector = get_embedding(query).astype("float32").reshape(1, -1)

# Search top 3 similar
k = 3
distances, indices = index.search(query_vector, k)

# Show results
print("Nearest indices:", indices)
print("Similar Chunks:")
for i in indices[0]:
    # FAISS pads the result with label -1 when the index holds fewer than k
    # vectors; chunks[-1] would silently print the last chunk as a "match".
    if i < 0:
        continue
    print("-", chunks[i])


Nearest indices: [[190 187 185]]
Similar Chunks:
- Essential Considerations
Startups require a delicate balance to succeed.
Financial Planning: Managing cash flow is critical. Projections, runway (available cash to operate), and budgeting are key components.
Capital Strategies: Raising and deploying capital effectively is crucial.
- Critical Success Factors:
Startup success isn't accidental; it's built on key elements.
Market Traction: This is the evidence that your idea resonates with the market—demonstrated by data, not just hope. Traction can be shown through paying customers, user growth, or engagement metrics. Key metrics include Customer Acquisition Cost (CAC) and Lifetime Value (LTV).
- This entrepreneurial spirit is about identifying and addressing gaps—whether it's inefficient services or outdated systems. Recognizing this foundational principle is essential for every step you take.
Key Characteristics of a Startup:
