In [26]:
from dotenv import load_dotenv
import os
load_dotenv()

GOOGLE_API_KEY =os.getenv("GOOGLE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [27]:
# youtube data

import pandas as pd
from langchain_core.documents import Document

# --- Load and Prepare Data ---
# Load the CSV
df = pd.read_csv('youtube_data.csv')

# 1. Replace NaN with an empty string so the code doesn't break
df['Summary'] = df['Summary'].fillna("")

# 2. Define a function to normalize Unicode (fixes \u202f, curly quotes, etc.)
import unicodedata
def clean_text(text):
    # Ensure it's a string before normalizing
    return unicodedata.normalize('NFKC', str(text))

# 3. Apply the cleaning function to the entire 'Summary' column
df['Summary'] = df['Summary'].apply(clean_text)
# 4. Apply to Video Title
df['Video Title'] = df['Video Title'].apply(clean_text)


youtube_documents = []
for index, row in df.iterrows():
    # Create a LangChain Document
    # page_content: This is what gets embedded (The Summary)
    # metadata: This is the extra data you want to retrieve (Title, URL, etc.)
    if not row['Summary'].strip():
        continue

    doc = Document(
        page_content=row['Summary'], 
        metadata={
            "video_title": row['Video Title'], # Now using the cleaned title
            "video_url": row['Video URL'],
        }
    )
    youtube_documents.append(doc)

print(f"Created {len(youtube_documents)} documents.")

Created 547 documents.


In [28]:
# annual report data

import glob
import re
import os
from langchain_core.documents import Document

# 1. Define the directory path
# (Using raw string r'' to handle backslashes correctly on Windows)
folder_path = r"annual_report_data"
file_pattern = os.path.join(folder_path, "page_*.md")

# 2. Get the list of files
file_list = glob.glob(file_pattern)

# Optional: Sort files numerically by page number (1, 2, 10 instead of 1, 10, 2)
# We extract numbers using regex for sorting
file_list.sort(key=lambda f: int(re.search(r'page_(\d+)', f).group(1)))

annual_rep_documents = []

for file_path in file_list:
    try:
        # 3. Extract Page Number from filename
        # This regex looks for 'page_' followed by digits (\d+)
        match = re.search(r'page_(\d+)\.md', file_path)
        
        if match:
            page_num = int(match.group(1))
        else:
            page_num = 0 # Fallback if filename pattern doesn't match
            
        # 4. Read the file content
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # 5. Create the Document
        doc = Document(
            page_content=content,
            metadata={
                "report_title": "DILMAH CEYLON TEA COMPANY PLC ANNUAL REPORT 2024/25",
                "page_number": page_num,
            }
        )
        annual_rep_documents.append(doc)

    except Exception as e:
        print(f"Error reading {file_path}: {e}")

print(f"Created {len(annual_rep_documents)} annual report documents.")

Created 252 annual report documents.


In [29]:
## web site

import glob
import os
import re
from langchain_core.documents import Document

# 1. Define the root folder
folder_path = "website_data"

# 2. Find all .md files recursively (in all subfolders)
# "**/*.md" combined with recursive=True searches every nested folder
file_list = glob.glob(os.path.join(folder_path, "**/*.md"), recursive=True)

website_documents = []

for file_path in file_list:
    try:
        # --- Step A: Read the file ---
        with open(file_path, "r", encoding="utf-8") as f:
            raw_content = f.read()

        # --- Step B: Clean the Content (Remove gaps) ---
        # This Regex looks for 3 or more newlines (or newlines with spaces in between) 
        # and replaces them with just two newlines (standard paragraph break).
        # It turns "\n\n\n\n" -> "\n\n"
        clean_content = re.sub(r'\n\s*\n', '\n\n', raw_content).strip()
        
        # Skip empty files
        if not clean_content:
            continue

        # --- Step D: Create Document ---
        doc = Document(
            page_content=clean_content,
            metadata={
                "url": "https://www.dilmahtea.com/",      # The extracted URL
                "type": "website_content"
            }
        )
        website_documents.append(doc)

    except Exception as e:
        print(f"Skipping {file_path}: {e}")

# --- Verify ---
print(f"Successfully loaded {len(website_documents)} website pages.")

Successfully loaded 10 website pages.


In [30]:
# now let's combine all documents together
documents = youtube_documents + annual_rep_documents + website_documents
print(f"Total documents combined: {len(documents)}")

Total documents combined: 809


In [31]:
documents[0]

Document(metadata={'video_title': 'Blessed Christmas Wishes from the Dilmah Family 2025', 'video_url': 'https://www.youtube.com/watch?v=cnuSE24ECuM'}, page_content='The family‐owned tea, tourism, and cinnamon business was founded 40 years ago, originating from the founder’s grandfather’s vision of healthier, kinder, and more sustainable products. Its stated global mission is to assist hundreds of thousands of less‐fortunate people each year through those three sectors. The company emphasizes quality, integrity, and environmental respect, attributing its impact to worldwide customers. A holiday message references the upcoming year 2026.')

In [32]:
documents[700]

Document(metadata={'report_title': 'DILMAH CEYLON TEA COMPANY PLC ANNUAL REPORT 2024/25', 'page_number': 154}, page_content='<|ref|>title<|/ref|><|det|>[[81, 52, 422, 71]]<|/det|>\n# CORPORATE GOVERNANCE\n\n<|ref|>table<|/ref|><|det|>[[83, 144, 914, 486]]<|/det|>\n\n<table><tr><td colspan="6">Governance Highlights - FY 2024/25</td></tr><tr><td>Name of Director</td><td>Position on the Dilmah Board</td><td colspan="2">No. of Directorships held in listed Companies other than Dilmah Ceylon Tea Company PLC</td><td colspan="2">Directorships held in non- listed Companies</td></tr><tr><td></td><td></td><td>Executive<br>Capacity</td><td>Non-Executive Capacity</td><td>Executive<br>Capacity</td><td>Non-Executive Capacity</td></tr><tr><td>Mr. Dilhan C. Fernando</td><td>Executive Director/Chief<br>Executive Officer/ Chairman</td><td>None</td><td>03</td><td>46</td><td>14</td></tr><tr><td>Mr. Malik J. Fernando</td><td>Executive Director</td><td>None</td><td>05</td><td>44</td><td>14</td></tr><tr><td>M

In [33]:
documents[808]

Document(metadata={'url': 'https://www.dilmahtea.com/', 'type': 'website_content'}, page_content='---\nurl: "https://www.dilmahtea.com/tribute-type-sitemap.xml"\ntitle: undefined\n---\n\nhttps://www.dilmahtea.com/tributes/letters/2025-04-20T00:00:00+00:00https://www.dilmahtea.com/tributes/videos/2025-04-22T00:00:00+00:00')

In [34]:
# Initialize OpenAI Embeddings
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small" # Efficient and high performance
)

In [40]:
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Increase timeout to 60 seconds (default is usually 5 or 10)
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY,
    timeout=60.0  
)

print(qdrant_client.get_collections())

collections=[]


In [41]:
# 1. Import 'models' explicitly
from qdrant_client import models  

# 2. Now you can use 'models.VectorParams' and 'models.Distance'
qdrant_client.recreate_collection(
    collection_name="dilmah_wishes",
    vectors_config=models.VectorParams(
        size=1536,  # Ensure this matches your embedding model
        distance=models.Distance.COSINE
    )
)

print("Collection created successfully.")

  qdrant_client.recreate_collection(


Collection created successfully.


In [42]:
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='dilmah_wishes')]


In [43]:
# 2. Create the Vector Store
# This will initialize the collection in Qdrant and upload the vectors.
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="dilmah_wishes",
    embedding=embeddings,
)
# Add documents manually
vector_store.add_documents(documents=documents)

print(f"Vector store created in collection 'dilmah_wishes' with {len(documents)} document(s).")

Vector store created in collection 'dilmah_wishes' with 809 document(s).


In [45]:
query = "Dilma’s Dream Escape  competition?"

# k=1 returns the top 1 result
results = vector_store.similarity_search(query, k=3)

for doc in results:
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("--------------------------------")

Content: Dilma’s “Dream Escape” competition will award 20 winners a luxury trip for two to the Ceylon Tea region, and the brand is marking 40 years of tea‐time partnerships with Australians while allocating 15 % of its profits to conservation projects such as the Elephant Transit Home. The Elephant Transit Home, situated in Uda Walawe National Park about a 4.5‐hour drive from Colombo, rescues orphaned Asian elephants, rehabilitates them (feeding them five times daily) and releases them in social groups; it has been supported by Dilma Conservation since 2007 and also cares for a 15‐year‐old bull with a prosthetic leg that cannot be re‐wilded. Human
Metadata: {'video_title': 'Channel 7 Australia’s visit to the Elephant Transit Home', 'video_url': 'https://www.youtube.com/watch?v=-yn2ZBmr0xA', '_id': 'cefa7e0a-6deb-42f0-a6d1-32dc80cb0b9c', '_collection_name': 'dilmah_wishes'}
--------------------------------
Content: Dilma is running a tea‐inspired dessert competition that requires partic