## youtube data

In [None]:
from dotenv import load_dotenv
import os
load_dotenv()

GOOGLE_API_KEY =os.getenv("GOOGLE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [13]:
import pandas as pd
# --- 2. Load and Prepare Data ---
# Load the CSV
df = pd.read_csv('youtube_data.csv')

In [14]:
# 1. Replace NaN with an empty string so the code doesn't break
df['Summary'] = df['Summary'].fillna("")

In [16]:
# 2. Define a function to normalize Unicode (fixes \u202f, curly quotes, etc.)
import unicodedata
def clean_text(text):
    # Ensure it's a string before normalizing
    return unicodedata.normalize('NFKC', str(text))

# 3. Apply the cleaning function to the entire 'Summary' column
df['Summary'] = df['Summary'].apply(clean_text)
# 4. Apply to Video Title
df['Video Title'] = df['Video Title'].apply(clean_text)

In [17]:
from langchain_core.documents import Document

documents = []
for index, row in df.iterrows():
    # Create a LangChain Document
    # page_content: This is what gets embedded (The Summary)
    # metadata: This is the extra data you want to retrieve (Title, URL, etc.)
    if not row['Summary'].strip():
        continue

    doc = Document(
        page_content=row['Summary'], 
        metadata={
            "video_title": row['Video Title'], # Now using the cleaned title
            "video_url": row['Video URL'],
        }
    )
    documents.append(doc)

print(f"Created {len(documents)} documents.")

Created 547 documents.


In [19]:
documents[0]

Document(metadata={'video_title': 'Blessed Christmas Wishes from the Dilmah Family 2025', 'video_url': 'https://www.youtube.com/watch?v=cnuSE24ECuM'}, page_content='The family‐owned tea, tourism, and cinnamon business was founded 40 years ago, originating from the founder’s grandfather’s vision of healthier, kinder, and more sustainable products. Its stated global mission is to assist hundreds of thousands of less‐fortunate people each year through those three sectors. The company emphasizes quality, integrity, and environmental respect, attributing its impact to worldwide customers. A holiday message references the upcoming year 2026.')

In [20]:
documents[100]

Document(metadata={'video_title': 'சாயம் அதிகம் இட்டு பால் தேநீர் பருக விரும்புவோருக்கு', 'video_url': 'https://www.youtube.com/watch?v=Ei3J6IfenAk'}, page_content='The speaker drinks extra‐strength milk with honey each morning.')

## Handle annual report

In [21]:
import glob
import re
import os
from langchain_core.documents import Document

# 1. Define the directory path
# (Using raw string r'' to handle backslashes correctly on Windows)
folder_path = r"annual_report_data"
file_pattern = os.path.join(folder_path, "page_*.md")

# 2. Get the list of files
file_list = glob.glob(file_pattern)

# Optional: Sort files numerically by page number (1, 2, 10 instead of 1, 10, 2)
# We extract numbers using regex for sorting
file_list.sort(key=lambda f: int(re.search(r'page_(\d+)', f).group(1)))

documents = []

for file_path in file_list:
    try:
        # 3. Extract Page Number from filename
        # This regex looks for 'page_' followed by digits (\d+)
        match = re.search(r'page_(\d+)\.md', file_path)
        
        if match:
            page_num = int(match.group(1))
        else:
            page_num = 0 # Fallback if filename pattern doesn't match
            
        # 4. Read the file content
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # 5. Create the Document
        doc = Document(
            page_content=content,
            metadata={
                "report_title": "DILMAH CEYLON TEA COMPANY PLC ANNUAL REPORT 2024/25",
                "page_number": page_num,
            }
        )
        documents.append(doc)

    except Exception as e:
        print(f"Error reading {file_path}: {e}")

# Verify
print(f"Successfully loaded {len(documents)} pages.")
print(f"Sample Metadata: {documents[0].metadata}")

Successfully loaded 252 pages.
Sample Metadata: {'report_title': 'DILMAH CEYLON TEA COMPANY PLC ANNUAL REPORT 2024/25', 'page_number': 1}


In [22]:
print(f"Created {len(documents)} documents.")

Created 252 documents.


In [23]:
documents[0]

Document(metadata={'report_title': 'DILMAH CEYLON TEA COMPANY PLC ANNUAL REPORT 2024/25', 'page_number': 1}, page_content='<|ref|>title<|/ref|><|det|>[[316, 113, 696, 277]]<|/det|>\n# Strength, Resilience & Future  \n\n<|ref|>image<|/ref|><|det|>[[0, 308, 999, 999]]<|/det|>')

## handle website

In [32]:
import glob
import os
import re
from langchain_core.documents import Document

# 1. Define the root folder
folder_path = "website_data"

# 2. Find all .md files recursively (in all subfolders)
# "**/*.md" combined with recursive=True searches every nested folder
file_list = glob.glob(os.path.join(folder_path, "**/*.md"), recursive=True)

documents = []

for file_path in file_list:
    try:
        # --- Step A: Read the file ---
        with open(file_path, "r", encoding="utf-8") as f:
            raw_content = f.read()

        # --- Step B: Clean the Content (Remove gaps) ---
        # This Regex looks for 3 or more newlines (or newlines with spaces in between) 
        # and replaces them with just two newlines (standard paragraph break).
        # It turns "\n\n\n\n" -> "\n\n"
        clean_content = re.sub(r'\n\s*\n', '\n\n', raw_content).strip()
        
        # Skip empty files
        if not clean_content:
            continue

        # --- Step D: Create Document ---
        doc = Document(
            page_content=clean_content,
            metadata={
                "url": "https://www.dilmahtea.com/",      # The extracted URL
                "type": "website_content"
            }
        )
        documents.append(doc)

    except Exception as e:
        print(f"Skipping {file_path}: {e}")

# --- Verify ---
print(f"Successfully loaded {len(documents)} website pages.")

if len(documents) > 0:
    print(f"\n--- Sample Metadata ---")
    print(documents[0].metadata)
    print(f"\n--- Sample Content Snippet (First 200 chars) ---")
    print(documents[0].page_content[:200])

Successfully loaded 10 website pages.

--- Sample Metadata ---
{'url': 'https://www.dilmahtea.com/', 'type': 'website_content'}

--- Sample Content Snippet (First 200 chars) ---
---
url: "https://www.dilmahtea.com/"
title: "Home - Dilmah Tea"
---

[Skip to content](https://www.dilmahtea.com/#primary)

[![Dilmah Tea](https://www.dilmahtea.com/wp-content/themes/dilmah/img/logo.


In [33]:
print(f"Created {len(documents)} documents.")

Created 10 documents.


In [34]:
documents[0]

Document(metadata={'url': 'https://www.dilmahtea.com/', 'type': 'website_content'}, page_content='---\nurl: "https://www.dilmahtea.com/"\ntitle: "Home - Dilmah Tea"\n---\n\n[Skip to content](https://www.dilmahtea.com/#primary)\n\n[![Dilmah Tea](https://www.dilmahtea.com/wp-content/themes/dilmah/img/logo.png)](https://www.dilmahtea.com/)\n\n- [Family](https://www.dilmahtea.com/family/)\n\n[close](https://www.dilmahtea.com/#)\n\n[Family](https://www.dilmahtea.com/family/)\n\nA family business, serving humanity with kindness to people and nature.\n\n[Family](https://www.dilmahtea.com/family/)\n\n- [Making a Stand](https://www.dilmahtea.com/family/making-a-stand/)\n\n[close](https://www.dilmahtea.com/#)\n\n- [Founder](https://www.dilmahtea.com/family/founder/)\n\n- [Family Members](https://www.dilmahtea.com/family/members/)\n\n- [Dilmah Story](https://www.dilmahtea.com/family/story/)\n\n- [Kindness](https://www.dilmahtea.com/kindness/)\n\n- [Tea](https://www.dilmahtea.com/teas/)\n\n[close]

In [35]:
documents[9]

Document(metadata={'url': 'https://www.dilmahtea.com/', 'type': 'website_content'}, page_content='---\nurl: "https://www.dilmahtea.com/tribute-type-sitemap.xml"\ntitle: undefined\n---\n\nhttps://www.dilmahtea.com/tributes/letters/2025-04-20T00:00:00+00:00https://www.dilmahtea.com/tributes/videos/2025-04-22T00:00:00+00:00')