In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import OpenAIEmbeddings

# Read the API key from the environment populated by load_dotenv() above.
# os.getenv returns None when the variable is unset; nothing in the visible
# cells checks or references OPENAI_API_KEY afterwards.
# NOTE(review): presumably the langchain_openai clients pick the key up from
# the environment themselves — confirm before removing this line.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

from langchain_openai import ChatOpenAI

# Two chat models. `large_model` drives the summarisation cells below;
# `fast_model` is not referenced in the visible cells.
fast_model = ChatOpenAI(model="gpt-4o-mini")
large_model = ChatOpenAI(model="gpt-4o")
# Embedding model handed to the Qdrant vector store in the next cell.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [2]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Single source of truth for the collection name, so the existence check and
# the vector store can never point at different collections.
COLLECTION_NAME = "pageport_videos"

client = QdrantClient("http://qdrant.homehub.tv")

# Check that the collection exists BEFORE building the vector store, so a
# missing collection surfaces as this explicit error rather than as whatever
# the store raises while inspecting the collection during construction.
if not client.collection_exists(COLLECTION_NAME):
    raise ValueError(f"Collection '{COLLECTION_NAME}' does not exist")

vector_store = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
)

In [3]:
from langchain.document_transformers import EmbeddingsClusteringFilter
from langchain_openai import OpenAIEmbeddings

# Initialize clustering filter.
# Reuse the shared `embeddings` instance defined in the first cell instead of
# constructing a second OpenAIEmbeddings with a duplicated model string: the
# filter and the vector store then always use the same embedding model.
clustering_filter = EmbeddingsClusteringFilter(
    embeddings=embeddings,
    num_clusters=10,
    random_state=42,
)

# Function to return docs from vector_store
from langchain_core.documents import Document

def get_all_docs_from_vectorstore(vector_store, batch_size=1000):
    """
    Retrieve all documents from the Qdrant vector store using the scroll API.

    Pages through the collection `batch_size` points at a time (payloads
    only, no vectors) and converts every point into a LangChain Document.

    Args:
        vector_store: Object exposing `.client` (with a Qdrant-style
            `scroll` method) and `.collection_name`, e.g. a
            QdrantVectorStore instance.
        batch_size: Number of points requested per scroll call.
            Defaults to 1000.

    Returns:
        List of Document objects, one per point in the collection. The
        payload's "page_content" value becomes the document text (empty
        string when missing or None); every other payload key is copied into
        the metadata. Returns an empty list when the collection is empty or
        when any error occurs (the error is printed, not raised).
    """
    try:
        # Access the underlying Qdrant client
        qdrant = vector_store.client

        all_docs = []
        offset = None

        while True:
            # No filter: scroll returns (records, next_offset) for this page.
            records, next_offset = qdrant.scroll(
                collection_name=vector_store.collection_name,
                limit=batch_size,
                offset=offset,
                with_payload=True,
                with_vectors=False,  # Payload only; vectors are not used here
            )

            for record in records:
                # A point may carry no payload at all; treat it as empty
                # instead of letting AttributeError abort the whole retrieval.
                payload = record.payload or {}
                # NOTE(review): if the collection was written through
                # LangChain's Qdrant integration, the metadata is presumably
                # nested under a "metadata" key and so ends up as
                # metadata["metadata"] here — confirm against the collection.
                all_docs.append(
                    Document(
                        page_content=payload.get("page_content") or "",
                        metadata={
                            key: value
                            for key, value in payload.items()
                            if key != "page_content"
                        },
                    )
                )

            # The last page is signalled by a None offset. Compare against
            # None explicitly so a falsy-but-valid offset (e.g. integer 0)
            # does not end the scroll early.
            if next_offset is None or not records:
                break

            offset = next_offset

        if not all_docs:
            print("Warning: No documents found in the vector store")

        return all_docs

    except Exception as e:
        # Best-effort contract kept from the original: report and return [].
        print(f"Error retrieving documents: {str(e)}")
        return []

# Usage:
docs = get_all_docs_from_vectorstore(vector_store)

# get_all_docs_from_vectorstore returns [] both for an empty collection and
# for any retrieval error, so stop here with a clear message instead of
# letting the clustering step fail on empty or too-small input.
if not docs:
    raise ValueError("No documents retrieved from the vector store; nothing to cluster")
if len(docs) < clustering_filter.num_clusters:
    raise ValueError(
        f"Only {len(docs)} documents retrieved, but "
        f"{clustering_filter.num_clusters} clusters were requested"
    )

# Perform clustering on documents
clustered_docs = clustering_filter.transform_documents(docs)

# Extract the 'page_content' from each Document in clustered_docs
extracted_content = [doc.page_content for doc in clustered_docs]

In [None]:
from langchain.prompts import ChatPromptTemplate

# Define the hook generation prompt template.
# Takes a single input variable, `document`, and asks for 2-3 VSL hooks in a
# fixed "quote / core idea / hook" layout.
# NOTE(review): this template is not referenced by any of the visible cells —
# confirm whether it is still needed.
hook_prompt_template = ChatPromptTemplate.from_template(
    """
    You’re crafting hooks for a video sales letter (VSL) featuring Todd Hoins, a financial advisor at Brokerage Specialists. Based on the following document, identify video ideas—either directly mentioned or implied through pain points, services, or client needs. Focus on creating hooks that grab attention, spark curiosity, or highlight stakes, tailored to retirement planning, fiduciary services, or wealth management.

    FORMAT EACH HOOK LIKE THIS:
    Quote or context from document: (Direct quote or summary of relevant content).  
    Core video idea: (Summary of the video concept inspired by the content).  
    Hook: (A concise, compelling hook for the VSL).

    RETURN 2-3 HOOKS PER DOCUMENT, INCLUDING AT LEAST ONE BASED ON A DIRECTLY MENTIONED VIDEO IDEA (IF PRESENT) AND OTHERS FROM IMPLIED THEMES.

    Follow these rules when formulating your responses:  
  
    - Never use a metaphor, simile, or other figure of speech which you are used to seeing in print.
    - Never use a long word where a short one will do.
    - If it is possible to cut a word out, always cut it out.
    - Never use the passive where you can use the active.
    - Never use a foreign phrase, a scientific word, or a jargon word if you can think of an everyday English equivalent.
    - Break any of these rules sooner than say anything outright barbarous.

    Document: {document}
    """
)

# Summarisation prompt: takes a single input variable, `document`, and asks
# for a 150-200 word summary. The loop below pipes it into `large_model` to
# build `unique_pillars`.
find_pillar_content = ChatPromptTemplate.from_template(
    """
    Act as an expert summarizer. I will provide you with a transcript, and your task is to create a concise and accurate summary of its key points. Focus on capturing the main ideas, important details, and overall purpose of the conversation or content, while omitting redundant or minor information. Ensure the summary is clear, well-structured, and written in natural language. If there are any unclear sections in the transcript, note them briefly and make an educated guess based on context where possible. Once I provide the transcript, deliver the summary in 150-200 words.

    Follow these rules when formulating your responses:  
  
    - Never use a metaphor, simile, or other figure of speech which you are used to seeing in print.
    - Never use a long word where a short one will do.
    - If it is possible to cut a word out, always cut it out.
    - Never use the passive where you can use the active.
    - Never use a foreign phrase, a scientific word, or a jargon word if you can think of an everyday English equivalent.
    - Break any of these rules sooner than say anything outright barbarous.

    Document: {document}
    """
)


# Summarise each clustered document into "pillar" content.
# The prompt | model chain is identical for every document, so it is built
# once here rather than on every iteration.
chain = find_pillar_content | large_model

unique_pillars = []
for i, document in enumerate(extracted_content):
    # Invoke the chain with the document text
    response = chain.invoke({"document": document})
    pillar = response.content.strip()
    unique_pillars.append(pillar)

    # Labelled "Pillar": this cell produces summaries via find_pillar_content,
    # not hooks.
    print(f"Pillar {i}: {pillar}")

In [8]:
from langchain.prompts import ChatPromptTemplate

# Directory that receives the generated markdown file (see `output_file`
# further down in this cell). exist_ok=True makes re-running the cell safe
# when the directory already exists.
output_dir = "scripts"
os.makedirs(output_dir, exist_ok=True)

# Define the VSL script prompt template.
# Input variables: `hook`, `document`, and `docs` (related documents).
# NOTE(review): this template is not invoked by any of the visible cells —
# confirm whether it is still needed.
vsl_prompt_template = ChatPromptTemplate.from_template(
    """
    Create a totally unique, persuasive video sales letter (VSL) script featuring Todd E. Hoins, a financial advisor at Brokerage Specialists. Use the following hook as the central theme and incorporate details from the provided document to craft an engaging, viewer-focused script:

    Hook: {hook}
    Document: {document}
    Related Docs: {docs}

    Ensure the script is conversational, emotionally engaging, and designed to build trust and excitement. Naturally weave in keywords to reinforce Todd Hoins’ expertise while maintaining a persuasive flow tailored for video delivery.

    Follow these rules when formulating your responses:  
  
    - Never use a metaphor, simile, or other figure of speech which you are used to seeing in print.
    - Never use a long word where a short one will do.
    - If it is possible to cut a word out, always cut it out.
    - Never use the passive where you can use the active.
    - Never use a foreign phrase, a scientific word, or a jargon word if you can think of an everyday English equivalent.
    - Break any of these rules sooner than say anything outright barbarous.

    """
)

# Define the executive summary prompt template (not a VSL script prompt).
# Input variables: `pillar_content` (the earlier summary), `document` (the
# original text) and `docs` (related documents). The generation loop at the
# end of this cell pipes it into `large_model`.
executive_summary_prompt_template = ChatPromptTemplate.from_template(
    """
    Act as an expert summarizer and video marketing analyst. I will provide you with a transcript from a meeting about video marketing, and your task is to create a concise and accurate summary of its key points. Focus on capturing the main ideas, important details, and overall purpose of the conversation or content, while omitting redundant or minor information. Ensure the summary is clear, well-structured, and written in natural language. If there are any unclear sections in the transcript, note them briefly and make an educated guess based on context where possible.
    Additionally, search the transcript for any explicit video marketing ideas or suggestions (e.g., specific video concepts, formats, platforms, or strategies). Extract these ideas and list them separately under a 'Video Marketing Ideas' section. If no explicit ideas are present, suggest 2-3 potential video ideas based on the transcript’s themes and context, clearly labeling them as inferred suggestions. 

    Executive Summary: {pillar_content}
    Original Document: {document}
    Related Docs: {docs}

    Follow these rules when formulating your responses:  
  
    - Never use a metaphor, simile, or other figure of speech which you are used to seeing in print.
    - Never use a long word where a short one will do.
    - If it is possible to cut a word out, always cut it out.
    - Never use the passive where you can use the active.
    - Never use a foreign phrase, a scientific word, or a jargon word if you can think of an everyday English equivalent.
    - Break any of these rules sooner than say anything outright barbarous.

    """
)

# Single markdown file that collects every generated summary.
output_file = os.path.join(output_dir, "video_marketing_summaries.md")

# Open in write mode ("w") so each run of this cell starts from an empty file
# containing only the header; the loop below re-opens it in append mode.
# Switch to "a" here to keep the output of earlier runs instead.
with open(output_file, "w", encoding="utf-8") as file:
    file.write("# Video Marketing Summaries\n\n")  # Top-level heading of the document

# Generate one executive summary per (pillar, source document) pair and append
# each to the single markdown file.

# zip() stops silently at the shorter list. The two lists are built in
# different cells, so a stale `unique_pillars` from an earlier run would pair
# summaries with the wrong documents or drop some; fail loudly instead.
if len(unique_pillars) != len(extracted_content):
    raise ValueError(
        f"unique_pillars has {len(unique_pillars)} items but extracted_content has "
        f"{len(extracted_content)}; re-run the pillar generation cell"
    )

# The retriever and the prompt | model chain do not change between
# iterations, so build them once.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.5},
)
chain = executive_summary_prompt_template | large_model

for i, (pillar_content, document) in enumerate(zip(unique_pillars, extracted_content)):
    print(f"Generating summary {i}...")

    # Related content from the vector store. Stored under its own name so the
    # notebook-level `docs` list (all documents, built in the clustering cell)
    # is not overwritten.
    related_docs = retriever.invoke(pillar_content)

    # Pass the related documents to the prompt as plain text; formatting the
    # list itself would insert the Python repr of each Document object.
    related_text = "\n\n---\n\n".join(doc.page_content for doc in related_docs)

    response = chain.invoke(
        {"pillar_content": pillar_content, "document": document, "docs": related_text}
    )

    # Append per iteration so summaries already generated survive a failure
    # later in the loop.
    with open(output_file, "a", encoding="utf-8") as file:
        file.write(f"## Summary {i}\n\n")
        file.write(response.content)
        file.write("\n\n---\n\n")  # Separator between summaries for readability

    print(f"Summary {i} appended to {output_file}")

print(f"All summaries saved to {output_file}")


Generating summary 0...
Summary 0 appended to scripts/video_marketing_summaries.md
Generating summary 1...
Summary 1 appended to scripts/video_marketing_summaries.md
Generating summary 2...
Summary 2 appended to scripts/video_marketing_summaries.md
Generating summary 3...
Summary 3 appended to scripts/video_marketing_summaries.md
Generating summary 4...
Summary 4 appended to scripts/video_marketing_summaries.md
Generating summary 5...
Summary 5 appended to scripts/video_marketing_summaries.md
Generating summary 6...
Summary 6 appended to scripts/video_marketing_summaries.md
Generating summary 7...
Summary 7 appended to scripts/video_marketing_summaries.md
Generating summary 8...
Summary 8 appended to scripts/video_marketing_summaries.md
Generating summary 9...
Summary 9 appended to scripts/video_marketing_summaries.md
All summaries saved to scripts/video_marketing_summaries.md
