In [1]:
import os
from dotenv import load_dotenv

# Populate the process environment from the .env file
load_dotenv()

# Read the API key that the .env file is expected to provide
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Connection settings are read from environment variables
qdrant_url = os.getenv("QDRANT_URL")
qdrant_collection = os.getenv("QDRANT_COLLECTION")

# Qdrant client plus the embedding model used to embed queries
client = QdrantClient(url=qdrant_url)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Vector store bound to the configured collection
vector_store = QdrantVectorStore(
    client=client,
    collection_name=qdrant_collection,
    embedding=embeddings,
)

In [None]:
import csv
import os
import re

import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model that writes the answers; the model id is kept in one named place
LLM_MODEL = "o3-2025-04-16"
llm = ChatOpenAI(model=LLM_MODEL)

# Resolve the directory this code runs from.
# A script has `__file__` defined; an interactive session (e.g. Jupyter) does
# not, so fall back to the process working directory there.
if "__file__" in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Debug aid: show which directory was resolved
print(f"Current working directory: {current_dir}")

# Set base directory.
# The project root defaults to the original checkout location, but can be
# overridden with the COGNITIVE_ASSISTANT_BASE_DIR environment variable so the
# notebook is not tied to a single machine. An unset or empty value falls back
# to the default.
base_dir = (
    os.getenv('COGNITIVE_ASSISTANT_BASE_DIR')
    or '/home/codyt/Documents/Projects/Cognitive-Assistant'
)
input_file = os.path.join(base_dir, 'system-prompt-creator', 'questions.csv')
output_file = os.path.join(base_dir, 'data', 'questions_with_answers_v4.csv')

# Check if input file exists and provide feedback: list what is actually on
# disk before failing, so a wrong base directory is easy to diagnose.
if not os.path.exists(input_file):
    print(f"Error: Input file not found at: {input_file}")
    print(f"Current directory contents: {os.listdir(current_dir)}")
    if os.path.exists(os.path.dirname(input_file)):
        print(f"system-prompt-creator directory contents: {os.listdir(os.path.dirname(input_file))}")
    raise FileNotFoundError(f"Cannot find questions.csv at {input_file}")

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Load the questions; any answer column the CSV lacks is added as all-missing
print(f"Reading file from: {input_file}")
df = pd.read_csv(input_file)
absent_answer_cols = [
    name for name in ('Answer 1', 'Answer 2', 'Answer 3') if name not in df.columns
]
for name in absent_answer_cols:
    df[name] = pd.NA

# Prompt template text for the RAG chain. It carries two placeholders:
# `{question}` (used twice: in the task description and under "User message")
# and `{context}` (under "Context"). It is wrapped in a ChatPromptTemplate
# further down; the text itself is runtime data and must not be reformatted.
message = """
You are the inner voice of the author.
Your task is to weave the retrieved journal snippets (“Context”) into a single, first-person reflection that answers the posed question (“{question}”).  

Guidelines  

1. Tone: reflective, vulnerable, narrative; no mention of external theories or their creators.  
2. Source: rely only on the provided Context—never invent new biographical facts.  
3. Abstraction: after each concrete detail, immediately surface the broader principle or pattern it reveals.  
   • Target ≈ 30 percent illustrative detail, 70 percent generalized insight that would still make sense to someone unfamiliar with the specific events.
   • At all times maintain complete fidelity to the context. Don't hallucinate details that didn't happen to preserve coherence.
   • Never mention people's or places names. Always generalize instead to preserve privacy. 
4. Form: one cohesive paragraph of roughly 200-400 words; avoid numbered lists or direct quotations unless indispensable.  
5. Insufficient data: if the Context lacks substance, first abstract whatever can be inferred, then write:  
   “I don’t have enough information yet to answer this fully.”  
   and list two or three clarifying sub-questions the author could explore.  
6. Quality check: before finalizing, reread and revise any statement that would feel opaque or overly specific to an outside reader.  

Follow these rules when formulating your responses:

- Never use a metaphor, simile, or other figure of speech which you are used to seeing in print.
- Never use a long word where a short one will do.
- If it is possible to cut a word out, always cut it out.
- Never use the passive where you can use the active.
- Never use a foreign phrase, a scientific word, or a jargon word if you can think of an everyday English equivalent.
- Break any of these rules sooner than say anything outright barbarous.

Language Constraints: Clear, Grounded, Action-Oriented
Use language that is:
Practical, concrete, and free of spiritual or self-help jargon.
Focused on clarity over inspiration—assume the user is already motivated, just seeking alignment and traction.
Written as if you're explaining things to a peer with a sharp mind and little tolerance for fluff.

Avoid:
Terms like "identity shift," "embodying transformation," "manifest," or "intentional living."
Vague abstractions: "step into your power," "hold space," "rise into alignment," etc.
Anything that requires interpretation to understand what to do next.

Prefer:
Words like "build," "improve," "simplify," "track," "limit," "adjust," "review," "keep going."
Instructions that give clear actions with clear outcomes.
Reflections that stay grounded in behavior or tangible change.

If you're unsure, ask: Would this make sense to a focused, skeptical builder who wants to make real progress, not perform transformation?
If not, rewrite it.
     

User message
––––––––––––
{question}   

Context
–––––––
{context} 
"""

# Retrieve relevant documents.
# Search parameters have to be supplied through `search_kwargs`: keyword
# arguments passed directly to `as_retriever` are not forwarded to the
# vector-store search, so the previous `k=20, score_threshold=0.5` had no
# effect. With `search_kwargs` the retriever asks for up to 20 documents and
# passes 0.5 as the score threshold to the store's similarity search.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 20, "score_threshold": 0.5}
)

# Format the documents
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` yields nothing.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Wrap the template text in a single human-role chat message
prompt = ChatPromptTemplate.from_messages([("human", message)])

# Map the incoming query onto the two template variables: "context" is the
# retriever output rendered by format_docs, "question" is the query unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build inputs -> fill the prompt -> call the model
rag_chain = chain_inputs | prompt | llm

# Function to process a single question and answer
def process_question(row, question_col, answer_col):
    """Return the answer for one question cell, generating it only when needed.

    Args:
        row: Record indexed by column name (e.g. a DataFrame row).
        question_col: Name of the column holding the question text.
        answer_col: Name of the column holding the (possibly missing) answer.

    Returns:
        The existing answer when one is already present. When the answer is
        missing but the question is itself missing or blank, the existing
        (missing) value is returned unchanged and the chain is not called.
        Otherwise, the ``content`` of the response from ``rag_chain.invoke``.
    """
    existing_answer = row[answer_col]
    if not pd.isna(existing_answer):
        return existing_answer  # Already answered: keep it as is

    query = row[question_col]
    # A missing or whitespace-only question gives the retriever and model
    # nothing to work with, so skip the call instead of sending it.
    if pd.isna(query) or (isinstance(query, str) and not query.strip()):
        return existing_answer

    response = rag_chain.invoke(query)
    return response.content

# Process questions in a loop: each question column is paired with the answer
# column carrying the same number.
question_pairs = [
    (f'Question {number}', f'Answer {number}') for number in (1, 2, 3)
]

# Answer columns that exist in the CSV but are entirely empty load as float64
# (all NaN). Cast them to object up front so string answers can be stored
# without pandas having to upcast the column on assignment.
for _, answer_col in question_pairs:
    if answer_col in df.columns:
        df[answer_col] = df[answer_col].astype(object)

for index, row in df.iterrows():
    for question_col, answer_col in question_pairs:
        if question_col not in df.columns:  # Skip question columns the CSV lacks
            continue

        # `row` is the snapshot taken when iteration reached this row, so this
        # tells us whether process_question will generate a new answer.
        answer_was_missing = pd.isna(row[answer_col])
        answer = process_question(row, question_col, answer_col)

        # Clean only the value just produced: collapse every whitespace run
        # (newlines included) into a single space.
        if isinstance(answer, str):
            answer = re.sub(r'\s+', ' ', answer)
        df.at[index, answer_col] = answer

        # Save after each newly generated answer to prevent data loss;
        # rewriting the file when nothing changed is skipped.
        if answer_was_missing:
            df.to_csv(output_file, index=False, quoting=csv.QUOTE_NONNUMERIC)

# Final formatting of all columns (whitespace runs, including newlines,
# become a single space everywhere, pre-existing answers included)
df = df.replace(r'\s+', ' ', regex=True)

# Final save
df.to_csv(output_file, index=False, quoting=csv.QUOTE_NONNUMERIC)
print(f"Results saved to: {output_file}")

Current working directory: /home/codyt/Documents/Projects/Cognitive-Assistant/system-prompt-creator
Reading file from: /home/codyt/Documents/Projects/Cognitive-Assistant/system-prompt-creator/questions.csv
Results saved to: /home/codyt/Documents/Projects/Cognitive-Assistant/data/questions_with_answers_v4.csv
