In [1]:
import credentials

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.prompts import PromptTemplate

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

import pinecone

import time
import os

  from tqdm.autonotebook import tqdm


# Experimentation

This notebook experiments with the components needed to meet the objective: generate text in the author's writing style, drawing on both the stored documents and the knowledge already contained in the trained LLM.

The variable breakdown of the experiment is as follows: 

### Constant Variables
- Documents Uploaded (all documents are authored solely by the author whose style is to be mimicked)
    - Character Growth Manifesto
    - Tracking
    - Form Follows Function
    - Thriving Romantically
- Queries Asked (Targeting a specific section of document generation)


### Independent Variables
- Embedding Chunk Size
- Embedding Overlap
- Query Search Method *
- Q&A LLM model
- Q&A Cohesion Method *
- Generative Model Query
- Temperature


### Vector Database Independent Variables
- Search type: Either Max Marginal Relevance Search (MMR) or Similarity search
- k: Number of documents to be returned
- lambda_mult: controls the diversity of results returned by the MMR algorithm, with 1 being minimum diversity and 0 being maximum. Defaults to 0.5.
- filter: Allows you to define a filter on what documents should be retrieved, based on the documents' metadata. This has no effect if the Vectorstore doesn't store any metadata.



### Dependent Variables
- Generative output 

**Note:** Each output will have a stored set of variables describing the experimental parameters that produced that output. 

## Simple Generative Case

Create a simple example of the generative pipeline before building the experiment.

In [2]:
# --- Vector store settings ---
INDEX_NAME = "flourishing-humanity"  # Name of the Pinecone index used in this walkthrough
EMBEDDINGS = HuggingFaceEmbeddings()  # Embedding model used when inserting documents
DIMENSION = 768  # Dimension the index is created with (presumably the embedding width - confirm)

# --- Chunking settings ---
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

# --- Question-answering settings ---
QA_MODEL = "gpt-3.5-turbo"
TEMPERATURE = 0
QUERY = (
    "Further expand on the information in the character growth manifesto to empower readers. "
    "Specifically, generate a daily routine to help them take control of their lives."
)

In [3]:
# Load credentials through the project-local `credentials` module.
# NOTE(review): presumably this populates PINECONE_API_KEY / PINECONE_ENV, which
# later cells read with os.getenv - confirm against credentials.py.
credentials.set_credentials()

In [4]:
# Connect to Pinecone with the key and environment read from environment variables.
pinecone.init(os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENV"))

# Create the index only if it is not already listed, so re-running this cell
# does not try to create it twice.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(name=INDEX_NAME, dimension=DIMENSION)
    
# Handle to the index; not referenced again by the cells visible in this notebook.
index = pinecone.Index(INDEX_NAME)

In [5]:
def load_documents(file_path: str):
    """Load the ``.txt`` files found under ``file_path``.

    Files are matched with the glob ``**/*.txt`` and read with ``TextLoader``;
    a progress bar is displayed while loading.

    Args:
        file_path: Directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    txt_loader = DirectoryLoader(
        file_path,
        glob="**/*.txt",
        loader_cls=TextLoader,
        show_progress=True,
    )
    return txt_loader.load()

In [6]:
# Load the knowledge-base text files from the directory one level above the notebook.
path = "../kb/"
documents = load_documents(path)

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 2566.89it/s]


In [7]:
# Split the loaded documents into chunks for embedding.
# The recorded output shows that some chunks still exceed CHUNK_SIZE.
text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(documents)

Created a chunk of size 1516, which is longer than the specified 1000
Created a chunk of size 1907, which is longer than the specified 1000
Created a chunk of size 1136, which is longer than the specified 1000
Created a chunk of size 1039, which is longer than the specified 1000
Created a chunk of size 1166, which is longer than the specified 1000
Created a chunk of size 1065, which is longer than the specified 1000
Created a chunk of size 1100, which is longer than the specified 1000
Created a chunk of size 1002, which is longer than the specified 1000
Created a chunk of size 1595, which is longer than the specified 1000
Created a chunk of size 1719, which is longer than the specified 1000
Created a chunk of size 1270, which is longer than the specified 1000
Created a chunk of size 1094, which is longer than the specified 1000
Created a chunk of size 1839, which is longer than the specified 1000
Created a chunk of size 1087, which is longer than the specified 1000
Created a chunk of s

Each chunk's metadata stores the path of the file it came from under `source`. We extend this metadata with the document's full title and its author in order to support both author and non-author queries.

In [8]:
# Inspect the metadata of the first chunk to see what the loader recorded.
print(docs[0].metadata)

{'source': '../kb/form_follows_function-paul_rohde.txt'}


In [9]:
# Derive "title" and "author" for every chunk from its source path, which has
# the form "../kb/<title>-<author>.txt" (see the metadata printed above).
for chunk in docs:
    source_path = chunk.metadata['source']
    hyphen_at = source_path.find('-')  # First hyphen separates the title from the author
    dot_at = source_path.rfind('.')  # Last dot marks the start of the ".txt" extension

    title = source_path[6:hyphen_at].replace('_', ' ')  # Skip the 6-character "../kb/" prefix
    author = source_path[hyphen_at + 1:dot_at]  # Underscores are kept in this exploratory version
    print(f"{title}|{author}")

    chunk.metadata.update(title=title, author=author)

form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follo

In [38]:
# Initialise the Pinecone client from environment variables.
# NOTE(review): this cell's execution count (38) is out of sequence, and
# pinecone.init was already called in an earlier cell of this section.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)

In [11]:
# Insert the prepared chunks into the Pinecone index INDEX_NAME using EMBEDDINGS.
vectordb = Pinecone.from_documents(docs, EMBEDDINGS, index_name=INDEX_NAME)  # Insert documents into vector db

In [39]:
# Chat model that answers the query, configured from the constants defined above.
llm = ChatOpenAI(model_name=QA_MODEL, temperature=TEMPERATURE)  # Instantiate llm

In [42]:
# Create the QA retrieval chain on top of the vector store's default retriever.
# The source documents are returned with the answer so they can be inspected below.
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)

In [14]:
# Run QUERY through the chain; the response is indexed by 'result' and 'source_documents' below.
response = qa({"query": QUERY})  # Perform the query

In [15]:
# Show the generated answer text.
print(response['result'])  # Print the response

In The Character Growth Manifesto, we not only provide readers with a comprehensive understanding of character development but also offer practical strategies to empower them in taking control of their lives. One such strategy is the creation of a daily routine that fosters personal growth and self-improvement. Here is a suggested daily routine that can help you on your journey to becoming the person you aspire to be:

1. Morning Reflection and Goal Setting:
Start your day by taking a few moments to reflect on your goals and aspirations. Write down your intentions for the day and visualize yourself achieving them. This practice will help you set a positive mindset and focus your energy on what truly matters to you.

2. Mindfulness or Meditation:
Engage in a mindfulness or meditation practice to cultivate a sense of calm and clarity. This can involve deep breathing exercises, guided meditation, or simply sitting in silence and observing your thoughts. By practicing mindfulness, you can 

In [16]:
# Display the source documents returned alongside the answer.
response['source_documents']  # print the source documents

[Document(page_content='By addressing these common pain points, The Character Growth Manifesto provides readers with the knowledge and tools necessary to transform their lives and create lasting, positive change.', metadata={'author': 'paul_rohde', 'source': '../kb/character_growth_manifesto-paul_rohde.txt', 'title': 'character growth manifesto'}),
 Document(page_content="\ufeffThe Character Growth Manifesto - A Comprehensive Manual to the Systematic Development of Character\nPreface\nDiscover the transformative power of character development and unlock your full potential with The Character Growth Manifesto, a groundbreaking and comprehensive guide to personal growth and self-improvement. \n\n\nThe Character Growth Manifesto is designed for anyone committed to personal growth and self-improvement, regardless of age, background, or current stage in life. Whether you're a student, a professional, a parent, or simply someone seeking to better understand yourself and your potential, this 

In [17]:
# Remove the index so that the experiment section below starts from an empty vector store.
pinecone.delete_index(INDEX_NAME)

## Method

In [43]:
# Index name for the experiment runs (same value as in the simple generative case above).
INDEX_NAME = "flourishing-humanity"

In [44]:
# Query used for the experiment runs; it asks for a "rough" daily routine, unlike the earlier QUERY.
QUERY = "Further expand on the information in the character growth manifesto to empower readers. Specifically, generate a rough daily routine to help them take control of their lives."

In [45]:
# Load credentials through the project-local `credentials` module before any service is contacted.
# NOTE(review): the same call already appears earlier in the notebook.
credentials.set_credentials()

In [46]:
def create_index(index_name: str, dimension: int):
    """Connect to Pinecone and return a handle to ``index_name``, creating it if absent.

    Args:
        index_name: Name of the Pinecone index.
        dimension: Vector dimension used if the index has to be created.

    Returns:
        A ``pinecone.Index`` for ``index_name``.
    """
    pinecone.init(
        os.getenv("PINECONE_API_KEY"),
        environment=os.getenv("PINECONE_ENV"),
    )

    already_exists = index_name in pinecone.list_indexes()
    if not already_exists:
        pinecone.create_index(name=index_name, dimension=dimension)

    return pinecone.Index(index_name)

In [61]:
def load_documents(file_path: str):
    """Read the ``.txt`` files under ``file_path`` into documents.

    NOTE(review): a function with the same name and body is defined earlier in
    this notebook; this later definition shadows it.

    Args:
        file_path: Directory passed to ``DirectoryLoader``; files are matched
            with the glob ``**/*.txt`` and read with ``TextLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``.
    """
    loader_options = {
        "glob": "**/*.txt",
        "loader_cls": TextLoader,
        "show_progress": True,
    }
    directory_loader = DirectoryLoader(file_path, **loader_options)
    return directory_loader.load()


def prep_docs(documents, chunk_size, chunk_overlap):
    """Split ``documents`` into chunks with ``CharacterTextSplitter``.

    Args:
        documents: Documents as returned by ``load_documents``.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


def modify_metadata(docs):
    """Add ``title`` and ``author`` metadata to each document from its source file name.

    Source files are expected to be named ``<title>-<author>.<ext>`` with
    underscores standing in for spaces. For example
    ``../kb/form_follows_function-paul_rohde.txt`` yields the title
    ``"form follows function"`` and the author ``"paul rohde"``.

    Only the file name is parsed, so the result is the same whichever
    directory the file was loaded from (nested folders, or directory names
    that contain a hyphen).

    Args:
        docs: Iterable of objects with a ``metadata`` dict holding a
            ``'source'`` path.

    Returns:
        The same ``docs`` object; every element's metadata is updated in place.
        If a file name contains no hyphen, the whole stem becomes the title
        and the author is an empty string.
    """
    for doc in docs:
        file_name = os.path.basename(doc.metadata['source'])
        stem = os.path.splitext(file_name)[0]  # Drop the extension (e.g. ".txt")
        # Split on the first hyphen only, so hyphens inside the author part survive.
        raw_title, _, raw_author = stem.partition('-')

        doc.metadata['title'] = raw_title.replace('_', ' ')
        doc.metadata['author'] = raw_author.replace('_', ' ')
    return docs

In [56]:
def initialize_pinecone():
    """Initialise the Pinecone client from PINECONE_API_KEY and PINECONE_ENV."""
    connection_settings = {
        "api_key": os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
        "environment": os.getenv("PINECONE_ENV"),  # next to api key in console
    }
    pinecone.init(**connection_settings)

In [57]:
def experiment_exe(query: str, index_name: str, dimension: int, chunk_size: int, chunk_overlap: int, embedding, model: str, temperature: float, search_type: str, k: int, lambda_mult: float):
    """Run one end-to-end retrieval-QA experiment and return its answer and sources.

    The function creates the Pinecone index if needed, loads and chunks the
    documents under ``../kb/``, adds title/author metadata, inserts the chunks
    into the index, asks ``query`` through a ``RetrievalQA`` chain and finally
    deletes the index so that the next experiment starts from a clean store.

    Args:
        query: Question passed to the QA chain.
        index_name: Name of the Pinecone index to create, fill and delete.
        dimension: Vector dimension used when creating the index.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Chunk overlap passed to the text splitter.
        embedding: Embedding object used to insert the chunks.
        model: Chat model name given to ``ChatOpenAI``.
        temperature: Sampling temperature given to ``ChatOpenAI``.
        search_type: Retriever search type, e.g. ``'mmr'``.
        k: Number of documents the retriever should return.
        lambda_mult: MMR diversity parameter; only forwarded when
            ``search_type == 'mmr'`` and ignored otherwise.

    Returns:
        A tuple ``(result, source_documents)`` taken from the chain response.
    """
    create_index(index_name, dimension)  # Make sure the index exists before inserting

    try:
        path = "../kb/"
        documents = load_documents(path)
        docs = prep_docs(documents, chunk_size, chunk_overlap)  # Prepare the documents for upload
        docs = modify_metadata(docs)  # Modify each documents metadata to include title and author

        initialize_pinecone()

        vectordb = Pinecone.from_documents(docs, embedding, index_name=index_name)

        llm = ChatOpenAI(model_name=model, temperature=temperature)

        # Only MMR understands lambda_mult, so it is left out for every other search type.
        search_kwargs = {'k': k}
        if search_type == 'mmr':
            search_kwargs['lambda_mult'] = lambda_mult

        retriever = vectordb.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
        qa = RetrievalQA.from_chain_type(llm, retriever=retriever, return_source_documents=True)  # Create the QA retrieval chain

        # Use the caller's query rather than the notebook-level QUERY constant.
        response = qa({"query": query})  # Perform the query
    finally:
        # Delete the index even when a step above fails; otherwise the next
        # experiment would insert its chunks into the leftover index.
        pinecone.delete_index(index_name)

    return (response['result'], response['source_documents'])

In [58]:
# Single trial run of the experiment function with hand-picked settings.
result, context = experiment_exe(
    query=QUERY,
    index_name=INDEX_NAME,
    dimension=768,
    chunk_size=500,
    chunk_overlap=0,
    embedding=HuggingFaceEmbeddings(),
    model="gpt-3.5-turbo",
    temperature=0,
    search_type='mmr',
    k=4,
    lambda_mult=0.5,
)

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 2781.37it/s]
Created a chunk of size 504, which is longer than the specified 500
Created a chunk of size 522, which is longer than the specified 500
Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 1516, which is longer than the specified 500
Created a chunk of size 843, which is longer than the specified 500
Created a chunk of size 599, which is longer than the specified 500
Created a chunk of size 857, which is longer than the specified 500
Created a chunk of size 1907, which is longer than the specified 500
Created a chunk of size 600, which is longer than the specified 500
Created a chunk of size 848, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 1136, which is longer than the specified 500
Created a chunk of size 738, which is longer than the specified 500
Created a chunk of size 1039, wh

Created a chunk of size 826, which is longer than the specified 500
Created a chunk of size 1206, which is longer than the specified 500
Created a chunk of size 1480, which is longer than the specified 500
Created a chunk of size 683, which is longer than the specified 500
Created a chunk of size 1153, which is longer than the specified 500
Created a chunk of size 722, which is longer than the specified 500
Created a chunk of size 875, which is longer than the specified 500
Created a chunk of size 541, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 635, which is longer than the specified 500
Created a chunk of size 764, which is longer than the specified 500
Created a chunk of size 737, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 944, which is longer than the specified 500
Created a chunk of size 919, which is longer 

Created a chunk of size 694, which is longer than the specified 500
Created a chunk of size 816, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 529, which is longer than the specified 500
Created a chunk of size 822, which is longer than the specified 500
Created a chunk of size 531, which is longer than the specified 500
Created a chunk of size 542, which is longer than the specified 500
Created a chunk of size 656, which is longer than the specified 500
Created a chunk of size 807, which is longer than the specified 500
Created a chunk of size 717, which is longer than the specified 500
Created a chunk of size 752, which is longer than the specified 500
Created a chunk of size 566, which is longer than the specified 500
Created a chunk of size 808, which is longer than the specified 500
Created a chunk of size 606, which is longer than the specified 500
Created a chunk of size 2111, which is longer th

{'source': '../kb/form_follows_function-paul_rohde.txt', 'title': 'form follows function', 'author': 'paul_rohde'}


In [59]:
print(result)

Creating a daily routine can be a powerful tool in taking control of your life and fostering character growth. Here is a rough outline of a daily routine that can empower you to develop your character deliberately:

1. Morning Reflection: Start your day by reflecting on your character goals and intentions. Ask yourself questions like, "What character qualities do I want to embody today?" and "How can I apply these qualities in my actions and interactions?"

2. Journaling: Take a few minutes to journal about your character development journey. Write about situations where you have successfully demonstrated character qualities and how it made you feel. Also, identify missed opportunities and imagine how you would have felt if you had acted differently. This practice will increase your self-awareness and reinforce positive behavior.

3. Mindful Practices: Incorporate mindfulness into your routine. Engage in activities like meditation, deep breathing exercises, or yoga to cultivate self-aw

In [60]:
print(context[2])

page_content='* When did I show/practice/express character quality X today? Celebrate! How did it make you feel? (positive reinforcement)\n* (1) Where was an opportunity missed to practice character quality X today? (awareness) and (2) imagine you would have done it, how would you have felt? (positive reinforcement)\n* (1) How can I embody/apply/practice character quality X tomorrow? (future application) and (2) how will you feel if you do this? \n   \nJournaling can be a great way of creating more awareness around something you want to cultivate and using positive reinforcement to reward desirable behavior. You can create a simple journal in your note-taking software or buy a physical one.\nCombine personal or professional projects with character projects - If you want to cultivate more courage, then you do projects that are out of your comfort zone and make you feel afraid. However, avoid the panic zone. More specifically, this could mean setting up a website or traveling alone for a

## Experiment Variables
Generate the sets of experiments to be executed, from the specified experimental parameters

Please review the following docs for use of the filter: https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa

In [None]:
# Document-preparation variables
chunk_sizes = [50, 100, 200, 500, 1000]
chunk_overlaps = [0, 50, 100, 200]

# Embedding models and the index dimension used with each one (paired by position).
embeddings = [HuggingFaceEmbeddings()]
dimensions = [768]  # Same dimension used with HuggingFaceEmbeddings() in the cells above

# Q&A model variables
models = ["gpt-3.5-turbo"]
temperatures = [0, 0.2, 0.5, 0.8, 1.0]

# Vectordb independent variables
# 'similarity' is used for the plain similarity search because
# 'similarity_score_threshold' needs a score_threshold in search_kwargs,
# which experiment_exe does not pass.
search_types = ['mmr', 'similarity']
ks = [1, 4, 8]
lambda_mults = [0, 0.5, 1]
# filter_statement = {'filter': {'author':'paul rohde'}}  # This enables you to filter based on author or document
# filter_statement = {'filter': {'title':'tracking'}}

In [None]:
# Build the full grid of experiment settings, one dict per experiment.
experiments_list = []

for chunk_size in chunk_sizes:
    for chunk_overlap in chunk_overlaps:
        if chunk_overlap > chunk_size:
            # The text splitter rejects an overlap larger than the chunk size.
            continue
        for embedding, dimension in zip(embeddings, dimensions):
            for model in models:
                for temperature in temperatures:
                    for search_type in search_types:
                        # lambda_mult only applies to MMR. Other search types get a single
                        # placeholder (-1) instead of one identical experiment per lambda value.
                        lambda_options = lambda_mults if search_type == 'mmr' else [-1]
                        for k in ks:
                            for lambda_mult in lambda_options:
                                exp_var = {'chunk_size': chunk_size,
                                           'chunk_overlap': chunk_overlap,
                                           'embedding': embedding,
                                           'dimension': dimension,
                                           'model': model,
                                           'temperature': temperature,
                                           'search_type': search_type,
                                           'k': k,
                                           'lambda_mult': lambda_mult}
                                experiments_list.append(exp_var)

In [None]:
# Preview the first generated experiment configuration.
experiments_list[0]

In [None]:
# Total number of experiment configurations generated.
len(experiments_list)

## Perform Experiment

The following are proposed as experimental tasks:

1. Recreate: generate a why section for character development based on the why sections in the dataset
    - Access: does have access to the character dev article
2. GenerateFromScratch: generate a why section for the personal development system based on other why sections 
    - Access: doesn't have access to the personal dev system articles
3. ImproveV1: edit/improve this why section from v1 for example character development article
    - Access: doesn't have access to character dev articles
4. TranscriptToV1: write a what section based on an excerpt from the transcript for example character development article
    - Access: doesn't have access to the personal dev system articles
5. OthersToPaul: rewrite this blog article on habits in Paul's style
    - https://jamesclear.com/habits

In [None]:
# Prefix and running counter used to label each experiment, e.g. "Experiment_0".
experiment_template = "Experiment_"
counter = 0

In [None]:
# Run only the first three configurations as a trial of the experiment loop.
for experiment in experiments_list[:3]:
    # Execute experiment; every required parameter of experiment_exe is supplied,
    # including the retriever settings (search_type, k, lambda_mult).
    result, context = experiment_exe(query=QUERY,
                                     index_name=INDEX_NAME,
                                     dimension=experiment['dimension'],
                                     chunk_size=experiment['chunk_size'],
                                     chunk_overlap=experiment['chunk_overlap'],
                                     embedding=experiment['embedding'],
                                     model=experiment['model'],
                                     temperature=experiment['temperature'],
                                     search_type=experiment['search_type'],
                                     k=experiment['k'],
                                     lambda_mult=experiment['lambda_mult'])

    # Experiment name and counting
    experiment_name = experiment_template + str(counter)
    print(experiment_name)
    print(result)
    print("------------------------------------------------------------------------------------------------")
    counter = counter + 1
    time.sleep(1)  # Short pause between consecutive experiments