In [6]:
import credentials
from langchain.vectorstores import Pinecone
import pinecone

# Embedding models
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import GPT4AllEmbeddings

# Peripheral
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# LLM Models
from langchain.chat_models import ChatOpenAI
from langchain.llms import GPT4All

# System
import time
import os
import re

# Data collections
import pandas as pd

# Experimentation

This notebook serves as an experimentation on the components involved in the completion of the objective: Generate text in the author's style of writing, making use of stored document information and the knowledge of the trained LLM model. 

The variable breakdown of the experiment is as follows: 

### Constant Variables
- Documents Uploaded (Documents contained are authored only by the intented mimic author)
    - Character Growth Manifesto
    - Tracking
    - Form Follows Function
    - Thriving Romantically
- Queries Asked (Targeting a specific section of document generation)


### Independent Variables
- Embedding Chunk Size
- Embedding Overlap
- Query Search Method *
- Q&A LLM model
- Q&A Cohesion Method *
- Generative Model Query
- Temperature

\* Indicates variables to still be included within the experimental setup.

### Vector Database Independent Variables
- Search type: Either Max Marginal Relevance Search (MMR) or Similarity search
- k: Number of documents to be returned
- lambda_mult: controls the diversity of results returned by the MMR algorithm, with 1 being minimum diversity and 0 being maximum. Defaults to 0.5.
- filter: Allows you to define a filter on what documents should be retrieved, based on the documents' metadata. This has no effect if the Vectorstore doesn't store any metadata.



### Dependent Variables
- Generative output 

**Note:** Each output will have a stored set of variables describing the experimental parameters that produced that output. 

## Simple Generative Case

Create a simple example generative model ahead of creating experiment

In [7]:
INDEX_NAME = "flourishing-humanity"
DIMENSION = 768
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
EMBEDDINGS = HuggingFaceEmbeddings()
QA_MODEL = "gpt-3.5-turbo"
TEMPERATURE = 0
QUERY = "Generate a daily routine based on the information in the Character Growth Manifesto"

In [8]:
credentials.set_credentials()

In [4]:
pinecone.init(os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENV"))

if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(name=INDEX_NAME, dimension=DIMENSION)
    
index = pinecone.Index(INDEX_NAME)

In [9]:
def load_documents(file_path: str):
    loader = DirectoryLoader(file_path, 
                             glob="**/*.txt", 
                             loader_cls=TextLoader, 
                             show_progress=True)  # Directory uploader for txt documents
    documents = loader.load()  # Load documents
    return documents

In [10]:
path = "../kb/"
documents = load_documents(path)

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 1184.08it/s]


In [7]:
text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(documents)

Created a chunk of size 1516, which is longer than the specified 1000
Created a chunk of size 1907, which is longer than the specified 1000
Created a chunk of size 1136, which is longer than the specified 1000
Created a chunk of size 1039, which is longer than the specified 1000
Created a chunk of size 1166, which is longer than the specified 1000
Created a chunk of size 1065, which is longer than the specified 1000
Created a chunk of size 1100, which is longer than the specified 1000
Created a chunk of size 1002, which is longer than the specified 1000
Created a chunk of size 1595, which is longer than the specified 1000
Created a chunk of size 1719, which is longer than the specified 1000
Created a chunk of size 1270, which is longer than the specified 1000
Created a chunk of size 1094, which is longer than the specified 1000
Created a chunk of size 1839, which is longer than the specified 1000
Created a chunk of size 1087, which is longer than the specified 1000
Created a chunk of s

Within the docs, the source is stored within the metadata of each doc. We want to alter this metadata to include both the complete title and the author in order to support author and non-author queries.

In [8]:
print(docs[0].metadata)

{'source': '../kb/form_follows_function-paul_rohde.txt'}


In [9]:
for doc in docs:
    doc_source = doc.metadata['source']
    hyphen_index = doc_source.find('-')  # Identify the position of the hyphen in the source
    title = doc_source[6:hyphen_index].replace('_', ' ')  # Extract title and remove underscores
    dot_index = doc_source.rfind('.')  # Perform find operation starting at the end to identify dot from .txt
    author = doc_source[hyphen_index + 1: dot_index]
    print(title + "|" + author)
    
    doc.metadata['title'] = title
    doc.metadata['author'] = author

form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follows function|paul_rohde
form follo

In [38]:
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)

In [11]:
vectordb = Pinecone.from_documents(docs, EMBEDDINGS, index_name=INDEX_NAME)  # Insert documents into vector db

In [39]:
llm = ChatOpenAI(model_name=QA_MODEL, temperature=TEMPERATURE)  # Instantiate llm

In [42]:
qa = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever(),
                                           return_source_documents=True)  # Create the QA retrieval chain

In [14]:
response = qa({"query": QUERY})  # Perform the query

In [15]:
print(response['result'])  # Print the response

In The Character Growth Manifesto, we not only provide readers with a comprehensive understanding of character development but also offer practical strategies to empower them in taking control of their lives. One such strategy is the creation of a daily routine that fosters personal growth and self-improvement. Here is a suggested daily routine that can help you on your journey to becoming the person you aspire to be:

1. Morning Reflection and Goal Setting:
Start your day by taking a few moments to reflect on your goals and aspirations. Write down your intentions for the day and visualize yourself achieving them. This practice will help you set a positive mindset and focus your energy on what truly matters to you.

2. Mindfulness or Meditation:
Engage in a mindfulness or meditation practice to cultivate a sense of calm and clarity. This can involve deep breathing exercises, guided meditation, or simply sitting in silence and observing your thoughts. By practicing mindfulness, you can 

In [16]:
response['source_documents']  # print the source documents

[Document(page_content='By addressing these common pain points, The Character Growth Manifesto provides readers with the knowledge and tools necessary to transform their lives and create lasting, positive change.', metadata={'author': 'paul_rohde', 'source': '../kb/character_growth_manifesto-paul_rohde.txt', 'title': 'character growth manifesto'}),
 Document(page_content="\ufeffThe Character Growth Manifesto - A Comprehensive Manual to the Systematic Development of Character\nPreface\nDiscover the transformative power of character development and unlock your full potential with The Character Growth Manifesto, a groundbreaking and comprehensive guide to personal growth and self-improvement. \n\n\nThe Character Growth Manifesto is designed for anyone committed to personal growth and self-improvement, regardless of age, background, or current stage in life. Whether you're a student, a professional, a parent, or simply someone seeking to better understand yourself and your potential, this 

In [17]:
pinecone.delete_index(INDEX_NAME)

## Method

Create the required methods to encapsulate this process, in order to abstract the experimental process. 
A trial run of the created experiment was performed. 

In [2]:
INDEX_NAME = "flourishing-humanity"  # Specify the index name

In [22]:
QUERY = "Further expand on the information in the character growth manifesto to empower readers. Specifically, generate a rough daily routine to help them take control of their lives."

In [23]:
credentials.set_credentials()  # Set up pinecone and openai credentials

In [24]:
def create_index(index_name: str, dimension: int):
    """This method creates a new index if it does not already exit, or it creates a connection to the existing index."""
    pinecone.init(os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENV"))

    if index_name not in pinecone.list_indexes():
        pinecone.create_index(name=index_name, dimension=dimension)

    index = pinecone.Index(index_name)
    return index

In [25]:
def load_documents(file_path: str):
    """Method loads documents from a filepath into a Document format for processing into vector form.
    Note only txt documents are accepted format for the experiments.
    """
    loader = DirectoryLoader(file_path, 
                             glob="**/*.txt", 
                             loader_cls=TextLoader, 
                             show_progress=True)  # Directory uploader for txt documents
    documents = loader.load()  # Load documents
    return documents


def prep_docs(documents, chunk_size, chunk_overlap):
    """Method chunks the retrieved documents to serve as inputs to the embedding model."""
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs


def modify_metadata(docs):
    """Method modifies the document metadata based on the document names, in order to create the metadata fields author and title."""
    for doc in docs:
        doc_source = doc.metadata['source']
        hyphen_index = doc_source.find('-')  # Identify the position of the hyphen in the source
        title = doc_source[6:hyphen_index].replace('_', ' ')  # Extract title and remove underscores
        dot_index = doc_source.rfind('.')  # Perform find operation starting at the end to identify dot from .txt
        author = doc_source[hyphen_index + 1: dot_index].replace('_', ' ')  # Extract the author section

        doc.metadata['title'] = title
        doc.metadata['author'] = author
    return docs

In [26]:
def initialize_pinecone():
    """Initialize a connection to the specific pinecone index. This connection allows the upsertion of vectors"""
    pinecone.init(
        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
        environment=os.getenv("PINECONE_ENV"),  # next to api key in console
    )

In [27]:
def experiment_exe(query: str, index_name: str, dimension: int, chunk_size: int, chunk_overlap: int, embedding, model , search_type: str, k: int, lambda_mult: float):
    """Method performs a single experiment: 
    It creates a vector database (pinecone) from the specified document.
    It performs a QA retrieval and text generation based on the search query
    It returns the generative output and context.
    It deletes the exisitng vector database and sleeps for a set period ahead of the subsequent experiment.
    """
    index = create_index(index_name, dimension)   # Create index
    
    path = "../kb/"  # Document path
    documents = load_documents(path)  # Load the documents
    docs = prep_docs(documents, chunk_size, chunk_overlap)  # Prepare the documents for upload
    docs = modify_metadata(docs)  # Modify each documents metadata to include title and author
    
    initialize_pinecone()  # Initialize pinecone connection
    
    vectordb = Pinecone.from_documents(docs, embedding, index_name=index_name)  # Populate the vector DB
    
    llm = model  # Change model variable name
    
    if search_type == 'mmr':  # MMR uses lambda_mult
        qa = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever(search_type=search_type, search_kwargs={'k': k, 'lambda_mult': lambda_mult}), return_source_documents=True)  # Create the QA retrieval chain
    else:  # Removal of the lambda_mult query
        qa = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever(search_type=search_type, search_kwargs={'k': k}), return_source_documents=True)  # Create the QA retrieval chain
        
    response = qa({"query": QUERY})  # Perform the query

    pinecone.delete_index(index_name)  # Delete index for the creation of the next
    time.sleep(2)  # Allow Pinecone chance to delete index
    return (response['result'], response['source_documents'])  # Return output and query

Perform an example experiment execution.

In [29]:
(result, context) = experiment_exe(query=QUERY, 
                   index_name=INDEX_NAME, 
                   dimension=768,
                   chunk_size=500,
                   chunk_overlap=0,
                   embedding=HuggingFaceEmbeddings(),
                   model=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5),
                   search_type='mmr',
                   k=8,
                   lambda_mult=0.5)

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 2252.88it/s]
Created a chunk of size 504, which is longer than the specified 500
Created a chunk of size 522, which is longer than the specified 500
Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 1516, which is longer than the specified 500
Created a chunk of size 843, which is longer than the specified 500
Created a chunk of size 599, which is longer than the specified 500
Created a chunk of size 857, which is longer than the specified 500
Created a chunk of size 1907, which is longer than the specified 500
Created a chunk of size 600, which is longer than the specified 500
Created a chunk of size 848, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 1136, which is longer than the specified 500
Created a chunk of size 738, which is longer than the specified 500
Created a chunk of size 1039, wh

Created a chunk of size 826, which is longer than the specified 500
Created a chunk of size 1206, which is longer than the specified 500
Created a chunk of size 1480, which is longer than the specified 500
Created a chunk of size 683, which is longer than the specified 500
Created a chunk of size 1153, which is longer than the specified 500
Created a chunk of size 722, which is longer than the specified 500
Created a chunk of size 875, which is longer than the specified 500
Created a chunk of size 541, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 635, which is longer than the specified 500
Created a chunk of size 764, which is longer than the specified 500
Created a chunk of size 737, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 944, which is longer than the specified 500
Created a chunk of size 919, which is longer 

Created a chunk of size 694, which is longer than the specified 500
Created a chunk of size 816, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 529, which is longer than the specified 500
Created a chunk of size 822, which is longer than the specified 500
Created a chunk of size 531, which is longer than the specified 500
Created a chunk of size 542, which is longer than the specified 500
Created a chunk of size 656, which is longer than the specified 500
Created a chunk of size 807, which is longer than the specified 500
Created a chunk of size 717, which is longer than the specified 500
Created a chunk of size 752, which is longer than the specified 500
Created a chunk of size 566, which is longer than the specified 500
Created a chunk of size 808, which is longer than the specified 500
Created a chunk of size 606, which is longer than the specified 500
Created a chunk of size 2111, which is longer th

The below is the result of the following query: 
"Further expand on the information in the character growth manifesto to empower readers. Specifically, generate a rough daily routine to help them take control of their lives."

In [30]:
print(result)

In addition to the framework provided in The Character Growth Manifesto, establishing a daily routine can be a powerful tool in taking control of your life and fostering personal growth. While everyone's routine will differ based on individual preferences and goals, here is a rough outline to help you get started:

1. Morning Ritual:
   - Wake up at a consistent time to establish a sense of structure and discipline.
   - Begin the day with gratitude and positive affirmations to set a positive mindset.
   - Engage in mindfulness or meditation practices to cultivate self-awareness and inner calm.
   - Dedicate time for physical exercise to boost energy levels and promote overall well-being.
   - Set intentions for the day and review your character development goals.

2. Work/Productivity Time:
   - Prioritize your most important tasks for the day and allocate focused time blocks to work on them.
   - Minimize distractions by turning off notifications and creating a dedicated workspace.
 

In [31]:
print(context[0])  # Print a single contextual finding from the vector db

page_content='By addressing these common pain points, The Character Growth Manifesto provides readers with the knowledge and tools necessary to transform their lives and create lasting, positive change.' metadata={'author': 'paul rohde', 'source': '../kb/character_growth_manifesto-paul_rohde.txt', 'title': 'character growth manifesto'}


## Experiment Variables
Generate the sets of experiments to be executed, from the specified experimental parameters

### Full Experimental Setup
A full set of found parameters is given below:

```Python
chunk_sizes = [50, 100, 200, 500, 1000]
chunk_overlaps = [0, 50, 100, 200]
embeddings = [HuggingFaceEmbeddings(), OpenAIEmbeddings(), GPT4AllEmbeddings()]
embeddings_names = ['HuggingFace', 'OpenAI Embedding', 'GPT4All']
dimensions = [768, 1536, 384]
models = [ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5), GPT4All(model='/home/travisdawson/.cache/gpt4all/ggml-all-MiniLM-L6-v2-f16.bin')]
model_names = ['gpt-3.5-turbo', 'GPT4All']

# Vector db independent variables
search_types = ['mmr', 'similarity_score_threshold']
ks = [1, 4, 8]
lambda_mults = [0, 0.5, 1]

# Filtering
# filter_statement = {'filter': {'author':'paul rhode'}}  # This enables you to filter based on author or document
# filter_statement = {'filter': {'title':'tracking'}}
```

Please be advised, the variable layout above generates an exponentially increasing number of experiments with each variable added. The experimental setup above will generate over 7000 experiments to be run. 
Please carefully select the experimental variables and ensure the number of experiments is as expected. 
The number of experiments is printed below. 

### Variable Meaning

#### General Variables
- **Chunk Sizes:** The number of characters per grouping of text. A document is broken down into a set of chunks containing its text.
- **Chunk Overlap:** The character overlap between chunks. This may help with contextual builidng, such that chunks are not seen as seperate entities but contains links to each other.
- **Embeddings:** The models that transform text chunks into vectors (text to vector models). These capture the language context and aim to encapsulate that within a numerical vector.
- **Embedding Names:** Shorthand names for the embedding models for simplification when the results are saved.
- **Dimensions:** The dimension of the vectors. This is linked to the embedding model used, such that the dimension matches the output vector size of the embedding model.
- **Models:** The LLM models used.
- **Model Names:** Shorthand model names for the simplification of the results when saved to file.


#### Vector Search Variables
- **Search Types:** The type of search within the vector database, based on the user query. The following link provides more in-depth understanding of aximal Marginal Relevance (MMR) vs Similarity Search: https://python.langchain.com/docs/modules/data_connection/vectorstores/
    - MMR: Search optimizes similarity to the user query while maintaining contextual diversity.
    - Similarity Search: Optimizes for vector similarity to the user query only.
- **K's:** The number of vectors (context pieces) to be returned by the vector database search. 
- **Lambda Multipliers:** This parameter relates to MMR search only (this has already been accounted for in the experiments). The parameter controls the amount of diversity included within the search. 1 includes minimal diversity through to 0 introducing maximum diversity.
- **filter:** The filter allows the user to filter the search results using metadata. Remember the available metadata includes: title, author, and source (path to document). Filter parameter should be included in `experiment_exe()` in the following manner `search_kwargs={'filter': {'title':'tracking'}}`. The below link contains an exampe of a filter being used.

The following link showcases the vector search results in a more in-depth manner: https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa



### Important
Please review the following docs for use of the filter: https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa

**Note:** The temperature variable is not included in all llm models. As such, when a model is capable of taking the parameter, please hardcode it into the `models` list as is done in `ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)`

**Note 2:** Please add your local path for the GPT4All model when running the experiments. Specifically this line `GPT4All(model='/home/travisdawson/.cache/gpt4all/ggml-all-MiniLM-L6-v2-f16.bin')`

**Note 3:** The open-source models such as GPT4All have context window limitations. By providing an increased k or a long query, or even repeated queries the model will begin blocking requests.

**Note 4:** The results are written to the file `notebooks/exp_results/result.csv`. The file is a csv file for easy visualization and analysis using Pandas.

The below variables are selected and discussed to form the initial experimental set

In [None]:
chunk_sizes = [100, 500, 1000]
chunk_overlaps = [0, 75, 200]
embeddings = [HuggingFaceEmbeddings(), OpenAIEmbeddings()]
embeddings_names = ['HuggingFace', 'OpenAI Embedding']
dimensions = [768, 1536, 384]
models = [ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)]
model_names = ['gpt-3.5-turbo']

# Vectordb independent variables
search_types = ['mmr', 'similarity_score_threshold']
ks = [1, 4, 8]
lambda_mults = [0, 0.5, 1]
# filter_statement = {'filter': {'author':'paul rohde'}}  # This enables you to filter based on author or document
# filter_statement = {'filter': {'title':'tracking'}}

**Testing variables**: These variables are used to conduct an experimental process test on a small experiment set to check that all processes work as they should. If you are aiming to run the experiment above, please ensure the below cell is not run as it will overwrite the variables.

In [14]:
chunk_sizes = [500]
chunk_overlaps = [0]
embeddings = [HuggingFaceEmbeddings()]
embeddings_names = ['HuggingFace', 'OpenAI Embeddings']
dimensions = [768, 1536]
models = [ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)]
model_names = ['gpt-3.5-turbo']

# Vectordb independent variables
search_types = ['mmr']
ks = [4]
lambda_mults = [0.5]

The below cell populates the `experiments_list` list with the set of experiments created from the parameter specification. Remember this generates all permuations of all variables.

In [26]:
experiments_list = []

for chunk_size in chunk_sizes:
    for chunk_overlap in chunk_overlaps:
        for embedding, embedding_name, dimension in zip(embeddings, embeddings_names, dimensions):
            for model, model_name in zip(models, model_names):
                for search_type in search_types:
                    for k in ks:
                        for lambda_mult in lambda_mults:
                            if search_type != 'mmr':
                                lambda_mult = -1
                            exp_var = {'chunk_size': chunk_size,
                                       'chunk_overlap': chunk_overlap,
                                       'embedding': embedding,
                                       'embedding_name': embedding_name,
                                       'dimension': dimension,
                                       'model': model,
                                       'model_name': model_name,
                                       'search_type': search_type,
                                       'k': k,
                                       'lambda_mult': lambda_mult}
                            experiments_list.append(exp_var)       

Print the number of experiments generated.

In [28]:
len(experiments_list)

324

## Perform Experiment

As a reminder the following are proposed as experimental tasks:

1. Recreate: generate a why section for character development based on the why sections in the dataset
    - Access: does have access to the character dev article
2. GenerateFromScratch: generate a why section for the personal development system based on other why sections 
    - Access: doesn't have access to the personal dev system articles
3. ImproveV1: edit/improve this why section from v1 for example character development article
    - Access: doesn't have access to character dev articles
4. TranscriptToV1: write a what section based on an excerpt from the transcript for example character development article
    - Access: doesn't have access to the personal dev system articles
5. OthersToPaul: rewrite this blog article on habits in Paul Style 
    - https://jamesclear.com/habits

Write a function to capture the experimental variables and results:

In [196]:
def capture(experiment: dict):
    """Method cleans the results and saves them to file.
    If the file already exists, the new result is appended to the end of the file. 
    If the file does not exist, a new file is created and the first result is written to it.
    """
    remove_distraction(experiment)  # Clean the results
    
    file_name = 'exp_results/results.csv'
    df = pd.DataFrame(experiment, index=[0])
    
    if not os.path.isfile(file_name):      # File does not exist
        df.to_csv(file_name, mode='w')
    else:  # File does exist
        df.to_csv(file_name, mode='a')
    
    
def remove_distraction(experiment: dict):
    """Method removes the full model and embedding model specifications as only the model names are required."""
    del experiment['model']
    del experiment['embedding']
    
    
def context_breakdown(experiment: dict, contexts: list):
    """Clean the result context such that the following two actions are performed:
    1. All Authors are extracted and written to the results. 
    2. The document titles are extracted and written to the results. 
    Note, there will be no duplicates of authors or document titles. 
    """
    doc_content = []
    authors = []
    documents = []
    
    for context in contexts:
        metadata = context.metadata
        doc_content.append(context.page_content)
        
        if metadata['author'] not in authors:  # Author not yet in authors
            authors.append(metadata['author'])
            
        if metadata['title'] not in documents:  # Title not yet in titles
            documents.append(metadata['title'])
            
    experiment['authors'] = strip_newlines(', '.join(authors)).strip()  # Join authors into a single string (comma seperated)
    experiment['source_documents'] = strip_newlines(', '.join(documents)).strip()  # Join doc titles into a single string (comma seperated)
    experiment['context'] = strip_newlines('| '.join(doc_content)).strip()  # Join context into a single string (seperated by |)
    return experiment


def strip_newlines(text: str):
    """Method strips newline characters from the document context"""
    pattern = r'^\d+\.\s*'
    clean_string = re.sub(pattern, '', text, flags=re.MULTILINE)
    clean_string = clean_string.replace('\n', '')
    return clean_string

In [197]:
experiment_template = "Experiment_"
counter = 0

In [198]:
for experiment in experiments_list:
    # Execute experiment
    result, context = experiment_exe(query=QUERY, 
                    index_name=INDEX_NAME, 
                   dimension=experiment['dimension'],
                   chunk_size=experiment['chunk_size'],
                   chunk_overlap=experiment['chunk_overlap'],
                   embedding=experiment['embedding'],
                   model=experiment['model'],
                    search_type=experiment['search_type'],
                   k=experiment['k'],
                   lambda_mult=experiment['lambda_mult'])  # Perform the experiment
    
    # Experiment name and counting
    experiment_name = experiment_template + str(counter)
    print(experiment_name)
    print(result)
    print("------------------------------------------------------------------------------------------------")
    
    
    # Record experiment
    experiment['title'] = experiment_name  # generate experiment name
    experiment['output'] = strip_newlines(result)  # Strip newlines from the output
    experiment = context_breakdown(experiment, context)  # Extract info from the context
    capture(experiment)  # Record the experiment results.
    
    counter = counter + 1
    time.sleep(1)
    

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 3949.44it/s]
Created a chunk of size 504, which is longer than the specified 500
Created a chunk of size 522, which is longer than the specified 500
Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 1516, which is longer than the specified 500
Created a chunk of size 843, which is longer than the specified 500
Created a chunk of size 599, which is longer than the specified 500
Created a chunk of size 857, which is longer than the specified 500
Created a chunk of size 1907, which is longer than the specified 500
Created a chunk of size 600, which is longer than the specified 500
Created a chunk of size 848, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 1136, which is longer than the specified 500
Created a chunk of size 738, which is longer than the specified 500
Created a chunk of size 1039, wh

Created a chunk of size 826, which is longer than the specified 500
Created a chunk of size 1206, which is longer than the specified 500
Created a chunk of size 1480, which is longer than the specified 500
Created a chunk of size 683, which is longer than the specified 500
Created a chunk of size 1153, which is longer than the specified 500
Created a chunk of size 722, which is longer than the specified 500
Created a chunk of size 875, which is longer than the specified 500
Created a chunk of size 541, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 635, which is longer than the specified 500
Created a chunk of size 764, which is longer than the specified 500
Created a chunk of size 737, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 944, which is longer than the specified 500
Created a chunk of size 919, which is longer 

Created a chunk of size 694, which is longer than the specified 500
Created a chunk of size 816, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 529, which is longer than the specified 500
Created a chunk of size 822, which is longer than the specified 500
Created a chunk of size 531, which is longer than the specified 500
Created a chunk of size 542, which is longer than the specified 500
Created a chunk of size 656, which is longer than the specified 500
Created a chunk of size 807, which is longer than the specified 500
Created a chunk of size 717, which is longer than the specified 500
Created a chunk of size 752, which is longer than the specified 500
Created a chunk of size 566, which is longer than the specified 500
Created a chunk of size 808, which is longer than the specified 500
Created a chunk of size 606, which is longer than the specified 500
Created a chunk of size 2111, which is longer th

Experiment_0
Developing a daily routine can be a powerful tool in taking control of your life and fostering character growth. Here is a rough outline of a daily routine that can empower you to make positive changes:

1. Morning Reflection: Start your day by reflecting on your character development goals. Ask yourself questions like, "What character qualities do I want to embody today?" and "How can I apply these qualities in my interactions and actions?"

2. Mindfulness Practice: Engage in a mindfulness practice such as meditation or deep breathing exercises. This can help you cultivate self-awareness, manage stress, and enhance your focus for the day ahead.

3. Journaling: Take a few minutes to journal about your thoughts, emotions, and experiences. Use the prompts provided in the manifesto to explore your character strengths and areas for improvement. Celebrate moments when you demonstrated desired character qualities and identify missed opportunities.

4. Goal Setting: Set specific 

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 2223.33it/s]
Created a chunk of size 504, which is longer than the specified 500
Created a chunk of size 522, which is longer than the specified 500
Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 1516, which is longer than the specified 500
Created a chunk of size 843, which is longer than the specified 500
Created a chunk of size 599, which is longer than the specified 500
Created a chunk of size 857, which is longer than the specified 500
Created a chunk of size 1907, which is longer than the specified 500
Created a chunk of size 600, which is longer than the specified 500
Created a chunk of size 848, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 1136, which is longer than the specified 500
Created a chunk of size 738, which is longer than the specified 500
Created a chunk of size 1039, wh

Created a chunk of size 826, which is longer than the specified 500
Created a chunk of size 1206, which is longer than the specified 500
Created a chunk of size 1480, which is longer than the specified 500
Created a chunk of size 683, which is longer than the specified 500
Created a chunk of size 1153, which is longer than the specified 500
Created a chunk of size 722, which is longer than the specified 500
Created a chunk of size 875, which is longer than the specified 500
Created a chunk of size 541, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 635, which is longer than the specified 500
Created a chunk of size 764, which is longer than the specified 500
Created a chunk of size 737, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 944, which is longer than the specified 500
Created a chunk of size 919, which is longer 

Created a chunk of size 694, which is longer than the specified 500
Created a chunk of size 816, which is longer than the specified 500
Created a chunk of size 655, which is longer than the specified 500
Created a chunk of size 529, which is longer than the specified 500
Created a chunk of size 822, which is longer than the specified 500
Created a chunk of size 531, which is longer than the specified 500
Created a chunk of size 542, which is longer than the specified 500
Created a chunk of size 656, which is longer than the specified 500
Created a chunk of size 807, which is longer than the specified 500
Created a chunk of size 717, which is longer than the specified 500
Created a chunk of size 752, which is longer than the specified 500
Created a chunk of size 566, which is longer than the specified 500
Created a chunk of size 808, which is longer than the specified 500
Created a chunk of size 606, which is longer than the specified 500
Created a chunk of size 2111, which is longer th

AuthenticationError: Incorrect API key provided: sk-TbNmk***************************************e23G. You can find your API key at https://platform.openai.com/account/api-keys.