In [1]:
import credentials

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

import pinecone

import os

  from tqdm.autonotebook import tqdm


# Experimentation

This notebook serves as an experimentation on the components involved in the completion of the objective: Generate text in the author's style of writing, making use of stored document information and the knowledge of the trained LLM model. 

The variable breakdown of the experiment is as follows: 

### Constant Variables
- Documents Uploaded (Documents contained are authored only by the intented mimic author)
    - Character Growth Manifesto
    - Tracking
    - Form Follows Function
    - Thriving Romantically
- Queries Asked (Targeting a specific section of document generation)

### Independent Variables
- Embedding Chunk Size
- Embedding Overlap
- Query Search Method
- Q&A LLM model
- Q&A Cohesion Method
- Generative Model Query

### Dependent Variables
- Generative output \
**Note:** Each output will have a stored set of variables describing the experimental parameters that produced that output. 

## Simple Generative Case

Create a simple example generative model ahead of creating experiment

In [2]:
INDEX_NAME = "flourishing-humanity"
DIMENSION = 768
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
EMBEDDINGS = HuggingFaceEmbeddings()
QA_MODEL = "gpt-3.5-turbo"
TEMPERATURE = 0
QUERY = "What is the Character Growth manifesto?"

In [3]:
credentials.set_credentials()

In [5]:
pinecone.init(os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENV"))

if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(name=INDEX_NAME, dimension=DIMENSION)
    
index = pinecone.Index(INDEX_NAME)

In [6]:
def load_documents(file_path: str):
    loader = DirectoryLoader(file_path, 
                             glob="**/*.txt", 
                             loader_cls=TextLoader, 
                             show_progress=True)  # Directory uploader for txt documents
    documents = loader.load()  # Load documents
    return documents

In [7]:
path = "../kb/"
documents = load_documents(path)

100%|████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1645.63it/s]


In [8]:
text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(documents)

Created a chunk of size 1608, which is longer than the specified 1000
Created a chunk of size 1014, which is longer than the specified 1000
Created a chunk of size 1089, which is longer than the specified 1000
Created a chunk of size 1045, which is longer than the specified 1000
Created a chunk of size 1273, which is longer than the specified 1000
Created a chunk of size 1262, which is longer than the specified 1000
Created a chunk of size 2742, which is longer than the specified 1000
Created a chunk of size 1144, which is longer than the specified 1000
Created a chunk of size 1267, which is longer than the specified 1000
Created a chunk of size 1043, which is longer than the specified 1000
Created a chunk of size 1340, which is longer than the specified 1000
Created a chunk of size 2111, which is longer than the specified 1000
Created a chunk of size 1568, which is longer than the specified 1000
Created a chunk of size 1930, which is longer than the specified 1000
Created a chunk of s

In [9]:
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)

In [10]:
vectordb = Pinecone.from_documents(docs, EMBEDDINGS, index_name=INDEX_NAME)

In [11]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [12]:
llm = ChatOpenAI(model_name=QA_MODEL, temperature=TEMPERATURE)

In [13]:
qa = ConversationalRetrievalChain.from_llm(llm, retriever=vectordb.as_retriever(), memory=memory)

In [14]:
result = qa({"question": QUERY})

In [15]:
result["answer"]

'The Character Growth Manifesto is a comprehensive guide to personal growth and self-improvement. It provides actionable strategies and practical advice to help individuals build a stronger, more resilient character. The manifesto explores the foundations of character, the importance of character development, and offers a systematic approach to character growth. It encourages readers to identify promising character traits, select essential traits for growth, cultivate those traits with intention, and reflect on progress to continually evolve and grow. The goal of the manifesto is to empower individuals to reach their full potential and contribute to a more compassionate and harmonious society.'