## Load libraries

In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma

import os

## Load sources & chunking

In [2]:
# extract address and tenant from file names
def extract_metadata_from_filename(filename: str) -> dict:
    """Derive the property address and tenant names from a source file name.

    Recognised layouts (a single underscore stands for a space):

    * contract file:      ``<address>____<tenant>__<tenant>.txt``
    * general terms file: ``<address>.txt`` (no ``____`` separator)

    Args:
        filename: Base name of the source file, with or without the
            ``.txt`` extension.

    Returns:
        A dict with two string values: ``property_address`` and
        ``tenant_name``. For a contract file ``tenant_name`` holds the tenant
        names joined by ", "; for a general terms file it holds a fixed
        description of the file.
    """
    # Strip only a trailing ".txt"; str.replace() would also delete ".txt"
    # wherever it appears inside the name.
    extension = ".txt"
    stem = filename[:-len(extension)] if filename.endswith(extension) else filename

    parts = stem.split("____")
    address = parts[0].replace("_", " ")

    if len(parts) > 1:
        # Contract file: tenants are separated from each other by "__".
        tenant_names = [name.replace("_", " ") for name in parts[1].split("__")]
        tenant_names_string = ", ".join(tenant_names)
    else:
        # General terms file: no tenant section in the name.
        tenant_names_string = "This is a file containing general terms that apply to all tenancy agreements"

    return {"property_address": address, "tenant_name": tenant_names_string}
    

In [3]:
# Split each source file on newlines into chunks of roughly 1000 characters
# with a 200-character overlap.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

documents = []
source_file_path = "source_files_tenancy_agreements"
num_file = 0

# os.listdir() returns entries in arbitrary order; sorting keeps chunk indices
# (e.g. documents[85]) stable between runs and machines.
for filename in sorted(os.listdir(source_file_path)):
    if not filename.endswith(".txt"):
        continue

    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used — confirm the source files decode correctly on every machine.
    with open(os.path.join(source_file_path, filename), "r") as f:
        text = f.read()
    metadata = extract_metadata_from_filename(filename)

    # The header is the same for every chunk of a file, so build it once per
    # file. It is prepended to the chunk text so that address and tenant names
    # are part of the page content and not only of the metadata.
    # NOTE(review): "Retal" looks like a typo for "Rental"; left unchanged here
    # because it alters the text that gets embedded — fix deliberately.
    metadata_string = (
        "Retal property address: " + metadata["property_address"] + "\n"
        + "Tenant names: " + metadata["tenant_name"] + "\n"
    )

    for chunk in text_splitter.split_text(text):
        documents.append(Document(page_content=metadata_string + chunk, metadata=metadata))
    num_file += 1

print(f"Number of source files: {num_file}")
print(f"Number of chunks: {len(documents)}")

Created a chunk of size 1031, which is longer than the specified 1000


Number of source files: 6
Number of chunks: 89


In [4]:
# Spot-check the text of a single chunk. The index 85 is hard-coded, so this
# cell requires at least 86 chunks to have been produced above.
print(documents[85].page_content)

Retal property address: 4 Darleith Street
Tenant names: Kunkala Sandeep
DEPOSIT 
At the start date of the tenancy or before, a deposit of £725 will be paid by the Tenant to the Landlord. The Landlord will issue a receipt for the deposit to the Tenant. No interest shall be paid by the Landlord to the Tenant for the deposit.  
By law, the deposit amount cannot exceed the equivalent of two months’ rent and cannot include any premiums. For example, charging for an administration fee or taking a holding fee (regardless of whether or not the holding fee is refundable).  
The scheme administrator is My|deposits Scotland and 
their contact details are:
http://www.mydepositsscotland.co.uk/
info@mydepositsscotland.co.uk 
0333 321 9402
THE GUARANTOR


## Create vector store

In [5]:
# Build a Chroma collection from the chunked documents, using the embedding
# model below and the given persist directory.
# NOTE(review): a BGE embeddings wrapper is paired with a non-BGE MiniLM
# model — confirm this combination is intended.
persist_directory = "vector_db_tenancy_agreements"
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
print(f"persist_directory is: {persist_directory}")

# NOTE(review): the collection name is spelled "tenecy_agreements"; any code
# that opens this store must use the same spelling.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=persist_directory,
    collection_name="tenecy_agreements",
)

persist_directory is: vector_db_tenancy_agreements
