In [1]:
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.pgvector import PGVector
from langchain.schema import Document
import re
import os

In [2]:
# NOTE(review): blanking CURL_CA_BUNDLE is a commonly used workaround that makes
# HTTP clients skip TLS certificate verification - presumably needed here so the
# model download below works behind an intercepting proxy. Confirm it is intended;
# it weakens transport security for the rest of the kernel session.
os.environ['CURL_CA_BUNDLE'] = ''
# Module-level embedding model; create_pgvector_db passes this object to PGVector
# as its `embedding` argument.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [3]:
def clean_text(text):
    """Reduce extracted text to ASCII letters, digits and single spaces.

    Every character that is not a-z, A-Z, 0-9 or whitespace is dropped
    (punctuation, symbols and accented letters included). Each run of
    whitespace, newlines included, then collapses to one space and the
    result carries no leading or trailing whitespace.
    """
    # Step 1: discard anything that is neither alphanumeric nor whitespace.
    alnum_and_space = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Step 2: split on whitespace runs and re-join with single spaces, which
    # both collapses the runs and trims the ends.
    return ' '.join(alnum_and_space.split())

In [15]:
def load_and_process_document(file_path, chunk_size=1000, chunk_overlap=20):
    """Load one file, clean its text and split it into chunk Documents.

    Files whose name ends in '.pdf' (case-insensitive) are read with
    PyPDFLoader; everything else is read as UTF-8 text with TextLoader.
    The text of all loaded documents is cleaned with clean_text, joined
    with single spaces into one string and split with
    RecursiveCharacterTextSplitter.

    Args:
        file_path: Path of the file to ingest.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            text splitter.

    Returns:
        A list of Document objects, one per chunk, each with metadata
        {"source": file_path}. On any error an explanatory message is
        printed and an empty list is returned, so one bad file does not
        abort a batch run.
    """
    try:
        # The loader is built inside the try block on purpose: construction
        # can itself fail (e.g. the PDF loader rejects a path that does not
        # exist), and that failure should be reported like any other
        # processing error instead of escaping to the caller.
        if file_path.lower().endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path=file_path, encoding='utf-8')

        documents = loader.load()
        print(f"Loaded {len(documents)} document(s) from {file_path}")

        # Clean each loaded document, then merge them into a single string so
        # chunk boundaries are not tied to the loader's document boundaries.
        cleaned_documents = [clean_text(doc.page_content) for doc in documents]
        full_text = " ".join(cleaned_documents)
        print(f"Text cleaned. Total length: {len(full_text)} characters")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        chunks = text_splitter.split_text(full_text)

        # Wrap every chunk in a Document that remembers which file it came from.
        processed_documents = [Document(page_content=chunk, metadata={"source": file_path}) for chunk in chunks]

        print(f"Processed into {len(processed_documents)} document chunks")
        return processed_documents
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back no chunks.
        print(f"Error processing {file_path}: {str(e)}")
        return []

In [18]:
def create_pgvector_db(db_name, collection_name, texts):
    """Embed documents and store them in a PGVector collection.

    Connection settings are read from the PGUSER, PGPASSWORD, PGHOST and
    PGPORT environment variables when they are set. The fallbacks are the
    values that used to be hard-coded here (postgres / 1234 / localhost /
    5432), so an unconfigured local setup keeps working; set the variables
    instead of editing credentials into the notebook.

    Args:
        db_name: Name of the PostgreSQL database to connect to.
        collection_name: Name of the PGVector collection to write into.
        texts: Documents passed to PGVector.from_documents.

    Returns:
        The store object returned by PGVector.from_documents.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # quote_plus keeps characters such as '@' or '/' in the credentials from
    # corrupting the URL; the default values are unaffected by it.
    user = quote_plus(os.environ.get("PGUSER", "postgres"))
    password = quote_plus(os.environ.get("PGPASSWORD", "1234"))
    host = os.environ.get("PGHOST", "localhost")
    port = os.environ.get("PGPORT", "5432")
    connection_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}"

    # Uses the module-level `embeddings` model to embed the documents.
    db = PGVector.from_documents(
        embedding=embeddings,
        documents=texts,
        collection_name=collection_name,
        connection_string=connection_string
    )
    print(f"Successfully created {db_name}")
    return db

In [7]:
# PDF files to ingest, in processing order. The names are relative paths, so
# they are resolved against the kernel's current working directory.
files = [
    "rich_poor_dad.pdf",
    "total-money.pdf",
    "your-money.pdf",
    "millionaire_fastlane.pdf",
    "the-psychology-money.pdf",
    "the-intelligent-investor.pdf",
    "fast-and-slow.pdf",
    "principles-life.pdf",
    "teach-you-to-be-rich.pdf",
    "the-little-book.pdf",
    "the-millionaire.pdf",
]

In [8]:
# One list of chunk Documents per entry in `files`, in the same order
# (a file that failed to process contributes an empty list).
processed_texts = list(map(load_and_process_document, files))

Loaded 232 document(s) from rich_poor_dad.pdf
Text cleaned. Total length: 339797 characters
Processed into 347 document chunks
Loaded 251 document(s) from total-money.pdf
Text cleaned. Total length: 420906 characters
Processed into 430 document chunks
Loaded 448 document(s) from your-money.pdf
Text cleaned. Total length: 658806 characters


Ignoring wrong pointing object 8 0 (offset 0)


Processed into 673 document chunks
Loaded 341 document(s) from millionaire_fastlane.pdf
Text cleaned. Total length: 723674 characters
Processed into 739 document chunks
Loaded 214 document(s) from the-psychology-money.pdf
Text cleaned. Total length: 298103 characters
Processed into 305 document chunks
Loaded 629 document(s) from the-intelligent-investor.pdf
Text cleaned. Total length: 1144209 characters
Processed into 1168 document chunks
Loaded 683 document(s) from fast-and-slow.pdf
Text cleaned. Total length: 1132383 characters
Processed into 1156 document chunks
Loaded 538 document(s) from principles-life.pdf
Text cleaned. Total length: 900788 characters
Processed into 920 document chunks
Loaded 508 document(s) from teach-you-to-be-rich.pdf
Text cleaned. Total length: 638397 characters
Processed into 652 document chunks
Loaded 185 document(s) from the-little-book.pdf
Text cleaned. Total length: 264804 characters
Processed into 271 document chunks
Loaded 327 document(s) from the-mill

Ignoring wrong pointing object 8 0 (offset 0)


Processed into 549 document chunks
Loaded 341 document(s) from millionaire_fastlane.pdf
Text cleaned. Total length: 723674 characters
Processed into 739 document chunks


In [19]:
# Store each book's chunks in a database and collection named after its file
# stem (e.g. 'rich_poor_dad.pdf' -> 'rich_poor_dad'). Deriving the name once per
# book keeps the database and collection names identical; the hand-written calls
# this replaces had drifted (the last collection was misspelled
# "tthe-millionaire").
book_names = [os.path.splitext(os.path.basename(path))[0] for path in files]

# Every store stays reachable by book name. Creation order follows `files`.
vector_stores = {
    name: create_pgvector_db(name, name, chunks)
    for name, chunks in zip(book_names, processed_texts)
}

# Backward-compatible aliases. The earlier version of this cell reused db1-db5
# for a second batch of books, so these names ended up bound to the stores
# below; they are kept that way for any later cell that still refers to them.
# Prefer vector_stores[<book name>] in new code.
db1 = vector_stores["fast-and-slow"]
db2 = vector_stores["principles-life"]
db3 = vector_stores["teach-you-to-be-rich"]
db4 = vector_stores["the-little-book"]
db5 = vector_stores["the-millionaire"]
db6 = vector_stores["the-intelligent-investor"]

Successfully created rich_poor_dad
Successfully created total-money
Successfully created your-money
Successfully created millionaire_fastlane
Successfully created the-psychology-money
Successfully created the-intelligent-investor
Successfully created fast-and-slow
Successfully created principles-life
Successfully created teach-you-to-be-rich
Successfully created the-little-book
Successfully created the-millionaire


: 