In [1]:

# Import Libraries and Define Paths
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from huggingface_hub import snapshot_download
from tqdm import tqdm

import os

# --- Project-relative locations used by the rest of the notebook ---
# Both paths are relative to the notebook's working directory.
PROCESSED_DATA_PATH = '../data/processed/filtered_complaints.csv'  # CSV read by the data-loading cell
VECTOR_STORE_PATH = '../vector_store/db_faiss'  # location handed to FAISS save_local / load_local

# Create the folder that will contain the index ('../vector_store').
# exist_ok=True makes re-running this cell a no-op when the folder is already there.
_vector_store_parent = os.path.dirname(VECTOR_STORE_PATH)
os.makedirs(_vector_store_parent, exist_ok=True)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Import Libraries and Define Paths
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from huggingface_hub import snapshot_download
from tqdm import tqdm

import os

# NOTE(review): this cell is a verbatim copy of the notebook's first cell (same imports,
# same two paths, same makedirs call). Re-running it is harmless because the constants get
# the same values and makedirs uses exist_ok=True, but the cell can be deleted.

# --- Output index location and input CSV location ---
VECTOR_STORE_PATH = '../vector_store/db_faiss'
PROCESSED_DATA_PATH = '../data/processed/filtered_complaints.csv'

# The head of the split path is the folder that holds the index; create it if missing.
os.makedirs(os.path.split(VECTOR_STORE_PATH)[0], exist_ok=True)



In [3]:

# Import Libraries and Define Paths
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from huggingface_hub import snapshot_download
from tqdm import tqdm

import os

# NOTE(review): third copy of the notebook's first cell (identical imports and path setup).
# It only re-assigns the same values and re-checks the same folder, so it is safe to remove.

# --- Paths: processed complaints CSV (input) and FAISS index folder (output) ---
PROCESSED_DATA_PATH, VECTOR_STORE_PATH = (
    '../data/processed/filtered_complaints.csv',
    '../vector_store/db_faiss',
)

# Guarantee that the index's parent folder exists before anything tries to write into it.
os.makedirs(name=os.path.dirname(VECTOR_STORE_PATH), exist_ok=True)



In [4]:

# Load the Processed Data
#
# Reads the CSV at PROCESSED_DATA_PATH into `df` and drops rows whose
# 'cleaned_narrative' is NaN. If the file is missing, the hint is printed and the
# error is re-raised: swallowing it would leave `df` undefined (or stale from an
# earlier run), and every later cell would then fail in a less obvious way.
try:
    df = pd.read_csv(PROCESSED_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: The file was not found at {PROCESSED_DATA_PATH}.")
    print("Please ensure you have run Task 1 successfully.")
    raise
else:
    print("Processed data loaded successfully.")
    print(f"Dataset shape: {df.shape}")
    # Handle potential empty narratives that might have slipped through.
    # Re-assigning (instead of inplace=True) keeps the cell idempotent on re-run.
    df = df.dropna(subset=['cleaned_narrative'])
    print(f"Shape after dropping any remaining NaNs: {df.shape}")



Processed data loaded successfully.
Dataset shape: (363409, 8)
Shape after dropping any remaining NaNs: (363409, 8)


In [5]:

# Text Chunking
#
# Splits every complaint narrative in `df` into overlapping character chunks and wraps
# each chunk in a LangChain Document that carries the complaint's identifying metadata.
# Result: `docs`, a flat list of Document objects (one per chunk).

print("\n--- Starting Text Chunking ---")

# RecursiveCharacterTextSplitter tries to split on a hierarchy of separators
# (like "\n\n", "\n", " ", "") so related text stays together where possible.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # maximum size of each chunk (in characters)
    chunk_overlap=100  # number of characters shared between consecutive chunks
)

# Iterate over the needed columns in lockstep instead of df.iterrows():
# iterrows builds a pandas Series for every row, which is needlessly slow for
# hundreds of thousands of rows, while zipping the columns yields plain scalars.
complaint_rows = zip(
    df['cleaned_narrative'],
    df['Complaint ID'],
    df['Product'],
    df['Issue'],
    df['Company'],
    df['Date received'],
)

docs = []
for narrative, complaint_id, product, issue, company, date_received in tqdm(
    complaint_rows, total=len(df), desc="Chunking Narratives"
):
    for chunk in text_splitter.split_text(narrative):
        # A fresh metadata dict per chunk: the metadata is what lets a retrieved
        # chunk be traced back to its original complaint.
        docs.append(Document(
            page_content=chunk,
            metadata={
                'complaint_id': complaint_id,
                'product': product,
                'issue': issue,
                'company': company,
                'date_received': date_received
            }
        ))

print(f"\nTotal number of documents created after chunking: {len(docs)}")
if docs:
    print("Example of a chunked document:")
    print(docs[0])
else:
    # docs[0] would raise IndexError here; report the situation instead.
    print("No documents were created; check that 'cleaned_narrative' contains text.")





--- Starting Text Chunking ---


Chunking Narratives: 100%|██████████| 363409/363409 [03:36<00:00, 1676.07it/s]


Total number of documents created after chunking: 588437
Example of a chunked document:
page_content='i made the mistake of using my wellsfargo debit card to depsit funds into atm machine outside their branch i went into the branch and was told they couldnt help and had to phone the customer service for help i did this and was told i was helped gave all the info for the time terminal id aact s was able to find the transaction and give me this info he said the dispute would take a few days i waited a few days and got a letter stating my dispute was rejected i went back into and they said they never got the transaction' metadata={'complaint_id': 14061897, 'product': 'Savings Account', 'issue': 'Managing an account', 'company': 'WELLS FARGO & COMPANY', 'date_received': '2025-06-13'}





In [6]:
# Embedding Model Setup
#
# Defines `model_name` and `embeddings`, the embedding function that the vector-store
# cells pass to FAISS.

print("\n--- Setting up Embedding Model ---")

# 'all-MiniLM-L6-v2' (from the sentence-transformers collection on Hugging Face) was
# picked as a starting point: small and fast, with good general-purpose embeddings
# for semantic search.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# HuggingFaceEmbeddings is LangChain's wrapper around that pre-trained model.
# NOTE(review): the recorded output echoes this line as a warning (message text is cut
# off in the export) — presumably a LangChain deprecation notice for this import path;
# confirm before upgrading LangChain.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

print(f"Embedding model '{model_name}' loaded successfully.")




--- Setting up Embedding Model ---


  embeddings = HuggingFaceEmbeddings(model_name=model_name)


Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.


In [10]:
# Build the FAISS vector store from `docs` in fixed-size batches and save it to
# VECTOR_STORE_PATH. Defines `db` (the FAISS store) when there are documents to index.
print("\n--- Creating and Saving the Vector Store in Batches ---")

# Define the size of our batches
batch_size = 500  # A safe starting number. You can increase if you have a lot of RAM.

if not docs:
    print("Error: No documents to process. Please check the chunking step.")
elif os.path.exists(VECTOR_STORE_PATH) and not os.path.isdir(VECTOR_STORE_PATH):
    # Fail fast. The recorded run of this cell embedded for about 5.5 hours and only then
    # hit FileExistsError inside save_local for this very path.
    # NOTE(review): presumably a non-directory entry occupied the path — confirm.
    # Checking up front reports that situation before any embedding work is done.
    raise FileExistsError(
        f"Cannot save the vector store: '{VECTOR_STORE_PATH}' already exists and is not "
        "a directory. Remove or rename it before building the index."
    )
else:
    # --- Step 1: Create the initial vector store with the first batch ---
    print("Processing the first batch to initialize the vector store...")
    db = FAISS.from_documents(docs[:batch_size], embeddings)
    print("Initial vector store created.")

    # --- Step 2: Add the remaining documents batch by batch ---
    # The range starts where the first batch ended; the final slice may be shorter.
    for i in tqdm(range(batch_size, len(docs), batch_size), desc="Adding Batches to Vector Store"):
        db.add_documents(docs[i : i + batch_size])

    print("\nAll batches have been added to the vector store.")

    # --- Step 3: Save the final, complete vector store ---
    db.save_local(VECTOR_STORE_PATH)

    print(f"\n✅ Vector store created and saved successfully at: {VECTOR_STORE_PATH}")


--- Creating and Saving the Vector Store in Batches ---
Processing the first batch to initialize the vector store...
Initial vector store created.


Adding Batches to Vector Store: 100%|██████████| 1176/1176 [5:30:05<00:00, 16.84s/it] 



All batches have been added to the vector store.


FileExistsError: [WinError 183] Cannot create a file when that file already exists: '..\\vector_store\\db_faiss'

In [12]:
 # --- Step 3: Save the final, complete vector store ---
# Manual re-run of the save step only: the previous cell's recorded output ends in a
# FileExistsError raised after all batches had been added, so the index already held in
# `db` is written to VECTOR_STORE_PATH here without repeating the embedding loop.
db.save_local(VECTOR_STORE_PATH)

# Confirmation message; reached only if save_local did not raise.
print(f"\n✅ Vector store created and saved successfully at: {VECTOR_STORE_PATH}")


✅ Vector store created and saved successfully at: ../vector_store/db_faiss


In [None]:
            # Cell 6: (Optional) Test the Vector Store

print("\n--- Testing the Vector Store ---")

# Round-trip check: reload the saved index from disk (rebinding `db`) and run one query.
# allow_dangerous_deserialization=True is the explicit opt-in LangChain requires for
# loading a FAISS store; only use it on index files you created yourself.
db = FAISS.load_local(
    VECTOR_STORE_PATH,
    embeddings,
    allow_dangerous_deserialization=True,
)

query = "My credit card was charged for something I did not buy"
# k is the number of nearest chunks to return.
results = db.similarity_search(query, k=3)

# Header: the query, a blank line, then the results heading.
print(f"Query: '{query}'", "\nTop 3 similar documents found:", sep="\n")

# One report per hit: a few metadata fields plus the first 400 characters of the chunk.
for i, doc in enumerate(results):
    print(
        f"\n--- Result {i+1} ---",
        f"Product: {doc.metadata.get('product', 'N/A')}",
        f"Issue: {doc.metadata.get('issue', 'N/A')}",
        f"Source Complaint ID: {doc.metadata.get('complaint_id', 'N/A')}",
        f"Text Snippet: {doc.page_content[:400]}...",
        "--------------------",
        sep="\n",
    )


--- Testing the Vector Store ---
Query: 'My credit card was charged for something I did not buy'

Top 3 similar documents found:

--- Result 1 ---
Product: Savings Account
Issue: Problem with a lender or other company charging your account
Source Complaint ID: 6190201
Text Snippet: i was charged dollars for something that i did not buy...
--------------------

--- Result 2 ---
Product: Credit Card
Issue: Problem with a purchase shown on your statement
Source Complaint ID: 3117402
Text Snippet: my card was charged with something i did not purchase i contact the credit card company and they said they were looking into it in the mean time dont do anything after a couple of months of this happening several times the issued was resolved but the credit card was reported as late payment for 120 days i tried to have it fixed with the credit card company to remove that late payment but no respon...
--------------------

--- Result 3 ---
Product: Savings Account
Issue: Problem with a lender or 