In [1]:
# 📌 Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install faiss-cpu
!pip install sentence-transformers
!pip install pandas


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from to

In [3]:
import pandas as pd

# Load filtered complaints
file_path = '/content/drive/My Drive/filtered_complaints.csv'
df = pd.read_csv(file_path)
df = df[['cleaned_narrative', 'Product']]  # Include any other metadata like complaint ID if needed

df.dropna(subset=['cleaned_narrative'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()


Unnamed: 0,cleaned_narrative,Product
0,a xxxx xxxx card was opened under my name by a...,Credit card
1,dear cfpb i have a secured credit card with ci...,Credit card
2,i have a citi rewards cards the credit balance...,Credit card
3,bi am writing to dispute the following charges...,Credit card
4,although the account had been deemed closed i ...,Credit card


In [4]:
def chunk_text(text, chunk_size=300, chunk_overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += chunk_size - chunk_overlap
    return chunks


In [5]:
chunk_size = 300
chunk_overlap = 50

chunked_texts = []
metadatas = []

for idx, row in df.iterrows():
    chunks = chunk_text(row['cleaned_narrative'], chunk_size, chunk_overlap)
    for chunk in chunks:
        chunked_texts.append(chunk)
        metadatas.append({
            'product': row['Product'],
            'original_index': idx
        })

print(f"Total chunks created: {len(chunked_texts)}")


Total chunks created: 109713


In [6]:
from sentence_transformers import SentenceTransformer

# Use all-MiniLM-L6-v2 for speed and performance balance
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(chunked_texts, show_progress_bar=True, convert_to_numpy=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3429 [00:00<?, ?it/s]

In [8]:
import faiss
import os
import pickle

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save index
os.makedirs('vector_store', exist_ok=True)
faiss.write_index(index, 'vector_store/faiss_index.index')

# Save metadata separately
with open('vector_store/metadata.pkl', 'wb') as f:
    pickle.dump(metadatas, f)

print("Vector store and metadata saved in 'vector_store/' directory.")


Vector store and metadata saved in 'vector_store/' directory.
