In [20]:
import pandas as pd

# Load the knowledge-base export (decoded as latin1).
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
kb_df = pd.read_csv(r"C:\Users\Sana\PycharmProjects\gemini-chatbot\THM_kb_with_text.csv",encoding='latin1')

# Check first rows
print(kb_df.head())

# Missing cells are read as NaN, and "text" + NaN evaluates to NaN, which would
# turn the whole combined entry into a float. Blank the text columns first.
kb_text = kb_df[["Page title", "Article Body", "URL"]].fillna("").astype(str)

# Combine fields to create a single text per entry: title, body, and a
# trailing reference line only for rows that actually have a URL.
kb_df["content"] = kb_text["Page title"] + "\n" + kb_text["Article Body"]
kb_df["content"] += ("\nReference: " + kb_text["URL"]).where(kb_text["URL"].ne(""), "")

# Convert to a list
documents = kb_df["content"].tolist()
print(f"Total KB entries: {len(documents)}")


                                          Page title  \
0                         Can I Use Voucher Top-Up?    
1                        Can I Tether from My Phone?   
2                How Can I Change My Payment Method?   
3  Are there any charges for calling Talk Home's ...   
4              Talk Home Mobile Coverage in My Area    

                                                 URL  \
0  https://mobile-help.talk-home.com/support/solu...   
1  https://mobile-help.talk-home.com/support/solu...   
2  https://mobile-help.talk-home.com/support/solu...   
3  https://mobile-help.talk-home.com/support/solu...   
4  https://mobile-help.talk-home.com/support/solu...   

                                        Article Body  
0  Yes, you can use a voucher to top up your Talk...  
1  Talk Home Mobile does not allow tethering. If ...  
2  You can change your payment method by updating...  
3  There are no charges for contacting our custom...  
4  Use the coverage map to check Talk Home Mobile..

In [21]:
import pandas as pd

# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
kb_df = pd.read_csv(r"C:\Users\Sana\PycharmProjects\gemini-chatbot\THM_kb_with_text.csv", encoding='latin1')

# Blank out missing cells *before* casting to str: astype(str) on a NaN cell
# yields the literal text "nan", which would otherwise end up in the chunks.
text_columns = ["Page title", "Article Body", "URL"]
kb_df[text_columns] = kb_df[text_columns].fillna("").astype(str)

# Combine fields into content: title, body, and a reference line only when
# the row has a non-empty URL.
kb_df["content"] = kb_df["Page title"] + "\n" + kb_df["Article Body"]
kb_df["content"] += kb_df["URL"].apply(lambda x: f"\nReference: {x}" if x else "")

# Materialize as a plain list of str (one combined text per KB row).
documents = [str(doc) for doc in kb_df["content"]]

def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    The input is converted with ``str()`` and split on runs of whitespace, so
    original line breaks and repeated spaces are not preserved; the words of
    each chunk are re-joined with single spaces.

    Args:
        text: Value to split; anything that is not a ``str`` is stringified.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. A negative step
            would otherwise make ``range`` empty and silently drop the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = str(text).split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Flatten the per-document chunk lists into one list, keeping document order.
all_chunks = [piece for entry in documents for piece in chunk_text(entry)]

print(f"Total chunks created: {len(all_chunks)}")


Total chunks created: 130


In [22]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import faiss
import pickle

# Load open-source model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fail early with a clear message: with no chunks there is no second
# dimension to report and nothing to index.
if not all_chunks:
    raise ValueError("all_chunks is empty - nothing to embed; check the chunking step")

# Encode every chunk in one call so the model can batch the inputs instead of
# running a separate forward pass per chunk; the library draws its own
# progress bar.
embeddings = embed_model.encode(
    all_chunks,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Store as a float32 array, one row per chunk.
embeddings = np.asarray(embeddings, dtype="float32")
print(f"Generated {embeddings.shape[0]} embeddings of size {embeddings.shape[1]}")


Generating embeddings: 100%|██████████| 130/130 [00:03<00:00, 40.12it/s]

Generated 130 embeddings of size 384





In [23]:
# Build a flat L2 index whose width matches the embedding vectors.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"FAISS index has {index.ntotal} vectors")

# Persist the index and the chunk texts to the working directory.
# NOTE(review): presumably row i of the index corresponds to all_chunks[i] — confirm
# the two files are always written and loaded together.
faiss.write_index(index, "kb.index")
with open("kb_chunks.pkl", "wb") as chunk_file:
    pickle.dump(all_chunks, chunk_file)


FAISS index has 130 vectors
