In [None]:
!pip -q install -U pandas numpy tqdm rank-bm25 sentencepiece
!pip -q install -U "transformers" "accelerate"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m119.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 3.0.0 which is incompatible.
dask-cudf-cu12 25.10.0 requires pandas<2.4.0dev0,>=2.0, but you have pandas 3.0.0 which is incompatible.
tensorflow 2.19.0 r

In [None]:
import torch

# Report whether a CUDA device is visible and, if so, which one (device 0).
has_cuda = torch.cuda.is_available()
print("cuda available:", has_cuda)
if has_cuda:
    print("gpu:", torch.cuda.get_device_name(0))

cuda available: True
gpu: NVIDIA A100-SXM4-40GB


Loading Data

In [None]:
from google.colab import files

# Opens Colab's upload widget; the returned dict is keyed by the uploaded filename.
uploaded = files.upload()

# Point CSV_PATH at the file that was just uploaded into the working directory
# instead of a machine-specific Windows path that cannot exist on the Colab VM.
# If the upload is skipped, fall back to the expected filename.
CSV_PATH = next(iter(uploaded), "patient_reviews_uci_drugscom.csv")

Saving patient_reviews_uci_drugscom.csv to patient_reviews_uci_drugscom.csv


In [None]:
import pandas as pd
import numpy as np
import re

# Read the raw reviews and confirm the one column everything downstream needs.
df = pd.read_csv("patient_reviews_uci_drugscom.csv")
column_names = df.columns.tolist()
print("Columns:", column_names)
assert "review_text" in column_names, "CSV must contain a review_text column"

# Give every row a sequential integer id when the CSV does not provide one.
if "review_id" not in column_names:
    df["review_id"] = np.arange(len(df), dtype=np.int64)

def clean_text(t: str) -> str:
    """Return ``t`` as a string with whitespace normalized.

    The value is coerced with ``str()``, every run of whitespace is collapsed
    into a single space, and leading/trailing whitespace is removed.
    """
    return re.sub(r"\s+", " ", str(t)).strip()

# Normalize whitespace in every review, then keep only reviews that are longer
# than 30 characters after cleaning.
normalized_reviews = df["review_text"].astype(str).map(clean_text)
df["review_text"] = normalized_reviews
has_enough_text = normalized_reviews.str.len() > 30
df = df.loc[has_enough_text].reset_index(drop=True)

print("Loaded reviews:", len(df))
df.head(2)

Columns: ['drug', 'condition', 'rating', 'date', 'usefulcount', 'review_text']
Loaded reviews: 212428


Unnamed: 0,drug,condition,rating,date,usefulcount,review_text,review_id
0,Valsartan,Left Ventricular Dysfunction,9.0,"May 20, 2012",27,"""It has no side effect, I take it in combinati...",0
1,Guanfacine,ADHD,8.0,"April 27, 2010",192,"""My son is halfway through his fourth week of ...",1


Chunking Strategy

In [None]:
from tqdm import tqdm

def chunk_by_chars(text: str, chunk_size: int = 900, overlap: int = 180,
                   min_chunk_chars: int = 40) -> list:
    """Split ``text`` into overlapping, fixed-width character chunks.

    The text is first normalized with ``clean_text``. Text that fits within
    ``chunk_size`` is returned as a single chunk, regardless of its length.
    Longer text is cut into windows of ``chunk_size`` characters, each window
    starting ``chunk_size - overlap`` characters after the previous one.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.
        min_chunk_chars: Windows whose stripped length is not greater than
            this value are dropped (only applies when the text is split).

    Returns:
        A list of chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check the window would never advance and the loop would not
            terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text = clean_text(text)
    if len(text) <= chunk_size:
        return [text]

    # Guaranteed positive by the validation above.
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = min(len(text), start + chunk_size)
        chunk = text[start:end].strip()
        if len(chunk) > min_chunk_chars:
            chunks.append(chunk)
        # Stop once a window reaches the end of the text; later windows would
        # only repeat the tail.
        if end == len(text):
            break
    return chunks

# Expand each review into one record per chunk; every record carries the
# review's metadata alongside the chunk text.
rows = []
for _, review in tqdm(df.iterrows(), total=len(df), desc="Chunking"):
    chunks = chunk_by_chars(review["review_text"], chunk_size=900, overlap=180)
    rows.extend(
        {
            "review_id": int(review["review_id"]),
            "chunk_index": int(position),
            # Fall back to a "drugname" column when "drug" is absent.
            "drug": review.get("drug", review.get("drugname", None)),
            "condition": review.get("condition", None),
            "rating": review.get("rating", None),
            "date": review.get("date", None),
            "usefulcount": review.get("usefulcount", None),
            "text": piece,
        }
        for position, piece in enumerate(chunks)
    )

chunks_df = pd.DataFrame(rows)
# Sequential id across all chunks of all reviews.
chunks_df["chunk_id"] = np.arange(len(chunks_df), dtype=np.int64)

print("Total chunks:", len(chunks_df))
chunks_df.head(2)


Chunking: 100%|██████████| 212428/212428 [00:20<00:00, 10373.12it/s]


Total chunks: 213133


Unnamed: 0,review_id,chunk_index,drug,condition,rating,date,usefulcount,text,chunk_id
0,0,0,Valsartan,Left Ventricular Dysfunction,9.0,"May 20, 2012",27,"""It has no side effect, I take it in combinati...",0
1,1,0,Guanfacine,ADHD,8.0,"April 27, 2010",192,"""My son is halfway through his fourth week of ...",1


In [None]:
from rank_bm25 import BM25Okapi

def tokenize(text: str):
    """Lower-case ``text`` and return its tokens as a list of strings.

    A token is a maximal run of ASCII letters, digits, and apostrophes; every
    other character acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[A-Za-z0-9']+", lowered)]

# Tokenize every chunk, then build the BM25 index over the token lists.
corpus_tokens = list(map(tokenize, chunks_df["text"].tolist()))
bm25 = BM25Okapi(corpus_tokens)

print("BM25 ready.")

BM25 ready.


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os, pickle, json

# Destination folder on the mounted Drive; rename it to suit your own layout.
OUT_DIR = "/content/drive/MyDrive/uci_rag_chunks"
os.makedirs(OUT_DIR, exist_ok=True)

# 1) Chunk table
chunks_path = f"{OUT_DIR}/chunks.parquet"
chunks_df.to_parquet(chunks_path, index=False)

# 2) Tokenized corpus used to build the BM25 index
tokens_path = f"{OUT_DIR}/bm25_tokens.pkl"
with open(tokens_path, "wb") as tokens_file:
    pickle.dump(corpus_tokens, tokens_file)

# 3) Run metadata; chunk_size and overlap repeat the values passed to
#    chunk_by_chars when the chunks were built, so keep them in sync.
meta = {
    "n_chunks": int(len(chunks_df)),
    "chunk_size": 900,
    "overlap": 180,
    "tokenizer": "regex_word",
    "retriever": "bm25 + dense_on_the_fly"
}
meta_path = f"{OUT_DIR}/meta.json"
with open(meta_path, "w") as meta_file:
    json.dump(meta, meta_file, indent=2)

print("Saved to:", OUT_DIR)


Saved to: /content/drive/MyDrive/uci_rag_chunks
