## Dependencies


In [1]:
!pip install qdrant-client sentence-transformers -q rank-bm25 huggingface_hub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/378.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.5/378.5 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h

###Libraries

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import re
import spacy
import random
from google.colab import files
from sentence_transformers import SentenceTransformer, CrossEncoder
from huggingface_hub import InferenceClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer
import torch
import getpass
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, Batch, PayloadSchemaType
from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
from rank_bm25 import BM25Plus

In [3]:
# Set random seeds for reproducibility

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

###Crawling and Extracting Texts

In [4]:
# Function to create safe, unique filenames
def safe_filename(name):
    # Replace any non-alphanumeric character with underscore
    name = re.sub(r'[^\w]', '_', name)
    return name

# Function to clean text (remove citation brackets)
def clean_text(text):
    # Remove citation brackets like [1], [2], [3], etc.
    text = re.sub(r'\[\d+\]', '', text)
    return text.strip()

def clean_country(text):
    # 1. Remove citations
    text = re.sub(r'\[.*?\]', '', text)

    # 2. Remove parentheses and content: "Austria (alleged)" -> "Austria"
    text = re.sub(r'\(.*?\)', '', text)

    # 3. Fix (e.g., "UzbekistanKazakhstan" -> "Uzbekistan, Kazakhstan")
    # This regex finds a lowercase letter followed immediately by an uppercase letter
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ', ', text)

    # 4. Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 5. Fix messy commas (e.g., " , United States" -> "United States")
    text = re.sub(r'\s*,\s*', ', ', text) # Ensure ", " spacing
    text = text.strip(' ,')

    return text

# URLs
BASE_URL = "https://en.wikipedia.org"
LIST_URL = "https://en.wikipedia.org/wiki/List_of_serial_killers_by_number_of_victims"

# Custom headers to avoid Wikipedia blocks
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/130.0.0.0 Safari/537.36"
}

# Create folder for text files
os.makedirs("serial_killers_texts", exist_ok=True)

# Step 1: Get list page
resp = requests.get(LIST_URL, headers=HEADERS)
if resp.status_code != 200:
    raise ConnectionError(f"Failed to fetch page: {resp.status_code}")

soup = BeautifulSoup(resp.text, "html.parser")

# Step 2: Find all wikitable tables
tables = soup.find_all("table", class_="wikitable")
print(f"Found {len(tables)} tables on the page.")

if not tables:
    with open("debug_page.html", "w", encoding="utf-8") as f:
        f.write(resp.text)
    raise ValueError("No tables found.")

# Step 3: Extract table headers and rows (40 rows PER TABLE)
killers = []

for table_idx, table in enumerate(tables):
    print(f"\nProcessing Table {table_idx + 1}...")

    # Get table headers
    headers = []
    header_row = table.find("tr")
    if header_row:
        for th in header_row.find_all("th"):
            headers.append(th.get_text(strip=True))

    # If no headers found, use default
    if not headers:
        headers = ["Name", "Country", "Years active", "Proven victims", "Possible victims", "Notes"]

    # Get table rows (skip header row)
    table_rows = table.find_all("tr")[1:]

    # Counter for this table only
    table_count = 0

    for row in table_rows:
        if table_count >= 40:  # Limit to 30 rows per table
            print(f"  Reached 40 rows for Table {table_idx + 1}")
            break

        cells = row.find_all("td")
        if len(cells) < 1:
            continue

        # Extract name and link from first column
        name_cell = cells[0]
        link_tag = name_cell.find("a", href=True)
        if not link_tag:
            continue

        name = link_tag.get_text(strip=True)
        link = BASE_URL + link_tag["href"]

        # Extract row data based on headers
        row_data = {}
        row_data["Name"] = name
        row_data["Link"] = link
        row_data["Table"] = table_idx + 1  # Track which table this came from

        # Map cell data to headers (starting from index 1 since name is already extracted)
        for idx, cell in enumerate(cells):
            if idx < len(headers):
                header = headers[idx]
                raw_text = cell.get_text(separator=", ", strip=True)

                if "Country" in header:
                    cell_text = clean_country(raw_text)
                else:
                    cell_text = clean_text(raw_text)

                row_data[header] = cell_text

        killers.append(row_data)
        table_count += 1

    print(f"  Extracted {table_count} rows from Table {table_idx + 1}")

print(f"\n{'='*80}")
print(f"Total collected: {len(killers)} killer entries across {len(tables)} tables")
print(f"{'='*80}\n")

# Step 4: Scrape each killer's Wikipedia page and combine with table data
for idx, killer_data in enumerate(killers, 1):
    name = killer_data.get("Name", "Unknown")
    link = killer_data.get("Link", "")
    table_num = killer_data.get("Table", "?")

    print(f"[{idx}/{len(killers)}] Scraping: {name} (Table {table_num})")

    try:
        page_resp = requests.get(link, headers=HEADERS)
        if page_resp.status_code != 200:
            print(f"Failed to fetch {link}")
            continue

        page_soup = BeautifulSoup(page_resp.text, "html.parser")

        # Main content section
        content_div = page_soup.find("div", {"class": "mw-parser-output"})
        if not content_div:
            print(f"No content div found")
            continue

        # Collect only paragraphs (no headings)
        paragraphs = []
        for p in content_div.find_all("p"):
            text_par = clean_text(p.get_text().strip())
            # Filter out short paragraphs and navigation content
            if text_par and len(text_par) > 50:
                paragraphs.append(text_par)

        wiki_content = "\n\n".join(paragraphs)

        # Combine table data with Wikipedia content
        combined_text = []

        for key, value in killer_data.items():
          if key not in ("Link", "Table") and value:
            combined_text.append(f"{key}: {value}")


        # Add Wikipedia page content
        if wiki_content.strip():
            combined_text.append(wiki_content)

        final_text = "\n".join(combined_text)

        # Save to file only if content exists
        if final_text.strip():
            safe_name_file = safe_filename(name)
            file_path = os.path.join("serial_killers_texts", f"{safe_name_file}.txt")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(final_text)
            print(f"Saved {safe_name_file}.txt")
        else:
            print(f"No wikipedia content for {name}")

        time.sleep(1)  # polite delay

    except Exception as e:
        print(f"Error scraping {name}: {e}")

print("\n" + "="*80)
print("SCRAPING COMPLETED!")
print("="*80)
print(f"Files saved in 'serial_killers_texts' folder")
print(f"Total files: {len([f for f in os.listdir('serial_killers_texts') if f.endswith('.txt')])}")

Found 7 tables on the page.

Processing Table 1...
  Extracted 34 rows from Table 1

Processing Table 2...
  Reached 40 rows for Table 2
  Extracted 40 rows from Table 2

Processing Table 3...
  Reached 40 rows for Table 3
  Extracted 40 rows from Table 3

Processing Table 4...
  Reached 40 rows for Table 4
  Extracted 40 rows from Table 4

Processing Table 5...
  Reached 40 rows for Table 5
  Extracted 40 rows from Table 5

Processing Table 6...
  Reached 40 rows for Table 6
  Extracted 40 rows from Table 6

Processing Table 7...
  Reached 40 rows for Table 7
  Extracted 40 rows from Table 7

Total collected: 274 killer entries across 7 tables

[1/274] Scraping: Luis Garavito (Table 1)
Saved Luis_Garavito.txt
[2/274] Scraping: Mariam Soulakiotis (Table 1)
Saved Mariam_Soulakiotis.txt
[3/274] Scraping: Pedro López (Table 1)
Saved Pedro_López.txt
[4/274] Scraping: Javed Iqbal (Table 1)
Saved Javed_Iqbal.txt
[5/274] Scraping: Mikhail Popkov (Table 1)
Saved Mikhail_Popkov.txt
[6/274] Scra

In [5]:
# ------- Preprocessing steps -----------
folder = "serial_killers_texts"
data = []

for filename in os.listdir(folder):
    if filename.endswith(".txt"):
        name = os.path.splitext(filename)[0]  # remove .txt extension
        path = os.path.join(folder, filename)

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # Only include non-empty text
        if text.strip():
            # Parse the structured data
            lines = text.split('\n')

            # Extract table data (before the separator)
            table_data = {}
            wiki_content = []
            in_wiki_section = False

            for line in lines:
                if not in_wiki_section:
                    # Parse key-value pairs from table data
                    if ':' in line:
                        parts = line.split(':', 1)
                        if len(parts) == 2:
                            key = parts[0].strip()
                            value = parts[1].strip()
                            table_data[key] = value
                else:
                    wiki_content.append(line)

            # Create clean text without table number and separators
            clean_text_parts = []

            # Add table data (excluding Table number)
            for key, value in table_data.items():
                if key != "Table" and value:  # Exclude Table field
                    clean_text_parts.append(f"{key}: {value}")

            # Add wiki content (already extracted above)
            wiki_text = '\n'.join(wiki_content).strip()
            if wiki_text:
                clean_text_parts.append("\n" + wiki_text)

            clean_full_text = '\n'.join(clean_text_parts)


            # Combine into a single record
            record = {
                "name": name,
                "full_name": table_data.get("Name", name),
                "country": table_data.get("Country", ""),
                "years_active": table_data.get("Years active", ""),
                "proven_victims": table_data.get("Proven victims", ""),
                "possible_victims": table_data.get("Possible victims", ""),
                "notes": table_data.get("Notes", ""),
                "wiki_content": '\n'.join(wiki_content).strip(),
                "text": text
            }

            data.append(record)

# Create DataFrame
df = pd.DataFrame(data)

print("="*80)
print("DATAFRAME OVERVIEW")
print("="*80)
print(f"\nShape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Display first few rows (limited view)
print("\n" + "="*80)
print("FIRST 3 ENTRIES (Sample)")
print("="*80)
for idx, row in df.head(3).iterrows():
    print(f"\n{idx+1}. {row['full_name']}")
    print(f"   Country: {row['country']}")
    print(f"   Years Active: {row['years_active']}")
    print(f"   Proven Victims: {row['proven_victims']}")
    print(f"   Possible Victims: {row['possible_victims']}")
    print(f"   Notes (first 100 chars): {row['notes'][:100] if len(row['notes']) > 100 else row['notes']}...")

# Data quality checks
print("\n" + "="*80)
print("DATA QUALITY CHECKS")
print("="*80)

# Count rows where 'wiki_content' is empty or only whitespace
empty_wiki_count = df['wiki_content'].str.strip().eq('').sum()
print(f"Empty wiki content entries: {empty_wiki_count}")

# Count rows where 'notes' is empty
empty_notes_count = df['notes'].str.strip().eq('').sum()
print(f"Empty notes entries: {empty_notes_count}")

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")

# Save to CSV for easy inspection
df.to_csv("serial_killers_data.csv", index=False, encoding="utf-8")
print("\nData saved to 'serial_killers_data.csv'")

DATAFRAME OVERVIEW

Shape: (274, 9)
Columns: ['name', 'full_name', 'country', 'years_active', 'proven_victims', 'possible_victims', 'notes', 'wiki_content', 'text']

FIRST 3 ENTRIES (Sample)

1. Niels Högel
   Country: Germany
   Years Active: 1999–2005
   Proven Victims: 85+
   Possible Victims: 300, [, 415, ]
   Notes (first 100 chars): Nurse who was sentenced to life imprisonment for the murder of more than 85 people., [, 416, ], [, 4...

2. Viktor Sayenko and Igor Suprunyuk
   Country: Ukraine
   Years Active: 2007
   Proven Victims: 21
   Possible Victims: 
   Notes (first 100 chars): "The Dnepropetrovsk Maniacs". A pair of 19-year-olds who, over the course of less than a month, atta...

3. Kimberly Clark Saenz
   Country: United States
   Years Active: 2008
   Proven Victims: 5
   Possible Victims: 
   Notes (first 100 chars): Killed five patients by using syringes to inject bleach into their dialysis lines....

DATA QUALITY CHECKS
Empty wiki content entries: 274
Empty notes entr

In [6]:
# See all the unique countries that exist on the dataset
unique_countries = df['country'].unique()

print(unique_countries)

['Germany' 'Ukraine' 'United States' 'Colombia' 'Canada' 'Japan' 'Italy'
 'Colombia, Ecuador, Brazil' 'Belgium, France' 'France' 'Soviet Union'
 'United States, Canada' 'China' 'United Kingdom'
 'United States, Austria, , France, , United Kingdom' 'Australia'
 'South Africa' 'Iraq' 'Zambia' 'Afghanistan' 'Thailand' 'Hungary' 'Iran'
 'Mexico' 'Soviet Union, Kazakhstan' 'Switzerland' 'Ecuador'
 'Ottoman Empire, Iraq, Iran' 'France, Germany, , Switzerland'
 'South Korea' 'Indonesia' 'Netherlands' 'Colombia, Ecuador, , Venezuela'
 'Norway' 'Soviet Union, Russia' 'Pakistan' 'Sweden' 'Austria' 'Russia'
 'Swaziland' 'Guatemala'
 'United Kingdom, Ireland, , West Germany, , Netherlands, , France'
 'Brazil' 'Ghana' 'India' 'Rwanda' 'Egypt' 'Greece'
 'Soviet Union, Ukraine' 'Poland'
 'Kingdom of Romania, Yugoslavia, Hungary' 'Venezuela' 'Finland' 'Morocco'
 'Latvia' 'Austria-Hungary' 'Costa Rica' 'Belgian Congo, Tanganyika'
 'Spain' 'West Germany' 'Soviet Union, Russia, Tajikistan'
 'Italy, West 

###Preprocessing

### The Preprocessing steps we will use are:
#### Lower Casing
#### Removing Non-Alphabets
#### Removing Punctuation
#### Removing Stopwords
#### Removing Short Words
#### Lemmatization


In [7]:
# Minima Cleaning(For Embeddings & LLM)
def minimal_clean(text):
    """
    Only fixes encoding/spacing. Keeps grammar/punctuation for the LLM.
    """
    text = str(text).replace('\xa0', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply this to the main 'text' column.
# This ensures 'df["text"]' remains readable English.
df['text'] = df['text'].apply(minimal_clean)
df['name'] = df['name'].apply(minimal_clean)

In [8]:
def clean_text_for_search(text):
    text = str(text).lower()

    # 1. Replace all non-alphanumeric, non-space characters with a single SPACE.
    # This separates numbers from dashes, commas, or periods (e.g., '25/05/2024' -> '25 05 2024').
    text = re.sub(r'[^a-z0-9 ]', ' ', text)

    # 2. Normalize whitespace and strip.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [9]:
# Load spaCy model for:
# 1. Stopword removal
# 2. Lemmatization
# 3. Tokenization
nlp = spacy.load("en_core_web_sm")

In [10]:
# ---- Base stopwords ----
base_stopwords = set(nlp.Defaults.stop_words)

# ---- Words to keep ----
preserve = {
    # Negation
    'not', 'no', "n't", 'never', 'without',
    # Quantity/Extent
    'only', 'less', 'more', 'most', 'almost', 'fully', 'partially',
    # Chronology/Sequence
    'first', 'last', 'before', 'after', 'once', 'while',
    # Comparison/Condition/Relationship
    'against', 'though', 'although', 'unless', 'except', 'whether', 'between', 'with'
}

# Remove preserved words from stopword list
custom_stopwords = base_stopwords - preserve



# ---- Stopword removal using nlp(text) ----
def remove_stopwords(text):
    doc = nlp(text)

    filtered = [
        token.text
        for token in doc
        if token.text.lower() not in custom_stopwords
    ]

    return " ".join(filtered)


In [11]:
def remove_short_words_keep_numbers(text, min_len=3):
    words = text.split()
    filtered = []
    for w in words:
        if w.isdigit():
            # keep any number, regardless of length
            filtered.append(w)
        elif len(w) >= min_len:
            filtered.append(w)
    return " ".join(filtered)

In [12]:
# Function to remove and replace the (+) sign on specific columns
def parse_int(x):
    if pd.isna(x):
        return None
    x = str(x).strip().replace("+", "")
    match = re.search(r"\d+", x)
    return int(match.group()) if match else None

df["proven_victims"] = df["proven_victims"].apply(parse_int)
df["possible_victims"] = df["possible_victims"].apply(parse_int)

#Chunk the text

In [13]:
def chunk_by_words(text, chunk_size=250, overlap=40):
    words = str(text).split()
    if not words:
        return []
    overlap = max(0, min(overlap, max(0, chunk_size - 1)))  # guard
    chunks = []
    start, n = 0, len(words)
    while start < n:
        end = min(n, start + chunk_size)
        chunks.append(" ".join(words[start:end]))
        if end == n:
            break
        start = end - overlap
    return chunks

# Chunk the natural text (df['text'])
df["chunks"] = df["text"].apply(lambda t: chunk_by_words(t, chunk_size=250, overlap=40))

# Show results
total_chunks = sum(len(ch) for ch in df["chunks"])
print(f"Created {total_chunks} total chunks")


Created 2254 total chunks


In [14]:
chunked_rows = []
for doc_idx, row in df.iterrows():
    for chunk_idx, chunk in enumerate(row["chunks"]):
        # 1. Clean
        sparse_text = clean_text_for_search(chunk)
        # 2. Stopwords
        sparse_text = remove_stopwords(sparse_text)
        # 3. Short words
        sparse_text = remove_short_words_keep_numbers(sparse_text, min_len=3)
        # 4. Lemmatization
        sparse_tokens = [token.lemma_ for token in nlp(sparse_text)]
        sparse_text_final = " ".join(sparse_tokens)

        chunked_rows.append({
            "doc_index": doc_idx,
            "name": row["name"],
            "chunk_index": chunk_idx,

            # This is Natural English (for Embeddings/LLM)
            "text": chunk,

            # This is for BM25/TF-IDF
            "text_bm25": sparse_text_final,

            "proven_victims": row["proven_victims"],
            "possible_victims": row["possible_victims"],
            "country": row["country"],
            "years_active": row["years_active"],
        })

# Create DataFrame and sort it
chunked_df = pd.DataFrame(chunked_rows)
chunked_df.to_csv('serial_killers_chunked.csv', index=False)
print(f"Saved {len(chunked_df)} chunks to CSV")

Saved 2254 chunks to CSV


In [15]:
# Inspect the columns
print(f"Columns: {list(chunked_df.columns)}")

Columns: ['doc_index', 'name', 'chunk_index', 'text', 'text_bm25', 'proven_victims', 'possible_victims', 'country', 'years_active']


In [16]:
# Optionally save the file to local environment
files.download("serial_killers_chunked.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##BM25+ Ranking Algorithm - Keyword Matching

## Prepare tokenized corpus for BM25+

In [17]:
# Prepare for lexical models BM25+ and TF-IDF
chunked_df["text_bm25"] = chunked_df["text_bm25"].astype(str)
chunked_df["tokens"] = chunked_df["text_bm25"].str.split()
corpus_tokens = chunked_df["tokens"].tolist()

# Build BM25+ model
bm25 = BM25Plus(corpus_tokens)
print(f"BM25+ corpus size: {len(corpus_tokens)} chunks")

BM25+ corpus size: 2254 chunks


## BM25+ search function (keyword matching)

In [18]:
def clean_query_for_bm25(query: str) -> list:
# Use functions on the query too
    q = clean_text_for_search(query)
    q = remove_stopwords(q)
    q = remove_short_words_keep_numbers(q, min_len=3)
    # Lemmatize query
    q_tokens = [token.lemma_ for token in nlp(q)]
    return q_tokens # returns list directly

def search_bm25(query: str, top_k: int = 5):
  q_tokens = clean_query_for_bm25(query)
  if not q_tokens:
    print("Empty query.")
    return pd.DataFrame()

  scores = np.array(bm25.get_scores(q_tokens))
  top_idx = np.argsort(scores)[::-1][:top_k]
  results = chunked_df.iloc[top_idx].copy()
  results["bm25_score"] = scores[top_idx]
  # Snippet comes from natural text
  results["snippet"] = results["text"].str[:250] + "..."
  return results.sort_values("bm25_score", ascending=False).reset_index(drop=True)

##TF-IDF Vector Space Search - Semantic

In [19]:
# TF-IDF on lemma chunks
# TF-IDF on the sparse column (text_bm25)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(chunked_df["text_bm25"])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

def search_tfidf(query: str, top_k: int = 5):
    # Clean query same as corpus
    q = clean_text_for_search(query)
    q = remove_stopwords(q)
    q = remove_short_words_keep_numbers(q, min_len=3)
    q = " ".join([token.lemma_ for token in nlp(q)])

    if not q:
        print("Empty query.")
        return pd.DataFrame()

    sims = cosine_similarity(tfidf_vectorizer.transform([q]), tfidf_matrix)[0]
    top_idx = np.argsort(sims)[::-1][:top_k]
    results = chunked_df.iloc[top_idx].copy()
    results["tfidf_score"] = sims[top_idx]
    # Snippet comes from natural text
    results["snippet"] = results["text"].str[:250] + "..."
    return results.sort_values("tfidf_score", ascending=False).reset_index(drop=True)


TF-IDF matrix shape: (2254, 200823)


## Embeddings with hugging face personal token for model all-mpnet-base-v2.

In [20]:
# Ask the user for their Hugging Face token to access gated models
os.environ['HF_TOKEN'] = getpass.getpass("Enter the value for HF_TOKEN: ")

Enter the value for HF_TOKEN: ··········


In [21]:
# Set the cache directory for Hugging Face models and tokenizers
os.environ["HF_HOME"] = "/content/my_huggingface_cache"

In [22]:
# Check for available GPUs and set the device accordingly
if torch.cuda.device_count()>0:
    my_device = "cuda"
    print(f"You have {torch.cuda.device_count()} GPUs available.")
else:
    my_device = "cpu"
    print("You have no GPUs available. Running on CPU.")

You have 1 GPUs available.


In [23]:
# Load a compact sentence embedding model with authorization and caching
embeddings_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', token=os.environ["HF_TOKEN"],
                                       cache_folder=os.environ["HF_HOME"], device=my_device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
# Use natural text (df['text']) for embeddings
my_sentences = chunked_df["text"].tolist()

In [25]:
# Encode document corpus with batches
my_embeddings = embeddings_model.encode(my_sentences, batch_size=32, show_progress_bar=True, convert_to_numpy=True)
print("My embeddings shape:", my_embeddings.shape)

Batches:   0%|          | 0/71 [00:00<?, ?it/s]

My embeddings shape: (2254, 768)


#Qdrant Storage of the Embeddings

In [None]:
# Connect to personal 
Cloud cluster

QDRANT_URL = "...."   # <---- put your QDRANT URL here !!!
API_KEY = "...."   # <---- Put your API key here !!!

client = QdrantClient(
    url=QDRANT_URL,
    api_key=API_KEY
)


# Create a collection in Qdrant

client.recreate_collection(
    collection_name="serial_killers",
    vectors_config=VectorParams(
        size=my_embeddings.shape[1],  # embedding dimension (768 for all-mpnet-base-v2)
        distance=Distance.COSINE
    )
)



# numeric index for proven_victims
client.create_payload_index(
    collection_name="serial_killers",
    field_name="proven_victims",
    field_schema=PayloadSchemaType.INTEGER
)

# numeric index for possible_victims
client.create_payload_index(
    collection_name="serial_killers",
    field_name="possible_victims",
    field_schema=PayloadSchemaType.INTEGER
)

# keyword index for country
client.create_payload_index(
    collection_name="serial_killers",
    field_name="country",
    field_schema=PayloadSchemaType.KEYWORD
)

# keyword index for name
client.create_payload_index(
    collection_name="serial_killers",
    field_name="name",
    field_schema=PayloadSchemaType.KEYWORD
)


# Prepare metadata payloads

payloads = []
for i, row in chunked_df.iterrows():
    payloads.append({
        "text": row["text"],
        "name": row["name"],
        "full_name": row.get("full_name"),
        "proven_victims": int(row["proven_victims"]) if pd.notna(row["proven_victims"]) else None,
        "possible_victims": int(row["possible_victims"]) if pd.notna(row["possible_victims"]) else None,
        "country": row.get("country"),
        "years_active": row.get("years_active"),
    })


# Create unique IDs for each chunk
ids = list(range(len(my_sentences)))


# Upload the embeddings at batches

batch_size = 500
for i in range(0, len(my_sentences), batch_size):
    batch_vectors = my_embeddings[i:i+batch_size]
    batch_payloads = payloads[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]

    client.upsert(
        collection_name="serial_killers",
        points=Batch(
            ids=batch_ids,
            vectors=batch_vectors,
            payloads=batch_payloads
        )
    )

print("All embeddings successfully uploaded to Qdrant Cloud")

  client.recreate_collection(


All embeddings successfully uploaded to Qdrant Cloud


In [27]:
def preprocess_query_for_embedding(query: str):
    # 1. Lowercase
    q = query.lower()

    # 2. Remove special characters (optional, but good for noise reduction)
    # This keeps only letters, numbers, and basic punctuation
    # removes commas so "1,000" becomes "1000")
    q = re.sub(r'[^a-z0-9\s\?\.]', '', q).replace(',', '')

    # 3. Strip extra whitespace
    q = " ".join(q.split())

    return q

In [28]:
def parse_numeric_filters(query: str):
    """
    Detects strict numeric conditions: Greater Than, Less Than, Equal.
    """
    q = query.lower().replace(',', '')

    # 1. Determine Target Field
    target_field = "proven_victims"
    possible_keywords = ["possible", "suspected", "estimated", "alleged", "potential"]
    if any(word in q for word in possible_keywords):
        target_field = "possible_victims"

    # 2. Strict Patterns
    patterns = [
        # GREATER THAN (>)
        (r"more than (\d+)", "gt"),
        (r"over (\d+)", "gt"),
        (r"above (\d+)", "gt"),

        # LESS THAN (<)
        (r"less than (\d+)", "lt"),
        (r"under (\d+)", "lt"),
        (r"below (\d+)", "lt"),

        # EQUAL (=)
        (r"exactly (\d+)", "eq"),
        (r"(\d+)\s+victims", "eq"),
        (r"(\d+)\s+possible", "eq"),
        # Catch just a number if it implies exact count
        # e.g. "killers with 20 victims" -> eq 20
        (r"with (\d+)$", "eq"),
    ]

    for pattern, operator in patterns:
        match = re.search(pattern, q)
        if match:
            value = int(match.group(1))
            return (target_field, operator, value)

    return None

In [29]:
# Only single, valid country names
COUNTRIES = [
    'Afghanistan', 'Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil',
    'Canada', 'Chile', 'China', 'Colombia', 'Czech Republic', 'Czechoslovakia',
    'Ecuador', 'France', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hungary',
    'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Japan',
    'Kazakhstan', 'Latvia', 'Mexico', 'Morocco', 'Netherlands', 'Norway',
    'Pakistan', 'Peru', 'Poland', 'Puerto Rico', 'Romania', 'Russia', 'Rwanda',
    'South Africa', 'South Korea', 'Soviet Union', 'Spain', 'Swaziland',
    'Sweden', 'Switzerland', 'Tajikistan', 'Thailand', 'Tunisia', 'Ukraine',
    'United Kingdom', 'United States', 'Uzbekistan', 'Venezuela', 'West Germany',
    'Yugoslavia', 'Zambia'
]

# Sort by length (descending) to match "South Africa" before "Africa" or "United States" before "States"
COUNTRIES.sort(key=len, reverse=True)

def extract_country(query: str):
    q = query.lower()
    for c in COUNTRIES:
        if c.lower() in q:
            return c
    return None

In [30]:
def build_qdrant_filter(query:str):
    """
    Returns a Qdrant Filter object based on the detected numeric/country filters.
    """

    numeric = parse_numeric_filters(query)
    country = extract_country(query)

    must_clauses = []

    # Country filter
    if country:
        must_clauses.append(
            FieldCondition(
                key="country",
                match=MatchValue(value=country)
            )
        )

    # Numeric victims filter
    if numeric:
        # Unpack the 3 values
        field_name, op, value = numeric

        if op == "gt":
            rng = Range(gt=value)
        elif op == "lt":
            rng = Range(lt=value)
        else:
            rng = Range(gte=value, lte=value)

        # Apply filter to the specific field detected (proven OR possible)
        must_clauses.append(
            FieldCondition(
                key=field_name,
                range=rng
            )
        )

    if not must_clauses:
        return None  # no filtering needed

    return Filter(must=must_clauses)

## Comparison BM25 vs TF-IDF vs Embeddings

In [31]:
# Sample Query
query_sample = "How did Samuel Little killed his victims?"

In [32]:
# Preprocess the sample query
query_sample = preprocess_query_for_embedding(query_sample)

In [33]:
# Embedding the preprocessed sample query
my_question_embedding = embeddings_model.encode([query_sample])[0].tolist()

In [34]:
def hybrid_qdrant_search(client, collection_name, query: str, query_embedding, top_k=40):
    """
    Performs semantic + structured search in Qdrant.
    Returns: raw Qdrant response object (to access .points) and formatted results
    """
    q_filter = build_qdrant_filter(query)

    # Get raw Qdrant response
    raw_results = client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=top_k,
        with_payload=True,
        query_filter=q_filter
    )

    # Also create formatted results
    formatted_results = []
    for point in raw_results.points:
      # Handle Country List for Display
      country_val = point.payload.get("country")

      # If it's a list (e.g., ['USA', 'Canada']), join it into a string
      if isinstance(country_val, list):
        country_display = ", ".join(country_val)
      else:
        # Fallback if it's None or already a string
        country_display = str(country_val) if country_val else "Unknown"

      formatted_results.append({
          "score": point.score,
          "name": point.payload.get("name"),
          "country": point.payload.get("country"),
          "proven_victims": point.payload.get("proven_victims"),
          "possible_victims": point.payload.get("possible_victims"),
          "text": point.payload.get("text")[:300] + "..." if point.payload.get("text") else "",
      })

    return raw_results, formatted_results

# Results
raw_embeddings_results, formatted_embeddings_results = hybrid_qdrant_search(
    client,
    "serial_killers",
    query_sample,
    my_question_embedding,
    top_k=40
)

# -------------------------------
# Display results - Comparison
print(f"Query: {query_sample}")
print("="*50)

print("\n--- Embeddings Results ---")
# Use the formatted results here so we see the clean country names
for i, res in enumerate(formatted_embeddings_results):
    print(f"[{i+1}] {res['name']} ({res['country']})")
    print(f"    Score: {res['score']:.3f}")
    print(f"    Victims: Proven {res['proven_victims']} / Possible {res['possible_victims']}")
    print(f"    Snippet: {res['text'][:100]}...")
    print()

# BM25 and TF-IDF Score Results
bm25_results = search_bm25(query_sample, top_k=40)
tfidf_results = search_tfidf(query_sample, top_k=40)

print("\n--- BM25 Results ---")
for i, row in bm25_results.iterrows():
    scaled = row['bm25_score'] / 100
    print(f"[{i+1}] {row['name']}  score={scaled:.3f}")
    print(row["snippet"])
    print()

print("\n--- TF-IDF Results ---")
for i, row in tfidf_results.iterrows():
    print(f"[{i+1}] {row['name']}  score={row['tfidf_score']:.3f}")
    print(row["snippet"])
    print()

Query: how did samuel little killed his victims?

--- Embeddings Results ---
[1] Samuel_Little (United States)
    Score: 0.611
    Victims: Proven 60 / Possible 93
    Snippet: Name: Samuel Little Country: United States Years active: 1970–2005 Proven victims: 60 Possible victi...

[2] Larry_Eyler (United States)
    Score: 0.518
    Victims: Proven 19 / Possible 23
    Snippet: murder, Little had suggested the two "do a scene", which he had understood to mean commit a murder f...

[3] Larry_Eyler (United States)
    Score: 0.510
    Victims: Proven 19 / Possible 23
    Snippet: several victims were subjected to varying degrees of sadomasochism before being stabbed and/or slash...

[4] Larry_Eyler (United States)
    Score: 0.492
    Victims: Proven 19 / Possible 23
    Snippet: been too fast for his liking. Upon cross-examination, one of Little's defense attorneys, Dennis Zahn...

[5] Larry_Eyler (United States)
    Score: 0.486
    Victims: Proven 19 / Possible 23
    Snippet: victim

###RE-RANKING with cross-encoder:

In [35]:
# Load cross-encoder reranker
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device=my_device)


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [36]:
# CROSS-ENCODER RERANKING

# 1. Prepare pairs for the Cross-Encoder
# We keep a reference to the 'points' so we don't lose the metadata
points = raw_embeddings_results.points
texts = [p.payload.get("text", "") for p in points]
pairs = [(query_sample, t) for t in texts]

# 2. Predict scores
if texts:
    scores = reranker.predict(pairs)

    # 3. Sort the point based on the scores (High to Low)
    # zip(points, scores) keeps the metadata attached to the score
    ranked_points = sorted(list(zip(points, scores)), key=lambda x: x[1], reverse=True)

    print("\n--- Cross-Encoder Reranking Results ---")
    nof_keep_sentences = 20

    # We will store the full top-k points here for the RAG step
    top_reranked_points = []

    seen_names = set() # Track names we have seen

    for point, score in ranked_points:

        name = point.payload.get("name", "Unknown")

        # SKIP if we have already seen this name
        if name in seen_names:
            continue

        # Add to set and final list
        seen_names.add(name)

        # Convert logit score to probability (0-1) for display
        normalized = 1 / (1 + np.exp(-score))

        c_val = point.payload.get("country", [])
        c_str = ", ".join(c_val) if isinstance(c_val, list) else str(c_val)
        text_preview = point.payload.get("text", "")[:100].replace("\n", " ")

        print(f"[{len(top_reranked_points) + 1}] {name} ({c_str}) | Score: {normalized:.3f}")
        print(f"  Snippet: {text_preview}...")
        print()

        top_reranked_points.append(point)

        # Stop once we have enough unique items
        if len(top_reranked_points) >= nof_keep_sentences:
            break

    print(f"Selected top {len(top_reranked_points)} unique results for RAG context.")

else:
    print("No results to rerank.")
    top_reranked_points = []


--- Cross-Encoder Reranking Results ---
[1] Samuel_Little (United States) | Score: 0.999
  Snippet: Name: Samuel Little Country: United States Years active: 1970–2005 Proven victims: 60 Possible victi...

[2] Cape_Town_Prostitute_Killer (South Africa) | Score: 0.995
  Snippet: he would order his victim to take her clothes off and then hit her in the face. Every victim had inj...

[3] Larry_Eyler (United States) | Score: 0.980
  Snippet: been too fast for his liking. Upon cross-examination, one of Little's defense attorneys, Dennis Zahn...

[4] Andrei_Chikatilo (Soviet Union) | Score: 0.938
  Snippet: September 1982, he killed a further five victims between the ages of 9 and 18. He established a patt...

[5] Fritz_Haarmann (Germany) | Score: 0.676
  Snippet: to Haarmann, he never actually intended to murder any of his victims, but would be seized by an irre...

[6] Clementine_Barnabet (United States) | Score: 0.217
  Snippet: of the victims. Every victim was slashed or bludgeoned with 

### RAG with Reranking

In [37]:
# COMPLETE RAG IMPLEMENTATION WITH HYBRID FILTERING & STRUCTURED METADATA

# ---------- SETUP: Qwen LLM Model ----------
print("Setting up Qwen model for RAG...")

# Initialize HuggingFace client
llm_model = "Qwen/Qwen2.5-72B-Instruct"
client_llm = InferenceClient(llm_model, token=os.getenv("HF_TOKEN"))

# System instructions
system_instructions = (
    "You are a factual assistant specializing in crime statistics. "
    "Use the structured data provided (name, proven_victims, country, text) to answer accurately. "
    "When discussing victim counts, cite the exact numbers from the 'proven_victims' field. "
    "Keep answers concise (150 words max). Start with 'Hello!'"
)

print(f"Qwen model initialized: {llm_model}")
print("="*60 + "\n")

# RETRIEVAL WITH HYBRID FILTERING
print("Performing hybrid search (semantic + numeric filtering)...")

# We use the raw_embeddings_results because we need the full Payload objects
raw_embeddings_results, formatted_embeddings_results = hybrid_qdrant_search(
    client,
    "serial_killers",
    query_sample,
    my_question_embedding,
    top_k=50  # Retrieve more, then rerank to top 10
)

print(f"Found {len(raw_embeddings_results.points)} results with hybrid filtering")
print("="*60 + "\n")

# ---------- CROSS-ENCODER RERANKING (With Deduplication) ----------
print("Reranking results with cross-encoder...")

# 1. Get the points and their texts
points = raw_embeddings_results.points
texts = [p.payload.get("text", "") for p in points]

if texts:
    # 2. Score the pairs
    pairs = [(query_sample, t) for t in texts]
    scores = reranker.predict(pairs)

    # 3. ZIP the POINT object with the SCORE
    ranked_points = sorted(list(zip(points, scores)), key=lambda x: x[1], reverse=True)

    print(f"\n--- Cross-Encoder Reranking Results (Top Unique) ---")

    # --- DEDUPLICATION LOGIC ---
    nof_keep_sentences = 10
    top_context_points = []
    seen_names = set()

    for point, score in ranked_points:
        # Get the name
        name = point.payload.get("name", "Unknown")

        # IF we have already seen this killer, SKIP this chunk
        if name in seen_names:
            continue

        # OTHERWISE, add to our final list
        seen_names.add(name)
        top_context_points.append(point)

        # Visualization (Optional)
        normalized = 1 / (1 + np.exp(-score))
        snippet = point.payload.get("text", "")[:100].replace("\n", " ")
        print(f"[{len(top_context_points)}] {name} | Score: {normalized:.3f}")
        print(f"    {snippet}...")
        print()

        # Stop once we have found 10 UNIQUE killers
        if len(top_context_points) >= nof_keep_sentences:
            break

    print(f"Selected {len(top_context_points)} unique killers for RAG context.")
else:
    top_context_points = []
    print("No texts found for reranking.")

print("="*60 + "\n")


# ---------- CREATE STRUCTURED CONTEXT WITH METADATA ----------
print("Creating structured context with metadata...\n")

context_parts = []

if top_context_points:
    for i, point in enumerate(top_context_points):
        payload = point.payload

        # 1. Handle Country List Display
        c_val = payload.get("country", "Unknown")
        if isinstance(c_val, list):
            country_str = ", ".join(c_val)
        else:
            country_str = str(c_val)

        # 2. Build the structured block
        context_part = f"[SERIAL KILLER {i+1}]\n"
        context_part += f"Name: {payload.get('name', 'Unknown')}\n"
        context_part += f"Country: {country_str}\n"
        context_part += f"Proven victims: {payload.get('proven_victims', 'Unknown')}\n"

        possible = payload.get('possible_victims')
        if possible:
            context_part += f"Possible victims: {possible}\n"

        context_part += f"Details: {payload.get('text', '')}\n"

        context_parts.append(context_part)

    context = "\n\n".join(context_parts)

    augmented_prompt = (
        f"STRUCTURED CRIME DATA:\n\n"
        f"{context}\n\n"
        f"QUESTION: {query_sample}\n\n"
        f"INSTRUCTIONS:\n"
        f"1. Answer based ONLY on the structured data above\n"
        f"2. When mentioning victim counts, use the exact 'Proven victims' numbers\n"
        f"3. Only mention the killer's name if the user's question specifically asks for name or names\n"
        f"4. Only mention the Country if the user's question specifically asks for location, origin, or limits by geography. Otherwise, do not mention the country\n"
        f"5. Only mention the victim counts (proven or possible) if the user's question specifically asks for numbers, statistics, ranking (e.g. 'top', 'worst'), or severity. Otherwise, do not mention the victim counts.\n"
        f"6. If the user asks for a summary, overview, or explanation of patterns: Provide a synthesized paragraph explaining the commonalities, behaviors, or trends found in the context. Do NOT just list names.\n"
        f"7. If the user asks for a definition: Define the term clearly based on the context provided.\n"
        f"8. Be concise and factual"
    )

    print("Created structured context.")
else:
    augmented_prompt = query_sample
    print("No context available, using query directly")

print("="*60 + "\n")

# ---------- RAG GENERATION ----------
max_tokens = 300
temperature = 0.7

messages = [
    {"role": "system", "content": system_instructions},
    {"role": "user", "content": augmented_prompt}
]

print("Generating response with Qwen...")
try:
    completion = client_llm.chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=False
    )

    answer = completion.choices[0].message["content"]

    print("="*60)
    print("🤖 QWEN RESPONSE:")
    print("="*60)
    print(answer)
    print("="*60 + "\n")

except Exception as e:
    print(f"Error: {e}")

Setting up Qwen model for RAG...
Qwen model initialized: Qwen/Qwen2.5-72B-Instruct

Performing hybrid search (semantic + numeric filtering)...
Found 50 results with hybrid filtering

Reranking results with cross-encoder...

--- Cross-Encoder Reranking Results (Top Unique) ---
[1] Samuel_Little | Score: 0.999
    Name: Samuel Little Country: United States Years active: 1970–2005 Proven victims: 60 Possible victi...

[2] Cape_Town_Prostitute_Killer | Score: 0.995
    he would order his victim to take her clothes off and then hit her in the face. Every victim had inj...

[3] Larry_Eyler | Score: 0.980
    been too fast for his liking. Upon cross-examination, one of Little's defense attorneys, Dennis Zahn...

[4] Andrei_Chikatilo | Score: 0.938
    September 1982, he killed a further five victims between the ages of 9 and 18. He established a patt...

[5] Fritz_Haarmann | Score: 0.676
    to Haarmann, he never actually intended to murder any of his victims, but would be seized by an irre..

In [38]:
# Re-initialize the Qdrant client
client = QdrantClient(url=QDRANT_URL, api_key=API_KEY)

# ---------- Evaluation loop ----------
evaluation_questions = [
    "Who is Mariam Soulakiotis?",
    "What crimes did Karl Denke commit?",
    "Tell me something about Carl Eugene Watts.",
    "What is known about Joachim Georg Kroll?",
    "Give me a general summary about serial killers who murder mostly young boys."
]

# Settings
nof_keep_sentences = 10
evaluation_results = []

print(f"Starting evaluation on {len(evaluation_questions)} questions...")

for question in evaluation_questions:
    print(f"\nProcessing: {question}")
    print('-'*60)

    # 1. PREPROCESS QUERY
    clean_q = minimal_clean(question)
    question_embedding = embeddings_model.encode([clean_q])[0].tolist()

    # 2. HYBRID SEARCH (Retrieval)
    # Fetch 50 candidates to ensure we have enough for deduplication
    raw_results, _ = hybrid_qdrant_search(
        client,
        "serial_killers",
        question,
        question_embedding,
        top_k=50
    )

    # 3. RERANKING & DEDUPLICATION
    points = raw_results.points
    texts = [p.payload.get("text", "") for p in points]

    top_context_points = []

    if texts:
        # Score pairs
        pairs = [(question, t) for t in texts]
        scores = reranker.predict(pairs)

        # Sort by Score
        ranked_points = sorted(list(zip(points, scores)), key=lambda x: x[1], reverse=True)

        # --- DEDUPLICATION LOGIC ---
        seen_names = set()

        for point, score in ranked_points:
            name = point.payload.get("name", "Unknown")

            # Skip duplicates
            if name in seen_names:
                continue

            seen_names.add(name)
            top_context_points.append(point)

            # Stop when we have enough unique killers
            if len(top_context_points) >= nof_keep_sentences:
                break
    else:
        print("   Warning: No results found.")

    # 4. BUILD STRUCTURED CONTEXT
    context_parts = []
    for i, point in enumerate(top_context_points):
        p = point.payload

        # Handle Country List
        c_val = p.get("country", "Unknown")
        c_str = ", ".join(c_val) if isinstance(c_val, list) else str(c_val)

        block = (
            f"[SERIAL KILLER {i+1}]\n"
            f"Name: {p.get('name', 'Unknown')}\n"
            f"Country: {c_str}\n"
            f"Proven victims: {p.get('proven_victims', 'Unknown')}\n"
            f"Details: {p.get('text', '')[:500]}...\n"
        )
        context_parts.append(block)

    context_str = "\n\n".join(context_parts)

    # 5. BUILD PROMPT
    if context_str:
        augmented_prompt = (
            f"STRUCTURED CRIME DATA:\n\n"
            f"{context_str}\n\n"
            f"QUESTION: {question}\n\n"
            f"INSTRUCTIONS:\n"
            f"1. Answer based ONLY on the structured data above\n"
            f"2. When mentioning victim counts, use the exact 'Proven victims' numbers\n"
            f"3. Only mention the killer's name if the user's question specifically asks for name or names\n"
            f"4. Only mention the Country if the user's question specifically asks for location, origin, or limits by geography. Otherwise, do not mention the country\n"
            f"5. Only mention the victim counts (proven or possible) if the user's question specifically asks for numbers, statistics, ranking (e.g. 'top', 'worst'), or severity. Otherwise, do not mention the victim counts.\n"
            f"6. If the user asks for a summary, overview, or explanation of patterns: Provide a synthesized paragraph explaining the commonalities, behaviors, or trends found in the context. Do NOT just list names.\n"
            f"7. If the user asks for a definition: Define the term clearly based on the context provided.\n"
            f"8. Be concise and factual"
        )
    else:
        augmented_prompt = question # Fallback if no context found

    messages = [
        {"role": "system", "content": "You are a factual assistant specializing in crime statistics."},
        {"role": "user", "content": augmented_prompt}
    ]

    # 6. GENERATE WITH QWEN
    try:
        completion = client_llm.chat_completion(
            messages=messages,
            max_tokens=300,
            temperature=0.7,
            stream=False
        )
        answer = completion.choices[0].message["content"]
        print(f"Answer: {answer}\n")
    except Exception as e:
        answer = f"Error: {e}"
        print(f"Generation Failed: {e}")

    # 7. LOG RESULTS
    evaluation_results.append({
        "Question": question,
        "Retrieved Context": context_str,
        "Generated Answer": answer
    })

# Save results
df_eval_reranked = pd.DataFrame(evaluation_results)
df_eval_reranked.to_csv("rag_evaluation_results_reranked_qwen.csv", index=False)
print("Exported results to rag_evaluation_results_reranked_qwen.csv")

Starting evaluation on 5 questions...

Processing: Who is Mariam Soulakiotis?
------------------------------------------------------------
Answer: Mariam Soulakiotis was an individual known as "Mother Rasputin," an abbess who was convicted of murdering 177 individuals. She operated an abbey where she fraudulently offered free tuberculosis treatment, but the treatments were ineffective and often harmful, leading to the deaths of many of her victims. The true extent of her crimes remains uncertain, with estimates suggesting that the number of victims could be much higher, possibly exceeding 500. Her modus operandi involved targeting wealthy women and children who sought refuge or medical help at her abbey.


Processing: What crimes did Karl Denke commit?
------------------------------------------------------------
Answer: Karl Denke killed and cannibalized poor travelers and homeless vagrants. He kept a ledger recording his murders, which included at least 31 names, confirming at least 3

In [39]:
# Optionally save the file to local environment
files.download("rag_evaluation_results_reranked_qwen.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##Streamlit Interface