## Libraries Installation

### NLTK Installation

In [None]:
%pip install nltk

In [None]:
import nltk

# A bare nltk.download() launches the interactive downloader (a GUI window or a
# text menu waiting for input), which stalls "Restart Kernel -> Run All".
# Download a named collection non-interactively instead.
# NOTE(review): "popular" is assumed to cover the resources this project needs
# (tokenizers, stopwords, WordNet, taggers). Narrow it to explicit package ids,
# e.g. nltk.download("stopwords"), once the exact requirements are confirmed.
nltk.download("popular", quiet=True)

### ir_datasets Installation

In [None]:
%pip install ir_datasets

### rank_bm25 Installation

In [None]:
%pip install rank_bm25

### joblib Installation

In [None]:
%pip install joblib

### dill Installation

In [None]:
%pip install dill

### contractions Installation

In [None]:
%pip install contractions

### chromadb Installation

In [None]:
%pip install chromadb

### scikit-learn Installation

In [None]:
%pip install scikit-learn

### matplotlib Installation

In [None]:
%pip install matplotlib

### fastapi Installation

In [None]:
%pip install fastapi

## Apply ir_datasets fix for encoding

In [None]:
import importlib.util
import os
from pathlib import Path

# Locate ir_datasets/formats/tsv.py inside whichever environment this kernel is
# running in, instead of relying on one machine's absolute path.
# find_spec() only locates the package; it does not import it.
try:
    _ir_datasets_spec = importlib.util.find_spec("ir_datasets")
except (ImportError, ValueError):
    _ir_datasets_spec = None

if _ir_datasets_spec is not None and _ir_datasets_spec.origin:
    # For a regular package, origin points at <site-packages>/ir_datasets/__init__.py
    ir_datasets_path = str(Path(_ir_datasets_spec.origin).parent / "formats" / "tsv.py")
else:
    # Fallback: the original hard-coded location. Edit this if the package
    # cannot be discovered automatically.
    ir_datasets_path = r"C:\Users\FSOS\AppData\Local\Programs\Python\Python311\Lib\site-packages\ir_datasets\formats\tsv.py"

In [None]:
# Patch ir_datasets/formats/tsv.py so its text streams are opened with an
# explicit UTF-8 encoding instead of the platform default.
#
# The patch replaces file lines 25-28 (0-based slice 24:28). Before writing,
# the cell checks that those lines have the same statement structure as the
# replacement, so a different ir_datasets version (where the line numbers have
# shifted) is reported instead of being silently corrupted.

# Replacement for lines 25-28: same stream setup, plus encoding='utf-8'.
new_lines = [
    "            if isinstance(self.dlc, list):\n",
    "                self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream()), encoding='utf-8')\n",
    "            else:\n",
    "                self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream()), encoding='utf-8')\n"
]

if not os.path.exists(ir_datasets_path):
    # Report and fall through: calling exit() inside a notebook would shut
    # down the kernel rather than just stopping this cell.
    print(f"Error: File not found at {ir_datasets_path}")
else:
    # Read the whole file first and close it before (possibly) rewriting it.
    with open(ir_datasets_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    target_lines = lines[24:28]

    # "Shape" of a line = its stripped text up to the first "(", e.g.
    # "if isinstance", "self.stream = io.TextIOWrapper", "else:".
    expected_shape = [line.strip().split("(", 1)[0] for line in new_lines]
    actual_shape = [line.strip().split("(", 1)[0] for line in target_lines]

    if target_lines == new_lines:
        print("File encoding settings are already up to date; nothing to do.")
    elif actual_shape != expected_shape:
        print(
            f"Error: lines 25-28 of {ir_datasets_path} do not match the expected "
            "stream setup code; the file was left unchanged."
        )
    else:
        # Replace the lines (list indices are 0-based, so lines 25-28 are indices 24-27)
        lines[24:28] = new_lines

        # Write back the modified content
        with open(ir_datasets_path, 'w', encoding='utf-8') as file:
            file.writelines(lines)
        print("Successfully updated the file encoding settings.")

In [None]:
pip install matplotlib

In [None]:
import os
import joblib
import numpy as np
from pathlib import Path
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.utils import resample
import time
from sklearn.neighbors import NearestNeighbors # To help estimate eps for DBSCAN

# Load the dimensionality-reduced TF-IDF matrices, then sweep DBSCAN's eps on
# the SVD-reduced matrix:
#   1. estimate a starting eps from k-nearest-neighbour distances on a sample,
#   2. fit DBSCAN on the full matrix for several eps values around it,
#   3. score each fit (Silhouette / Davies-Bouldin) on a fixed sample of rows,
#   4. refit with the best-scoring eps and save the labels with joblib.

# --- Location of the dimensionality-reduced data ---
# The default is the original author's local checkout (despite the variable
# name, it is a local Windows path, not Google Drive). Set the
# IR_PROJECT_DATA_DIR environment variable to run on another machine.
google_drive_model_path = os.environ.get(
    "IR_PROJECT_DATA_DIR",
    "C:\\Users\\FSOS\\Documents\\Projects\\ir-project\\src\\data\\antique",
)
data_dir_path = Path(google_drive_model_path)

# --- Tunable settings ---
RANDOM_STATE = 42                               # seed for every sampling step
MAX_SAMPLE_SIZE = 10000                         # row cap for eps estimation and metrics
EPS_PERCENTILE = 90                             # percentile of k-distances used as starting eps
EPS_SCALE_FACTORS = (0.5, 0.75, 1.0, 1.25, 1.5)  # multipliers tested around the estimate

tfidf_matrix_reduced_svd_path = data_dir_path / "tfidf_matrix_reduced_svd.joblib"
tfidf_matrix_reduced_umap_path = data_dir_path / "tfidf_matrix_reduced_umap.joblib"  # For later use

tfidf_matrix_reduced_svd = None
tfidf_matrix_reduced_umap = None

print(f"--- Attempting to load dimensionality-reduced matrices from: {data_dir_path.resolve()} ---")

# Load SVD-reduced matrix (required by the DBSCAN section below)
if not tfidf_matrix_reduced_svd_path.exists():
    print(f"❌ SVD-reduced matrix NOT FOUND at {tfidf_matrix_reduced_svd_path.resolve()}. Please ensure TruncatedSVD was run successfully.")
else:
    try:
        print("\nAttempting joblib.load for SVD-reduced matrix...")
        tfidf_matrix_reduced_svd = joblib.load(tfidf_matrix_reduced_svd_path)
        print("✅ SVD-reduced matrix loaded successfully.")
    except Exception as e:
        print(f"❌ Critical Error loading SVD-reduced matrix: {e}.")

# Load UMAP-reduced matrix (optional, for later use)
if tfidf_matrix_reduced_umap_path.exists():
    try:
        print("\nAttempting joblib.load for UMAP-reduced matrix...")
        tfidf_matrix_reduced_umap = joblib.load(tfidf_matrix_reduced_umap_path)
        print("✅ UMAP-reduced matrix loaded successfully.")
    except Exception as e:
        print(f"❌ Critical Error loading UMAP-reduced matrix: {e}.")
else:
    print(f"⚠️ UMAP-reduced matrix NOT FOUND at {tfidf_matrix_reduced_umap_path.resolve()}. Skipping UMAP-based clustering for now.")


print("-" * 50)

# --- Apply DBSCAN to SVD-reduced data ---
# Result containers are created up front so they exist whichever branch runs.
silhouette_scores = []
davies_bouldin_scores = []
evaluated_dbscan_data = []
best_dbscan_score = -1.0  # compared with ">" below, so a run must score above -1.0 to be selected
best_dbscan_eps = None
best_dbscan_model_params = None

if tfidf_matrix_reduced_svd is not None and tfidf_matrix_reduced_svd.shape[0] > 0:
    print("\n--- Applying DBSCAN to SVD-reduced data ---")
    n_documents = tfidf_matrix_reduced_svd.shape[0]

    # DBSCAN requires parameter tuning (eps and min_samples).
    # min_samples is set from the number of dimensions (2 * n_dimensions), never below 2.
    min_samples_dbscan = max(2, 2 * tfidf_matrix_reduced_svd.shape[1])

    try:
        print(f"  Estimating eps for DBSCAN using NearestNeighbors (min_samples={min_samples_dbscan})...")
        # Use a sample for NearestNeighbors to speed things up.
        sample_size_nn = min(MAX_SAMPLE_SIZE, n_documents)
        if n_documents > sample_size_nn:
            print(f"  Using a sample of {sample_size_nn} for NearestNeighbors.")
            # resample() returns the resampled array itself when it is given a
            # single array, so there is no tuple to unpack here.
            tfidf_matrix_sample_svd_nn = resample(
                tfidf_matrix_reduced_svd,
                n_samples=sample_size_nn,
                replace=False,
                random_state=RANDOM_STATE,
            )
        else:
            tfidf_matrix_sample_svd_nn = tfidf_matrix_reduced_svd

        if tfidf_matrix_sample_svd_nn.shape[0] > min_samples_dbscan:
            # The query points are the fitted points, so each point is its own
            # first neighbour; column (min_samples - 1) is therefore the
            # distance to the (min_samples - 1)-th other point.
            nn = NearestNeighbors(n_neighbors=min_samples_dbscan).fit(tfidf_matrix_sample_svd_nn)
            distances, _ = nn.kneighbors(tfidf_matrix_sample_svd_nn)
            k_distances = np.sort(distances[:, min_samples_dbscan - 1])
            # NOTE(review): these distances come from a sample, which is
            # sparser than the full matrix DBSCAN is fitted on, so the
            # estimate may be on the large side. The factors below test
            # smaller values as well.
            estimated_eps = float(np.percentile(k_distances, EPS_PERCENTILE))

            print(f"  Estimated starting eps for DBSCAN: {estimated_eps:.4f}")

            # Test a few eps values around the estimate; keep positive values only.
            eps_values_to_test = [estimated_eps * factor for factor in EPS_SCALE_FACTORS]
            eps_values_to_test = [eps for eps in eps_values_to_test if eps > 0]
            print(f"  Testing DBSCAN with min_samples={min_samples_dbscan} and eps values: {eps_values_to_test}")

            # Pick the rows used for Silhouette/Davies-Bouldin once. Sampling
            # row indices (not rows) keeps the sample aligned with the labels
            # of every DBSCAN fit, and avoids the all-pairs distance
            # computation that scoring the full matrix would need.
            sample_size_metrics = min(MAX_SAMPLE_SIZE, n_documents)
            if n_documents > sample_size_metrics:
                print(f"  Sampling {sample_size_metrics} documents for DBSCAN metrics calculation.")
                metrics_sample_indices = resample(
                    np.arange(n_documents),
                    n_samples=sample_size_metrics,
                    replace=False,
                    random_state=RANDOM_STATE,
                )
            else:
                metrics_sample_indices = np.arange(n_documents)
            tfidf_matrix_sample_svd_metrics = tfidf_matrix_reduced_svd[metrics_sample_indices]

            for eps in eps_values_to_test:
                iteration_start_time = time.time()
                print(f"\n--- Testing DBSCAN with eps = {eps:.4f}, min_samples = {min_samples_dbscan} --- (Starting at {time.ctime()})")

                # Apply DBSCAN to the full SVD-reduced data
                dbscan = DBSCAN(eps=eps, min_samples=min_samples_dbscan)
                dbscan_labels = dbscan.fit_predict(tfidf_matrix_reduced_svd)

                # Label -1 marks noise and is not counted as a cluster.
                n_clusters_dbscan = int(np.sum(np.unique(dbscan_labels) != -1))
                n_noise_points = int(np.sum(dbscan_labels == -1))

                print(f"  ✅ DBSCAN fitted. Found {n_clusters_dbscan} clusters and {n_noise_points} noise points.")

                current_silhouette = -1.0
                current_davies_bouldin = float('inf')

                # Score on the sampled rows only, with noise points removed:
                # neither metric treats -1 specially, so leaving noise in
                # would score it as if it were one ordinary cluster.
                # Caveat: excluding noise can favour small eps values that
                # label many points as noise; compare 'n_noise' in
                # evaluated_dbscan_data alongside the scores.
                sampled_labels = dbscan_labels[metrics_sample_indices]
                clustered_mask = sampled_labels != -1
                labels_for_metrics = sampled_labels[clustered_mask]
                n_sampled_clusters = np.unique(labels_for_metrics).size

                # Both metrics need at least 2 clusters and fewer clusters than points.
                if 2 <= n_sampled_clusters < labels_for_metrics.shape[0]:
                    data_for_metrics = tfidf_matrix_sample_svd_metrics[clustered_mask]
                    current_silhouette = float(silhouette_score(data_for_metrics, labels_for_metrics))
                    current_davies_bouldin = float(davies_bouldin_score(data_for_metrics, labels_for_metrics))
                else:
                    print(f"  ⚠️ Skipping Silhouette/Davies-Bouldin for eps={eps:.4f}: Not enough clusters found ({n_clusters_dbscan}) or sample size too small.")

                silhouette_scores.append(current_silhouette)
                davies_bouldin_scores.append(current_davies_bouldin)

                print(f"  ✅ Silhouette Score: {current_silhouette:.4f}")
                print(f"  ✅ Davies-Bouldin Index: {current_davies_bouldin:.4f}")

                # Update best parameters based on Silhouette score (higher is better).
                # Only the parameters are kept, not the fitted model, to save memory.
                if current_silhouette > best_dbscan_score:
                    best_dbscan_score = current_silhouette
                    best_dbscan_eps = eps
                    best_dbscan_model_params = {'eps': eps, 'min_samples': min_samples_dbscan}

                evaluated_dbscan_data.append({
                    'eps': eps,
                    'min_samples': min_samples_dbscan,
                    'n_clusters': n_clusters_dbscan,
                    'n_noise': n_noise_points,
                    'silhouette': current_silhouette,
                    'davies_bouldin': current_davies_bouldin,
                })

                elapsed_time = time.time() - iteration_start_time
                print(f"--- DBSCAN (eps={eps:.4f}) processing complete in {elapsed_time:.2f} seconds ---")
                print("-" * 20)

            print("\n--- DBSCAN Optimal Parameter Selection Results (on SVD Data) ---")

            # --- Apply DBSCAN with the best parameters and save the labels ---
            # best_dbscan_eps stays None when no eps value could be scored, so
            # it must be checked before it is formatted with ":.4f".
            if best_dbscan_eps is not None:
                print(f"Best eps based on max Silhouette Score ({best_dbscan_score:.4f}): {best_dbscan_eps:.4f}")
                print("-" * 50)

                print(f"\n--- Applying final DBSCAN with best parameters (eps={best_dbscan_eps:.4f}, min_samples={min_samples_dbscan}) ---")
                final_dbscan_model = DBSCAN(eps=best_dbscan_eps, min_samples=min_samples_dbscan)
                final_dbscan_labels = final_dbscan_model.fit_predict(tfidf_matrix_reduced_svd)

                # Save the labels; dots in the eps value are replaced so the
                # file name contains a single extension separator.
                dbscan_labels_svd_path_cleaned = data_dir_path / f"document_cluster_labels_dbscan_svd_eps_{str(best_dbscan_eps).replace('.', '_')}.joblib"

                joblib.dump(final_dbscan_labels, dbscan_labels_svd_path_cleaned)
                print(f"✅ DBSCAN cluster labels saved to: {dbscan_labels_svd_path_cleaned.resolve()}")

                n_clusters_final = int(np.sum(np.unique(final_dbscan_labels) != -1))
                n_noise_final = int(np.sum(final_dbscan_labels == -1))
                print(f"Final DBSCAN found {n_clusters_final} clusters and {n_noise_final} noise points.")
                print("-" * 50)
            else:
                print("-" * 50)
                print("❌ Cannot apply final DBSCAN: No best eps found.")

        else:
            print(f"⚠️ Cannot estimate eps: Sample size ({tfidf_matrix_sample_svd_nn.shape[0]}) is not large enough compared to min_samples ({min_samples_dbscan}).")
            print("Consider reducing min_samples or using a smaller dataset.")

    except Exception as e:
        # Best-effort cell: report the failure and keep the notebook running.
        print(f"❌ An error occurred during DBSCAN processing: {e}")

else:
    print("❌ Cannot proceed with DBSCAN: SVD-reduced matrix is not available.")

Attempting to load TF-IDF matrix from: C:\Users\FSOS\Documents\Projects\ir-project\src\data\antique\tfidf_matrix.joblib
✅ Successfully loaded TF-IDF matrix.
TF-IDF Matrix shape (num_docs, vocab_size): (403666, 35560)

Calculating cumulative explained variance for n_components from 1 to 500...
  Calculated for n_components = 50


KeyboardInterrupt: 