# BERTopic Document Vectorization & Tag Evaluation

This notebook experiments with BERTopic for creating vector representations of documents from GCS bucket.
It evaluates whether assigned tags are correctly predicted based on document content.

## Setup:
- GCS Bucket: `gosexpert_categorize`
- Database: PostgreSQL with pgvector extension
- Split: 70% train / 30% test
- Text extraction: First couple pages of each document
- MLflow: Experiment tracking

## 1. Import Dependencies

In [1]:
# Standard library imports
import sys
import os
from pathlib import Path
from typing import List, Dict, Tuple, Any
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path().absolute().parent.parent
sys.path.insert(0, str(project_root))

# Data manipulation
import pandas as pd
import numpy as np

# PDF processing
import PyPDF2
from io import BytesIO

# BERTopic and embeddings
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Database
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

# GCS and MLflow
from gcs_bucket_interface import GCSBucketInterface
from mlflow_recorder import MLflowRecorder

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

print("All imports successful!")

ModuleNotFoundError: No module named 'pandas'

## 2. Configuration

In [None]:
# Experiment configuration
EXPERIMENT_NAME = "bertopic_tag_eval_v1"
BUCKET_NAME = "gosexpert_categorize"
TRAIN_TEST_SPLIT = 0.3  # 30% test, 70% train
RANDOM_STATE = 42
PAGES_TO_EXTRACT = 3  # Extract first 3 pages from each document

# BERTopic configuration
EMBEDDING_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"
MIN_TOPIC_SIZE = 10
N_NEIGHBORS = 15
N_COMPONENTS = 5

# Database configuration from .env
DB_CONFIG = {
    'host': '127.0.0.1',
    'port': 5433,
    'database': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASS')
}

print(f"Configuration loaded")
print(f"Database: {DB_CONFIG['database']}")
print(f"Embedding Model: {EMBEDDING_MODEL}")
print(f"Train/Test Split: {int((1-TRAIN_TEST_SPLIT)*100)}/{int(TRAIN_TEST_SPLIT*100)}")

## 3. Load Documents from GCS Bucket

In [None]:
# Initialize GCS interface
gcs_interface = GCSBucketInterface(bucket_name=BUCKET_NAME)

# List all files with metadata
print("Loading files from GCS bucket...")
all_files = gcs_interface.list()

print(f"Total files found: {len(all_files)}")

# Filter for PDF files and files with metadata tags
pdf_files_with_tags = []
for file_info in all_files:
    if file_info['name'].lower().endswith('.pdf') and file_info.get('metadata'):
        # Check if metadata contains tag information
        metadata = file_info['metadata']
        if any(key.startswith('tag') or 'category' in key.lower() for key in metadata.keys()):
            pdf_files_with_tags.append(file_info)

print(f"PDF files with tags: {len(pdf_files_with_tags)}")

# Create DataFrame for easier manipulation
df_files = pd.DataFrame([{
    'file_name': f['name'],
    'size': f['size'],
    'gcs_uri': f['gcs_uri'],
    'metadata': f['metadata'],
    'tags': [v for k, v in f['metadata'].items() if 'tag' in k.lower() or 'category' in k.lower()]
} for f in pdf_files_with_tags])

print(f"\nDataFrame shape: {df_files.shape}")
print(f"\nSample files:")
display(df_files.head())

## 4. Extract Text from PDF Documents

In [None]:
def extract_text_from_pdf_blob(blob_name: str, max_pages: int = PAGES_TO_EXTRACT) -> str:
    """
    Extract text from the first N pages of a PDF file in GCS.
    
    Args:
        blob_name: Name of the blob in GCS bucket
        max_pages: Maximum number of pages to extract
        
    Returns:
        Extracted text as string
    """
    try:
        # Get blob from bucket
        blob = gcs_interface.bucket.blob(blob_name)
        
        # Download as bytes
        pdf_bytes = blob.download_as_bytes()
        
        # Read PDF
        pdf_file = BytesIO(pdf_bytes)
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        
        # Extract text from first N pages
        text = ""
        num_pages = min(len(pdf_reader.pages), max_pages)
        
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n"
        
        return text.strip()
    
    except Exception as e:
        print(f"Error extracting text from {blob_name}: {e}")
        return ""

# Extract text from all documents
print("Extracting text from PDF documents...")
texts = []
valid_indices = []

for idx, row in df_files.iterrows():
    text = extract_text_from_pdf_blob(row['file_name'])
    if text:  # Only keep documents with successfully extracted text
        texts.append(text)
        valid_indices.append(idx)
    
    if (idx + 1) % 10 == 0:
        print(f"Processed {idx + 1}/{len(df_files)} documents...")

# Filter DataFrame to only include documents with extracted text
df_files = df_files.loc[valid_indices].reset_index(drop=True)
df_files['text'] = texts
df_files['text_length'] = df_files['text'].str.len()

print(f"\nSuccessfully extracted text from {len(df_files)} documents")
print(f"Average text length: {df_files['text_length'].mean():.0f} characters")
print(f"Median text length: {df_files['text_length'].median():.0f} characters")

# Display sample
print(f"\nSample text (first 500 chars):")
print(df_files['text'].iloc[0][:500])

## 5. Train/Test Split

In [None]:
# Prepare labels (extract primary tag from metadata)
def get_primary_tag(tags_list):
    """Get the first/primary tag from tags list."""
    if isinstance(tags_list, list) and len(tags_list) > 0:
        return tags_list[0]
    return "unknown"

df_files['primary_tag'] = df_files['tags'].apply(get_primary_tag)

# Remove documents without valid tags
df_files = df_files[df_files['primary_tag'] != "unknown"].reset_index(drop=True)

print(f"Documents with valid tags: {len(df_files)}")
print(f"\nTag distribution:")
print(df_files['primary_tag'].value_counts())

# Check which tags have enough samples for stratified split (need at least 2 per class)
tag_counts = df_files['primary_tag'].value_counts()
rare_tags = tag_counts[tag_counts < 2].index
valid_tags = tag_counts[tag_counts >= 2].index

if len(rare_tags) > 0:
    print(f"\n⚠️ Warning: {len(rare_tags)} tags have < 2 samples and will be added to train set only:")
    for tag in rare_tags:
        print(f"  - {tag}: {tag_counts[tag]} sample(s)")
    
    # Separate rare and valid tag documents
    rare_tags_df = df_files[df_files['primary_tag'].isin(rare_tags)].copy()
    df_files_for_split = df_files[df_files['primary_tag'].isin(valid_tags)].copy()
    
    print(f"\nProceeding with stratified split on {len(df_files_for_split)} documents...")
    
    # Perform stratified split only on valid tags
    train_df, test_df = train_test_split(
        df_files_for_split, 
        test_size=TRAIN_TEST_SPLIT, 
        random_state=RANDOM_STATE,
        stratify=df_files_for_split['primary_tag']
    )
    
    # Add rare tag documents to training set
    print(f"Adding {len(rare_tags_df)} rare-tag documents to training set...")
    train_df = pd.concat([train_df, rare_tags_df], ignore_index=True)
    
else:
    print(f"\nAll tags have sufficient samples for stratified split")
    
    # Perform standard stratified split
    train_df, test_df = train_test_split(
        df_files, 
        test_size=TRAIN_TEST_SPLIT, 
        random_state=RANDOM_STATE,
        stratify=df_files['primary_tag']
    )

print(f"\nFinal split:")
print(f"Train set size: {len(train_df)} ({len(train_df)/(len(train_df) + len(test_df))*100:.1f}%)")
print(f"Test set size: {len(test_df)} ({len(test_df)/(len(train_df) + len(test_df))*100:.1f}%)")

print(f"\nTrain tag distribution:")
print(train_df['primary_tag'].value_counts())

print(f"\nTest tag distribution:")
print(test_df['primary_tag'].value_counts())



## 6. Initialize BERTopic Model

In [None]:
# Initialize embedding model
print(f"Loading embedding model: {EMBEDDING_MODEL}...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# Initialize BERTopic with custom configuration
print("Initializing BERTopic model...")

# Vectorizer for better Russian/multilingual support
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=None,  # Can add custom stop words if needed
    min_df=2
)

# Initialize BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=MIN_TOPIC_SIZE,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

print("BERTopic model initialized successfully!")

In [None]:

train_df.columns

Index(['file_name', 'size', 'gcs_uri', 'metadata', 'tags', 'primary_tag'], dtype='object')

## 7. Generate Embeddings and Train BERTopic

In [18]:
# Generate embeddings for train set
print("Generating embeddings for training set...")
train_texts = train_df['text'].tolist()
train_embeddings = embedding_model.encode(
    train_texts, 
    show_progress_bar=True,
    batch_size=32
)

print(f"Train embeddings shape: {train_embeddings.shape}")

# Fit BERTopic on training data
print("\nFitting BERTopic model on training data...")
topics, probs = topic_model.fit_transform(train_texts, train_embeddings)

print(f"\nNumber of topics discovered: {len(set(topics)) - 1}")  # -1 for outlier topic
print(f"Number of outliers: {sum(1 for t in topics if t == -1)}")

# Add topic assignments to train DataFrame
train_df = train_df.copy()
train_df['bertopic_topic'] = topics
train_df['bertopic_prob'] = [p.max() if len(p) > 0 else 0.0 for p in probs]

# Display topic info
print("\nTop topics:")
topic_info = topic_model.get_topic_info()
display(topic_info.head(10))

Generating embeddings for training set...


KeyError: 'text'

## 8. Generate Embeddings for Test Set

In [None]:
# Generate embeddings for test set
print("Generating embeddings for test set...")
test_texts = test_df['text'].tolist()
test_embeddings = embedding_model.encode(
    test_texts, 
    show_progress_bar=True,
    batch_size=32
)

print(f"Test embeddings shape: {test_embeddings.shape}")

# Transform test data using fitted model
print("\nTransforming test data with fitted BERTopic model...")
test_topics, test_probs = topic_model.transform(test_texts, test_embeddings)

# Add topic assignments to test DataFrame
test_df = test_df.copy()
test_df['bertopic_topic'] = test_topics
test_df['bertopic_prob'] = [p.max() if len(p) > 0 else 0.0 for p in test_probs]

print(f"Test set topic distribution:")
print(test_df['bertopic_topic'].value_counts().head(10))

## 9. Setup PostgreSQL with pgvector

In [None]:
# Connect to PostgreSQL
print("Connecting to PostgreSQL database...")
conn = psycopg2.connect(**DB_CONFIG)
register_vector(conn)
cur = conn.cursor()

print("Connected successfully!")

# Create table for document vectors
table_name = "document_vectors_experiment"

print(f"\nCreating table '{table_name}'...")

cur.execute(f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id SERIAL PRIMARY KEY,
        file_name TEXT NOT NULL,
        gcs_uri TEXT NOT NULL,
        assigned_tag TEXT NOT NULL,
        bertopic_topic INTEGER,
        bertopic_prob FLOAT,
        embedding vector({train_embeddings.shape[1]}),
        text_excerpt TEXT,
        dataset_split TEXT,  -- 'train' or 'test'
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
""")

# Create index for vector similarity search
print("Creating vector index...")
cur.execute(f"""
    CREATE INDEX IF NOT EXISTS {table_name}_embedding_idx 
    ON {table_name} 
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100)
""")

conn.commit()
print("Database setup complete!")

## 10. Store Vectors in pgvector Database

In [None]:
def insert_vectors_batch(df, embeddings, dataset_split: str):
    """
    Insert document vectors into database.
    
    Args:
        df: DataFrame with document information
        embeddings: Document embeddings array
        dataset_split: 'train' or 'test'
    """
    values = []
    for idx, row in df.iterrows():
        embedding_idx = df.index.get_loc(idx)
        values.append((
            row['file_name'],
            row['gcs_uri'],
            row['primary_tag'],
            int(row['bertopic_topic']),
            float(row['bertopic_prob']),
            embeddings[embedding_idx].tolist(),
            row['text'][:1000],  # Store first 1000 chars as excerpt
            dataset_split
        ))
    
    execute_values(
        cur,
        f"""
        INSERT INTO {table_name} 
        (file_name, gcs_uri, assigned_tag, bertopic_topic, bertopic_prob, 
         embedding, text_excerpt, dataset_split)
        VALUES %s
        """,
        values
    )
    conn.commit()

# Insert training vectors
print(f"Inserting {len(train_df)} training vectors...")
insert_vectors_batch(train_df, train_embeddings, 'train')
print("Training vectors inserted!")

# Insert test vectors
print(f"\nInserting {len(test_df)} test vectors...")
insert_vectors_batch(test_df, test_embeddings, 'test')
print("Test vectors inserted!")

# Verify insertion
cur.execute(f"SELECT COUNT(*), dataset_split FROM {table_name} GROUP BY dataset_split")
results = cur.fetchall()
print("\nDatabase contents:")
for count, split in results:
    print(f"  {split}: {count} documents")

## 11. Evaluate Tag Prediction Quality

In [None]:
# Create a mapping from BERTopic topics to assigned tags
# For each BERTopic topic, find the most common assigned tag in training set
topic_to_tag_mapping = {}

for topic in train_df['bertopic_topic'].unique():
    if topic == -1:  # Skip outlier topic
        continue
    
    # Get all documents in this topic
    topic_docs = train_df[train_df['bertopic_topic'] == topic]
    
    # Find most common tag
    most_common_tag = topic_docs['primary_tag'].mode()[0] if len(topic_docs) > 0 else "unknown"
    topic_to_tag_mapping[topic] = most_common_tag

print(f"Created topic-to-tag mapping for {len(topic_to_tag_mapping)} topics")
print("\nSample mappings:")
for topic, tag in list(topic_to_tag_mapping.items())[:10]:
    print(f"  Topic {topic} -> Tag: {tag}")

In [None]:
# Predict tags for test set based on BERTopic topics
test_df['predicted_tag'] = test_df['bertopic_topic'].map(topic_to_tag_mapping)

# Handle outliers and unknown topics
test_df['predicted_tag'] = test_df['predicted_tag'].fillna('unknown')

# Calculate metrics
y_true = test_df['primary_tag'].tolist()
y_pred = test_df['predicted_tag'].tolist()

# Overall metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print("=" * 60)
print("EVALUATION METRICS")
print("=" * 60)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("=" * 60)

# Detailed classification report
print("\nDetailed Classification Report:")
print(classification_report(y_true, y_pred, zero_division=0))

# Calculate per-tag accuracy
print("\nPer-Tag Accuracy:")
for tag in sorted(test_df['primary_tag'].unique()):
    tag_mask = test_df['primary_tag'] == tag
    tag_correct = (test_df[tag_mask]['primary_tag'] == test_df[tag_mask]['predicted_tag']).sum()
    tag_total = tag_mask.sum()
    tag_accuracy = tag_correct / tag_total if tag_total > 0 else 0
    print(f"  {tag}: {tag_accuracy:.4f} ({tag_correct}/{tag_total})")

## 12. Visualization

In [None]:
# Confusion Matrix
plt.figure(figsize=(12, 10))
cm = confusion_matrix(y_true, y_pred)
labels = sorted(list(set(y_true + y_pred)))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix: Assigned Tags vs Predicted Tags')
plt.ylabel('True Tag')
plt.xlabel('Predicted Tag')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
# Topic probability distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Train set
axes[0].hist(train_df['bertopic_prob'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_title('BERTopic Probability Distribution - Train Set')
axes[0].set_xlabel('Topic Probability')
axes[0].set_ylabel('Frequency')
axes[0].axvline(train_df['bertopic_prob'].mean(), color='red', linestyle='--', 
                label=f'Mean: {train_df["bertopic_prob"].mean():.3f}')
axes[0].legend()

# Test set
axes[1].hist(test_df['bertopic_prob'], bins=50, edgecolor='black', alpha=0.7)
axes[1].set_title('BERTopic Probability Distribution - Test Set')
axes[1].set_xlabel('Topic Probability')
axes[1].set_ylabel('Frequency')
axes[1].axvline(test_df['bertopic_prob'].mean(), color='red', linestyle='--',
                label=f'Mean: {test_df["bertopic_prob"].mean():.3f}')
axes[1].legend()

plt.tight_layout()
plt.savefig('topic_probability_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("Topic probability distribution saved as 'topic_probability_distribution.png'")

## 13. Log Metrics to MLflow

In [None]:
# Initialize MLflow recorder
print("Initializing MLflow recorder...")
mlflow_recorder = MLflowRecorder(experiment_name=EXPERIMENT_NAME)

# Log parameters
params = {
    "embedding_model": EMBEDDING_MODEL,
    "min_topic_size": MIN_TOPIC_SIZE,
    "n_neighbors": N_NEIGHBORS,
    "n_components": N_COMPONENTS,
    "pages_extracted": PAGES_TO_EXTRACT,
    "train_test_split": f"{int((1-TRAIN_TEST_SPLIT)*100)}/{int(TRAIN_TEST_SPLIT*100)}",
    "total_documents": len(df_files),
    "train_size": len(train_df),
    "test_size": len(test_df),
    "num_topics": len(set(topics)) - 1,
    "embedding_dim": train_embeddings.shape[1]
}

mlflow_recorder.log_params(params)

# Log evaluation metrics
metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
    "avg_topic_prob_train": float(train_df['bertopic_prob'].mean()),
    "avg_topic_prob_test": float(test_df['bertopic_prob'].mean()),
    "num_outliers_train": int((train_df['bertopic_topic'] == -1).sum()),
    "num_outliers_test": int((test_df['bertopic_topic'] == -1).sum())
}

mlflow_recorder.log_metrics(metrics)

# Log artifacts (visualizations)
print("\nLogging artifacts...")
mlflow_recorder.log_artifact('confusion_matrix.png')
mlflow_recorder.log_artifact('topic_probability_distribution.png')

# Set tags
mlflow_recorder.set_tag("model_type", "bertopic")
mlflow_recorder.set_tag("task", "tag_prediction")
mlflow_recorder.set_tag("bucket", BUCKET_NAME)

# End run
mlflow_recorder.end_run()

print("\nMLflow logging complete!")
print(f"View results at: {os.getenv('MLFLOW_TRACKING_URI')}")

## 14. Analysis & Insights

In [None]:
# Analyze misclassifications
print("=" * 60)
print("MISCLASSIFICATION ANALYSIS")
print("=" * 60)

misclassified = test_df[test_df['primary_tag'] != test_df['predicted_tag']].copy()
print(f"\nTotal misclassifications: {len(misclassified)} out of {len(test_df)} "
      f"({len(misclassified)/len(test_df)*100:.2f}%)")

if len(misclassified) > 0:
    print("\nMost common misclassifications:")
    misclass_pairs = misclassified.groupby(['primary_tag', 'predicted_tag']).size().sort_values(ascending=False)
    print(misclass_pairs.head(10))
    
    # Analyze low-confidence predictions
    print("\nLow confidence predictions (prob < 0.5):")
    low_conf = test_df[test_df['bertopic_prob'] < 0.5]
    print(f"Count: {len(low_conf)} ({len(low_conf)/len(test_df)*100:.2f}%)")
    
    if len(low_conf) > 0:
        low_conf_accuracy = (low_conf['primary_tag'] == low_conf['predicted_tag']).mean()
        print(f"Accuracy for low-confidence predictions: {low_conf_accuracy:.4f}")
    
    # Analyze high-confidence predictions
    print("\nHigh confidence predictions (prob >= 0.5):")
    high_conf = test_df[test_df['bertopic_prob'] >= 0.5]
    print(f"Count: {len(high_conf)} ({len(high_conf)/len(test_df)*100:.2f}%)")
    
    if len(high_conf) > 0:
        high_conf_accuracy = (high_conf['primary_tag'] == high_conf['predicted_tag']).mean()
        print(f"Accuracy for high-confidence predictions: {high_conf_accuracy:.4f}")

print("\n" + "=" * 60)

## 15. Summary & Recommendations

In [None]:
print("=" * 60)
print("EXPERIMENT SUMMARY")
print("=" * 60)
print(f"\nExperiment Name: {EXPERIMENT_NAME}")
print(f"Embedding Model: {EMBEDDING_MODEL}")
print(f"Embedding Dimension: {train_embeddings.shape[1]}")
print(f"\nDataset:")
print(f"  Total documents: {len(df_files)}")
print(f"  Train: {len(train_df)} ({len(train_df)/len(df_files)*100:.1f}%)")
print(f"  Test: {len(test_df)} ({len(test_df)/len(df_files)*100:.1f}%)")
print(f"\nBERTopic:")
print(f"  Topics discovered: {len(set(topics)) - 1}")
print(f"  Avg topic probability (train): {train_df['bertopic_prob'].mean():.4f}")
print(f"  Avg topic probability (test): {test_df['bertopic_prob'].mean():.4f}")
print(f"\nPerformance:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1:.4f}")
print("\n" + "=" * 60)

print("\nRECOMMENDATIONS:")
print("1. Try different embedding models (multilingual vs monolingual)")
print("2. Experiment with different page extraction ranges (1-5 pages)")
print("3. Adjust BERTopic parameters (min_topic_size, n_gram_range)")
print("4. Consider fine-tuning the embedding model on your domain")
print("5. Implement ensemble methods combining multiple models")
print("6. Add text preprocessing (cleaning, normalization)")
print("\nAll metrics and artifacts logged to MLflow!")
print(f"View at: {os.getenv('MLFLOW_TRACKING_URI')}")

## 16. Cleanup

In [None]:
# Close database connection
cur.close()
conn.close()
print("Database connection closed.")