# LaBSE Classification for Document Tagging

This notebook uses LaBSE (Language-agnostic BERT Sentence Embedding) for document classification.

## Approach:
- Use LaBSE to generate multilingual embeddings
- Train classifiers on top of frozen LaBSE embeddings
- Compare several lightweight classifiers (XGBoost, Random Forest, Logistic Regression, Linear SVM, MLP) trained on the same embeddings
- Local execution

## Setup:
- Model: `sentence-transformers/LaBSE`
- GCS Bucket: `gosexpert_categorize`
- Split: 70% train / 30% test, stratified by primary tag (tags with only one document are kept in the training set)
- MLflow: Experiment tracking

## 1. Import Dependencies

In [None]:
# Standard library
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Project root
project_root = Path().absolute().parent.parent
sys.path.insert(0, str(project_root))

# Data
import pandas as pd
import numpy as np

# Embeddings
from sentence_transformers import SentenceTransformer

# ML Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier

# GCS and MLflow
from gcs_bucket_interface import GCSBucketInterface
from mlflow_recorder import MLflowRecorder

# Evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Environment
from dotenv import load_dotenv
load_dotenv()

print("All imports successful!")

## 2. Configuration

In [None]:
# --- Experiment settings ---
EXPERIMENT_NAME = "labse_classification_v1"  # experiment name handed to MLflowRecorder
BUCKET_NAME = "gosexpert_categorize"         # GCS bucket that is listed for tagged PDFs
TRAIN_TEST_SPLIT = 0.3                       # fraction of documents held out as the test set
RANDOM_STATE = 42                            # seed reused by the split and the classifiers
PAGES_TO_EXTRACT = 3                         # default number of leading pages read per document

# --- Embedding model ---
EMBEDDING_MODEL = "sentence-transformers/LaBSE"

# Percentages shown in the banner below (train share first, then test share)
_train_pct = int((1-TRAIN_TEST_SPLIT)*100)
_test_pct = int(TRAIN_TEST_SPLIT*100)

print("Configuration loaded")
print(f"Model: {EMBEDDING_MODEL}")
print(f"Train/Test Split: {_train_pct}/{_test_pct}")

## 3. Load Documents

In [None]:
# Connect to the bucket and list everything it contains
gcs_interface = GCSBucketInterface(bucket_name=BUCKET_NAME)

print("Loading files from GCS...")
all_files = gcs_interface.list()

# Keep PDFs whose metadata carries at least one tag-like or category-like key.
# NOTE(review): this filter accepts keys that start with 'tag' (case-sensitive)
# or contain 'category', while the 'tags' column below collects values of keys
# that contain 'tag' (case-insensitive) - confirm the two are meant to differ.
pdf_files_with_tags = [
    entry for entry in all_files
    if entry['name'].lower().endswith('.pdf')
    and entry.get('metadata')
    and any(
        name.startswith('tag') or 'category' in name.lower()
        for name in entry['metadata']
    )
]

print(f"Found {len(pdf_files_with_tags)} PDF files")

# One row per PDF: object name, GCS URI and the list of tag values
records = []
for entry in pdf_files_with_tags:
    tag_values = [
        value for name, value in entry['metadata'].items()
        if 'tag' in name.lower()
    ]
    records.append({
        'file_name': entry['name'],
        'gcs_uri': entry['gcs_uri'],
        'tags': tag_values,
    })
df_files = pd.DataFrame(records)

print(f"DataFrame: {df_files.shape}")

## 4. Load Pre-Extracted Text

In [None]:
TEXT_EXTRACTION_DIR = project_root / "text_extraction_results"


def load_text_from_json(json_filename: str, max_pages: int = PAGES_TO_EXTRACT) -> str:
    """Return the text of the first pages stored in a pre-extracted JSON file.

    The file is looked up under ``TEXT_EXTRACTION_DIR`` and is read as a JSON
    object whose keys ``page1``, ``page2``, ... hold the text of each page.

    Args:
        json_filename: File name (relative to ``TEXT_EXTRACTION_DIR``).
        max_pages: Number of leading pages to read (``page1`` .. ``page<max_pages>``).

    Returns:
        The available pages joined with newlines and stripped. An empty string
        is returned when the file is missing, unreadable, not valid JSON, or
        not a JSON object, so callers can simply skip the document.
    """
    import json  # local import: the json module is not imported at the top of the notebook

    json_path = TEXT_EXTRACTION_DIR / json_filename
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            page_data = json.load(f)
    except FileNotFoundError:
        # A missing extraction result is an expected case: stay silent.
        return ""
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes. Report the
        # problem instead of hiding it, but keep the best-effort contract.
        print(f"Warning: could not read {json_path}: {exc}")
        return ""

    if not isinstance(page_data, dict):
        return ""

    pages = (page_data.get(f"page{i}") for i in range(1, max_pages + 1))
    # Skip absent pages and non-text values so one odd page does not discard the document.
    return "\n".join(page for page in pages if isinstance(page, str)).strip()

print("Loading text...")


def _candidate_json_names(pdf_name):
    """Return the JSON file names to try for *pdf_name*, in lookup order."""
    # Original derivation first, so every document that resolved before
    # still resolves to exactly the same file.
    candidates = [pdf_name.replace('.pdf', '.json')]
    # Fallback: swap only the extension, ignoring case. The listing step
    # accepts names via `.lower().endswith('.pdf')`, so 'X.PDF' can reach
    # this point and is left unchanged by the case-sensitive replace above.
    if pdf_name.lower().endswith('.pdf'):
        by_suffix = pdf_name[:-len('.pdf')] + '.json'
        if by_suffix not in candidates:
            candidates.append(by_suffix)
    return candidates


texts, valid_indices = [], []
total_docs = len(df_files)
# `position` counts processed rows, so progress output does not depend on
# the DataFrame's index labels.
for position, (idx, row) in enumerate(df_files.iterrows(), start=1):
    text = ""
    for json_name in _candidate_json_names(row['file_name']):
        text = load_text_from_json(json_name)
        if text:
            break
    if text:
        texts.append(text)
        valid_indices.append(idx)
    if position % 50 == 0:
        print(f"Processed {position}/{total_docs}...")

# Keep only the documents for which text was found, aligned with `texts`
df_files = df_files.loc[valid_indices].reset_index(drop=True)
df_files['text'] = texts
print(f"Loaded {len(df_files)} documents")

## 5. Prepare Labels and Split

In [None]:
# Extract tags
def get_primary_tag(tags_list):
    """Return the first element of *tags_list*, or "unknown".

    "unknown" is returned when the argument is not a list or is an empty list.
    """
    if not isinstance(tags_list, list):
        return "unknown"
    if not tags_list:
        return "unknown"
    return tags_list[0]

# Label every document with its first tag and drop the ones that have none
df_files['primary_tag'] = df_files['tags'].apply(get_primary_tag)
has_label = df_files['primary_tag'] != "unknown"
df_files = df_files[has_label].reset_index(drop=True)

print(f"Documents: {len(df_files)}")
print(f"Unique tags: {df_files['primary_tag'].nunique()}")
print("\nTop tags:")
print(df_files['primary_tag'].value_counts().head(10))

# Train/test split.
# Tags that occur only once are kept out of the stratified split and are
# appended to the training set afterwards.
tag_counts = df_files['primary_tag'].value_counts()
rare_tags = tag_counts[tag_counts < 2].index
valid_tags = tag_counts[tag_counts >= 2].index
has_rare_tags = len(rare_tags) > 0

if has_rare_tags:
    print(f"\n⚠️ {len(rare_tags)} rare tags")
    rare_df = df_files[df_files['primary_tag'].isin(rare_tags)]
    valid_df = df_files[df_files['primary_tag'].isin(valid_tags)]
    split_source = valid_df
else:
    split_source = df_files

train_df, test_df = train_test_split(
    split_source,
    test_size=TRAIN_TEST_SPLIT,
    random_state=RANDOM_STATE,
    stratify=split_source['primary_tag'],
)

if has_rare_tags:
    train_df = pd.concat([train_df, rare_df], ignore_index=True)

print(f"\nSplit: Train={len(train_df)} | Test={len(test_df)}")

## 6. Generate LaBSE Embeddings

In [None]:
print(f"Loading LaBSE model: {EMBEDDING_MODEL}...")
print("Note: This may take a few minutes on first run...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
print("Model loaded!")

# Both splits are encoded with exactly the same settings
encode_options = {
    'show_progress_bar': True,
    'batch_size': 16,              # Reduce if OOM
    'normalize_embeddings': True,  # L2 normalization for better similarity
}

# Train split
print("\nGenerating train embeddings...")
train_texts = train_df['text'].tolist()
train_embeddings = embedding_model.encode(train_texts, **encode_options)
print(f"Train embeddings: {train_embeddings.shape}")

# Test split
print("\nGenerating test embeddings...")
test_texts = test_df['text'].tolist()
test_embeddings = embedding_model.encode(test_texts, **encode_options)
print(f"Test embeddings: {test_embeddings.shape}")

# Targets: the primary tag of each document, in the same row order as the texts
y_train = train_df['primary_tag'].values
y_test = test_df['primary_tag'].values

print(f"\nClasses: {len(np.unique(y_train))}")

## 7. Train Multiple Classifiers

### 7.1 XGBoost

In [None]:
print("="*60)
print("XGBOOST")
print("="*60)

# The string tags are mapped to integer class ids before being given to
# XGBoost; predictions are mapped back to tags below.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
# Not used for scoring (metrics below compare decoded tags), but kept so the
# encoded test labels remain available to later cells.
y_test_enc = label_encoder.transform(y_test)

# `use_label_encoder` is intentionally not passed: it is a deprecated
# XGBClassifier option that newer XGBoost releases report as an unused
# parameter, and labels are already encoded above.
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    eval_metric='mlogloss',
)

print("Training...")
xgb_model.fit(train_embeddings, y_train_enc)
xgb_pred_enc = xgb_model.predict(test_embeddings)
# Back to the original tag strings so all models are scored on the same labels
xgb_pred = label_encoder.inverse_transform(xgb_pred_enc)

xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred, average='weighted', zero_division=0)

print(f"Accuracy: {xgb_acc:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

### 7.2 Random Forest

In [None]:
print("=" * 60, "RANDOM FOREST", "=" * 60, sep="\n")

# Random forest trained directly on the string tags
rf_settings = {
    'n_estimators': 300,
    'max_depth': 25,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_settings)

print("Training...")
rf_pred = rf_model.fit(train_embeddings, y_train).predict(test_embeddings)

# Accuracy plus support-weighted F1 on the held-out split
rf_acc = accuracy_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred, average='weighted', zero_division=0)

print(f"Accuracy: {rf_acc:.4f}")
print(f"F1 Score: {rf_f1:.4f}")

### 7.3 Logistic Regression

In [None]:
print("=" * 60, "LOGISTIC REGRESSION", "=" * 60, sep="\n")

# Logistic regression trained directly on the string tags
lr_settings = {
    'max_iter': 1000,
    'C': 1.0,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
lr_model = LogisticRegression(**lr_settings)

print("Training...")
lr_pred = lr_model.fit(train_embeddings, y_train).predict(test_embeddings)

# Accuracy plus support-weighted F1 on the held-out split
lr_acc = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred, average='weighted', zero_division=0)

print(f"Accuracy: {lr_acc:.4f}")
print(f"F1 Score: {lr_f1:.4f}")

### 7.4 Linear SVM

In [None]:
print("=" * 60, "LINEAR SVM", "=" * 60, sep="\n")

# Linear support vector classifier trained directly on the string tags
svm_settings = {
    'C': 1.0,
    'max_iter': 2000,
    'random_state': RANDOM_STATE,
}
svm_model = LinearSVC(**svm_settings)

print("Training...")
svm_pred = svm_model.fit(train_embeddings, y_train).predict(test_embeddings)

# Accuracy plus support-weighted F1 on the held-out split
svm_acc = accuracy_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred, average='weighted', zero_division=0)

print(f"Accuracy: {svm_acc:.4f}")
print(f"F1 Score: {svm_f1:.4f}")

### 7.5 MLP Neural Network

In [None]:
print("=" * 60, "MLP NEURAL NETWORK", "=" * 60, sep="\n")

# Three hidden layers (512 -> 256 -> 128) with early stopping enabled
mlp_settings = {
    'hidden_layer_sizes': (512, 256, 128),
    'activation': 'relu',
    'max_iter': 500,
    'learning_rate': 'adaptive',
    'early_stopping': True,
    'random_state': RANDOM_STATE,
}
mlp_model = MLPClassifier(**mlp_settings)

print("Training...")
mlp_pred = mlp_model.fit(train_embeddings, y_train).predict(test_embeddings)

# Accuracy plus support-weighted F1 on the held-out split
mlp_acc = accuracy_score(y_test, mlp_pred)
mlp_f1 = f1_score(y_test, mlp_pred, average='weighted', zero_division=0)

print(f"Accuracy: {mlp_acc:.4f}")
print(f"F1 Score: {mlp_f1:.4f}")

## 8. Model Comparison

In [None]:
# Collect the scores of the five classifiers in one table
model_names = ['XGBoost', 'Random Forest', 'Logistic Regression', 'Linear SVM', 'MLP']
model_preds = [xgb_pred, rf_pred, lr_pred, svm_pred, mlp_pred]

results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [xgb_acc, rf_acc, lr_acc, svm_acc, mlp_acc],
    'F1 Score': [xgb_f1, rf_f1, lr_f1, svm_f1, mlp_f1],
})

# Weighted precision and recall, computed from the stored predictions
# (same model order as `model_names`)
precision_scores = [
    precision_score(y_test, model_pred, average='weighted', zero_division=0)
    for model_pred in model_preds
]
recall_scores = [
    recall_score(y_test, model_pred, average='weighted', zero_division=0)
    for model_pred in model_preds
]

results_df['Precision'] = precision_scores
results_df['Recall'] = recall_scores

separator = "=" * 60
print(separator)
print("MODEL COMPARISON - LaBSE EMBEDDINGS")
print(separator)
print(results_df.to_string(index=False))
print("\n")

# The winner is the row with the highest weighted F1
best_idx = results_df['F1 Score'].argmax()
best_row = results_df.iloc[best_idx]
best_model = best_row['Model']
print(f"Best Model: {best_model} (F1: {best_row['F1 Score']:.4f})")

## 9. Detailed Report for Best Model

In [None]:
# Look up the test-set predictions of the winning model by its display name
predictions_map = dict(zip(
    ['XGBoost', 'Random Forest', 'Logistic Regression', 'Linear SVM', 'MLP'],
    [xgb_pred, rf_pred, lr_pred, svm_pred, mlp_pred],
))

best_pred = predictions_map[best_model]

report_text = classification_report(y_test, best_pred, zero_division=0)

print(f"Classification Report - {best_model}")
print("=" * 60)
print(report_text)

## 10. Confusion Matrix

In [None]:
# Tick labels: every tag present in the ground truth or in the predictions,
# sorted. confusion_matrix is called without `labels`, so this relies on
# scikit-learn ordering rows/columns the same way.
labels = sorted(set(y_test.tolist()) | set(best_pred.tolist()))
cm = confusion_matrix(y_test, best_pred)

plt.figure(figsize=(14, 12))
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'Blues',
    'xticklabels': labels,
    'yticklabels': labels,
}
sns.heatmap(cm, **heatmap_options)

plt.title(f'Confusion Matrix: LaBSE + {best_model}')
plt.xlabel('Predicted Tag')
plt.ylabel('True Tag')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

# Written to the working directory; the MLflow cell logs this file as an artifact
plt.savefig('labse_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved!")

## 11. Model Comparison Visualization

In [None]:
# Grouped bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(14, 6))

x = np.arange(len(results_df))
width = 0.2

# (offset in bar widths from the group centre, results_df column / legend label)
bar_layout = [
    (-1.5, 'Accuracy'),
    (-0.5, 'Precision'),
    (0.5, 'Recall'),
    (1.5, 'F1 Score'),
]
for offset, column in bar_layout:
    ax.bar(x + offset*width, results_df[column], width, label=column, alpha=0.8)

ax.set(
    xlabel='Model',
    ylabel='Score',
    title='LaBSE Embeddings: Model Performance Comparison',
)
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'], rotation=15, ha='right')
ax.legend()
ax.set_ylim(0, 1)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
# Written to the working directory; the MLflow cell logs this file as an artifact
plt.savefig('labse_model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Model comparison saved!")

## 12. Per-Class Performance

In [None]:
# Per-class precision / recall / F1 of the best model
from sklearn.metrics import classification_report

report_dict = classification_report(y_test, best_pred, output_dict=True, zero_division=0)

# Entries of the report that are aggregates rather than individual classes
aggregate_rows = ('accuracy', 'macro avg', 'weighted avg')

class_metrics = [
    {
        'Class': tag,
        'Precision': scores['precision'],
        'Recall': scores['recall'],
        'F1-Score': scores['f1-score'],
        'Support': scores['support'],
    }
    for tag, scores in report_dict.items()
    if tag not in aggregate_rows
]

# Best classes first, so head() shows the strongest and tail() the weakest
class_df = pd.DataFrame(class_metrics).sort_values('F1-Score', ascending=False)

print("\nPer-Class Performance (Top 10):")
print(class_df.head(10).to_string(index=False))

print("\nWorst Performing Classes (Bottom 5):")
print(class_df.tail(5).to_string(index=False))

## 13. Save Best Model

In [None]:
import json
import pickle
import joblib

# Fitted estimator for each display name used in results_df['Model']
models_map = dict(zip(
    ['XGBoost', 'Random Forest', 'Logistic Regression', 'Linear SVM', 'MLP'],
    [xgb_model, rf_model, lr_model, svm_model, mlp_model],
))

best_model_obj = models_map[best_model]

# Output directory (created on demand)
model_dir = './labse_models'
os.makedirs(model_dir, exist_ok=True)

print(f"Saving {best_model}...")
joblib.dump(best_model_obj, f'{model_dir}/best_classifier.pkl')
# The label encoder exists only if the XGBoost cell has been run; it is the
# object that maps XGBoost's integer predictions back to tag strings.
if 'label_encoder' in dir():
    joblib.dump(label_encoder, f'{model_dir}/label_encoder.pkl')

# Small JSON summary stored next to the pickled model
best_row = results_df.iloc[best_idx]
metadata = {
    'best_model': best_model,
    'embedding_model': EMBEDDING_MODEL,
    'embedding_dim': train_embeddings.shape[1],
    'num_classes': len(np.unique(y_train)),
    'accuracy': best_row['Accuracy'],
    'f1_score': best_row['F1 Score'],
}

with open(f'{model_dir}/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Model saved to {model_dir}/")

## 14. Log to MLflow

In [None]:
print("Logging to MLflow...")
mlflow_recorder = MLflowRecorder(experiment_name=EXPERIMENT_NAME)

try:
    # Parameters: dataset sizes and the chosen embedding / classifier
    params = {
        "embedding_model": EMBEDDING_MODEL,
        "embedding_dim": train_embeddings.shape[1],
        "train_size": len(train_df),
        "test_size": len(test_df),
        "num_classes": len(np.unique(y_train)),
        "best_model": best_model
    }
    mlflow_recorder.log_params(params)

    # Metrics for all models, plus the best accuracy / F1 across them
    metrics = {
        "xgb_accuracy": xgb_acc,
        "xgb_f1": xgb_f1,
        "rf_accuracy": rf_acc,
        "rf_f1": rf_f1,
        "lr_accuracy": lr_acc,
        "lr_f1": lr_f1,
        "svm_accuracy": svm_acc,
        "svm_f1": svm_f1,
        "mlp_accuracy": mlp_acc,
        "mlp_f1": mlp_f1,
        "best_accuracy": results_df['Accuracy'].max(),
        "best_f1": results_df['F1 Score'].max()
    }
    mlflow_recorder.log_metrics(metrics)

    # Artifacts: the two figures saved by the plotting cells
    mlflow_recorder.log_artifact('labse_confusion_matrix.png')
    mlflow_recorder.log_artifact('labse_model_comparison.png')

    # Tags
    mlflow_recorder.set_tag("model_type", "labse_classification")
    mlflow_recorder.set_tag("best_classifier", best_model)
finally:
    # Always close the run, even when one of the logging calls above raises;
    # the error itself still propagates.
    # NOTE(review): assumes end_run() is safe to call after a failed logging
    # call - confirm against MLflowRecorder.
    mlflow_recorder.end_run()

print("MLflow logging complete!")

## 15. Summary

In [None]:
# Final recap of the run: embedding model, dataset sizes and the best classifier
best_row = results_df.iloc[best_idx]
separator = "=" * 60

summary_lines = [
    separator,
    "EXPERIMENT SUMMARY: LaBSE CLASSIFICATION",
    separator,
    f"Embedding Model: {EMBEDDING_MODEL}",
    f"Embedding Dimension: {train_embeddings.shape[1]}",
    "\nDataset:",
    f"  Train: {len(train_df)} | Test: {len(test_df)}",
    f"  Classes: {len(np.unique(y_train))}",
    f"\nBest Classifier: {best_model}",
    f"  Accuracy:  {best_row['Accuracy']:.4f}",
    f"  Precision: {best_row['Precision']:.4f}",
    f"  Recall:    {best_row['Recall']:.4f}",
    f"  F1 Score:  {best_row['F1 Score']:.4f}",
    f"\nModel saved to: {model_dir}/",
    separator,
]
print("\n".join(summary_lines))