# SAP RPT-1 Investigation
 
**Goal:** Understand how to use RPT-1 as an encoder for material embeddings

**Key Questions:**
1. How to initialize RPT-1?
2. Can we extract embeddings without training?
3. What's the structure of tokenized data?
4. How to integrate with our multimodal pipeline?

## 1. Setup and Imports

In [19]:

import sys
from pathlib import Path
import pandas as pd
import numpy as np
import torch

# Make the repository root (two directory levels above the notebook's working
# directory) importable so that project-local packages resolve.
project_root = Path.cwd().parent.parent
sys.path[:0] = [str(project_root)]  # prepend, i.e. same effect as insert(0, ...)

print("Project root:", project_root)


Project root: /Users/antonio/Documents/Herramientas SAP/ML/RPT-1/materials-sap-embeddings


In [20]:

# Import RPT-1
from sap_rpt_oss import SAP_RPT_OSS_Classifier

print("‚úì Imports successful")



‚úì Imports successful


## 2. Initialize RPT-1

In [21]:

# Keep exploration cheap: a single bagged model (bagging=1) and a reduced
# context window (max_context_size=2048) both trade accuracy for speed.
classifier = SAP_RPT_OSS_Classifier(max_context_size=2048, bagging=1)

print("‚úì Classifier initialized", f"  Type: {type(classifier)}", sep="\n")


Start async server to compute embedding on port 5655
Port already in use, not starting again.
‚úì Classifier initialized
  Type: <class 'sap_rpt_oss.rpt.SAP_RPT_OSS_Classifier'>


## 3. Explore Structure



In [22]:
# %%
# Inspect the tokenizer, if the classifier exposes one
if hasattr(classifier, 'tokenizer'):
    tokenizer = classifier.tokenizer
    print(f"‚úì Found tokenizer: {type(tokenizer)}")

    # List public data attributes only: private names and callables are skipped
    print("\nTokenizer attributes:")
    for name in dir(tokenizer):
        if name.startswith('_'):
            continue
        if callable(getattr(tokenizer, name, None)):
            continue
        print(f"  - {name}")

# %%
# Probe the attribute names under which the underlying estimator(s) might be stored
attrs_to_check = ['estimators', 'estimators_', 'estimator', 'estimator_', 'model']

for name in attrs_to_check:
    if not hasattr(classifier, name):
        continue
    found = getattr(classifier, name)
    print(f"‚úì Found {name}: {type(found)}")



‚úì Found tokenizer: <class 'sap_rpt_oss.data.tokenizer.Tokenizer'>

Tokenizer attributes:
  - QUANTILE_DIMENSION
  - classification_type
  - embedding_dim
  - is_valid
  - num_regression_bins
  - random_seed
  - regression_type
  - sentence_embedding_model_name
  - socket
  - zmq_port
‚úì Found model: <class 'sap_rpt_oss.model.torch_model.RPT'>


## 4. Create Test Data

In [23]:
# Toy material-master records, written one row per material for readability.
# The rows are transposed into per-column lists before building the frame, so
# the constructor receives the same column -> list-of-values mapping as a
# column-oriented literal would provide.
df_test = pd.DataFrame(dict(zip(
    ['MATNR', 'MAKTX', 'MATKL', 'MTART', 'PRICE', 'NUM_PLANTS'],
    map(list, zip(
        ('MAT001', 'Steel Bolt M8x50 DIN 933', 'BOLTS', 'FERT', 0.50, 3),
        ('MAT002', 'Plastic Washer M8', 'WASHERS', 'FERT', 0.10, 2),
        ('MAT003', 'Stainless Steel Nut M6', 'NUTS', 'FERT', 0.15, 4),
    )),
)))

print("Test data:")
display(df_test)

Test data:


Unnamed: 0,MATNR,MAKTX,MATKL,MTART,PRICE,NUM_PLANTS
0,MAT001,Steel Bolt M8x50 DIN 933,BOLTS,FERT,0.5,3
1,MAT002,Plastic Washer M8,WASHERS,FERT,0.1,2
2,MAT003,Stainless Steel Nut M6,NUTS,FERT,0.15,4


## 5. Attempt: Use Tokenizer Directly

In [24]:
# Probe whether the tokenizer can be driven directly, before any fit() call
if hasattr(classifier, 'tokenizer'):
    tokenizer = classifier.tokenizer

    print("Attempting direct tokenization...")

    try:
        # Public callables exposed by the tokenizer (only the first ten are shown)
        methods = list(filter(
            lambda name: not name.startswith('_') and callable(getattr(tokenizer, name)),
            dir(tokenizer),
        ))
        print(f"Tokenizer methods: {methods[:10]}")

        # Prefer an explicit tokenize() method; otherwise call the object itself
        if hasattr(tokenizer, 'tokenize'):
            result = tokenizer.tokenize(df_test)
            print("‚úì Tokenization successful!", f"  Type: {type(result)}", sep="\n")
        elif hasattr(tokenizer, '__call__'):
            result = tokenizer(df_test)
            print("‚úì Tokenization successful (via __call__)!", f"  Type: {type(result)}", sep="\n")

    except Exception as exc:
        # The failure is the finding here, so report it instead of re-raising
        print(f"‚úó Error: {exc}")


Attempting direct tokenization...
Tokenizer methods: ['build_labels', 'convert_type_', 'process_features', 'process_target', 'quantize_column', 'replace_inf_values', 'socket_init', 'standard_scale_column', 'texts_to_tensor', 'time_to_seconds']
‚úó Error: Tokenizer.__call__() missing 4 required positional arguments: 'y_context', 'X_query', 'y_query', and 'classification_or_regression'


## 6. Attempt: Fit on Dummy Task

In [25]:

# Strategy: fit on a trivial supervised task so the model gets initialized;
# embeddings can then be pulled out of the fitted classifier.

# The dummy task predicts the material group (MATKL) from the other columns
df_train = df_test.copy()

# CRITICAL: inspect column-name uniqueness BEFORE splitting off the target
print("Before dropping MATKL:")
print(f"  Columns: {df_train.columns.tolist()}")
print(f"  Has duplicate columns: {df_train.columns.has_duplicates}")

y_train = df_train['MATKL']
X_train = df_train.drop(columns='MATKL')

# ... and once more on the feature frame
print("\nAfter dropping MATKL:")
print(f"  Columns: {X_train.columns.tolist()}")
print(f"  Has duplicate columns: {X_train.columns.has_duplicates}")

# Keep only the first occurrence of any repeated column name
if X_train.columns.has_duplicates:
    print("‚ö†Ô∏è Found duplicate columns, removing...")
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    print(f"  Cleaned columns: {X_train.columns.tolist()}")

# Give features and target the same fresh 0..n-1 index
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

print("\nTraining data:")
print(f"  X shape: {X_train.shape}")
print(f"  X index: {X_train.index.tolist()}")
print(f"  X columns: {X_train.columns.tolist()}")
print(f"  y unique values: {y_train.unique()}")

# %%
# Fit, then report what the classifier stored (if it exposes X_)
print("Fitting classifier...")
try:
    classifier.fit(X_train, y_train)
    print("‚úì Classifier fitted!")

    if hasattr(classifier, 'X_'):
        print(f"  X_ shape: {classifier.X_.shape}")
        print(f"  X_ columns: {classifier.X_.columns.tolist()}")
        print(f"  X_ has unique columns: {classifier.X_.columns.is_unique}")

except Exception as exc:
    # Exploration notebook: show the full traceback but keep the notebook running
    print(f"‚úó Fitting failed: {exc}")
    import traceback
    traceback.print_exc()

Before dropping MATKL:
  Columns: ['MATNR', 'MAKTX', 'MATKL', 'MTART', 'PRICE', 'NUM_PLANTS']
  Has duplicate columns: False

After dropping MATKL:
  Columns: ['MATNR', 'MAKTX', 'MTART', 'PRICE', 'NUM_PLANTS']
  Has duplicate columns: False

Training data:
  X shape: (3, 5)
  X index: [0, 1, 2]
  X columns: ['MATNR', 'MAKTX', 'MTART', 'PRICE', 'NUM_PLANTS']
  y unique values: ['BOLTS' 'WASHERS' 'NUTS']
Fitting classifier...
‚úì Classifier fitted!
  X_ shape: (3, 5)
  X_ columns: ['MATNR', 'MAKTX', 'MTART', 'PRICE', 'NUM_PLANTS']
  X_ has unique columns: True


## 7. Extract Embeddings After Fitting

In [26]:
# CRITICAL: the query frame must carry exactly the columns used for training.
# Drop the target column when present (errors='ignore' makes this a no-op otherwise);
# drop() always returns a new frame, so df_test itself is left untouched.
df_test_clean = df_test.drop(columns='MATKL', errors='ignore')

# Keep only the first occurrence of any repeated column name
if df_test_clean.columns.has_duplicates:
    print("‚ö†Ô∏è Removing duplicate columns from test data")
    df_test_clean = df_test_clean.loc[:, ~df_test_clean.columns.duplicated()]

# Fresh 0..n-1 index
df_test_clean = df_test_clean.reset_index(drop=True)

print("‚úì Test data prepared:")
print(f"  Shape: {df_test_clean.shape}")
print(f"  Columns: {df_test_clean.columns.tolist()}")
print(f"  Index: {df_test_clean.index.tolist()}")

# Everything below needs a fitted classifier (one that exposes X_)
if hasattr(classifier, 'X_'):
    # Compare column names and order against what the classifier stored at fit time
    columns_match = df_test_clean.columns.equals(classifier.X_.columns)
    print(f"  Columns match training data: {columns_match}")

    if not columns_match:
        print("  ‚ö†Ô∏è Column mismatch!")
        print(f"    Train columns: {classifier.X_.columns.tolist()}")
        print(f"    Test columns:  {df_test_clean.columns.tolist()}")

    print("\nAttempting to get tokenized data...")

    try:
        tokenized = classifier.get_tokenized_data(df_test_clean, bagging_index=0)

        print("‚úì Got tokenized data!")
        # ... rest of the cell

    except Exception as exc:
        # Exploration notebook: show the full traceback but keep the notebook running
        print(f"‚úó Error: {exc}")
        import traceback
        traceback.print_exc()

‚úì Test data prepared:
  Shape: (3, 5)
  Columns: ['MATNR', 'MAKTX', 'MTART', 'PRICE', 'NUM_PLANTS']
  Index: [0, 1, 2]
  Columns match training data: True

Attempting to get tokenized data...
‚úì Got tokenized data!


## 8. Extract Material-Level Embeddings




In [27]:
## ## 

# %%
# Locate the text embeddings inside the tokenized output.
# The appendix helper in this notebook reads them from
# tokenized['data']['text_embeddings'], so look there first; a top-level
# 'text_embeddings' key is still accepted as a fallback.
text_emb = None
if 'tokenized' in locals() and isinstance(tokenized, dict):
    try:
        text_emb = tokenized['data']['text_embeddings']
    except (KeyError, TypeError, IndexError):
        text_emb = tokenized.get('text_embeddings')

if text_emb is None:
    # Say so explicitly instead of silently skipping the whole cell
    print("No 'text_embeddings' found in tokenized data - run the tokenization cell first")
else:
    # Per the appendix helper, get_tokenized_data() returns the training (context)
    # rows followed by the query rows, so keep only the trailing rows that belong
    # to df_test_clean. max(..., 0) guards against a negative start index.
    n_materials = len(df_test_clean)
    text_emb = text_emb[max(text_emb.shape[0] - n_materials, 0):]

    print(f"Text embeddings shape: {text_emb.shape}")
    print(f"Expected: (n_materials, n_columns, embedding_dim)")

    # Strategy 1: Mean pooling over columns
    material_embeddings_mean = text_emb.mean(dim=1)
    print(f"\n‚úì Mean pooled embeddings: {material_embeddings_mean.shape}")
    print(f"  First material, first 10 dims:")
    print(f"  {material_embeddings_mean[0, :10].detach().numpy()}")

    # Strategy 2: Use last column (like [CLS] token)
    material_embeddings_cls = text_emb[:, -1, :]
    print(f"\n‚úì [CLS]-like embeddings: {material_embeddings_cls.shape}")
    print(f"  First material, first 10 dims:")
    print(f"  {material_embeddings_cls[0, :10].detach().numpy()}")

    # Compare both
    print(f"\nüìä Comparison:")
    print(f"  Mean pooling L2 norm: {torch.norm(material_embeddings_mean[0]).item():.4f}")
    print(f"  [CLS]-like L2 norm: {torch.norm(material_embeddings_cls[0]).item():.4f}")



## 9. Compute Similarity


In [28]:
if 'material_embeddings_mean' in locals():
    # Cosine similarity of the first material against the second and the third
    from torch.nn.functional import cosine_similarity

    # [None, :] adds the leading batch axis (same as unsqueeze(0)).
    # Both similarities are computed before anything is printed.
    sim_0_1, sim_0_2 = (
        cosine_similarity(
            material_embeddings_mean[0][None, :],
            material_embeddings_mean[other][None, :],
        )
        for other in (1, 2)
    )

    print(
        "üéØ Similarity results:",
        f"  {df_test.iloc[0]['MAKTX']}",
        "  vs",
        f"  {df_test.iloc[1]['MAKTX']}",
        f"  ‚Üí Similarity: {sim_0_1.item():.4f}",
        "",
        f"  {df_test.iloc[0]['MAKTX']}",
        "  vs",
        f"  {df_test.iloc[2]['MAKTX']}",
        f"  ‚Üí Similarity: {sim_0_2.item():.4f}",
        sep="\n",
    )


## 10. Conclusions

In [29]:

# Banner-delimited summary of findings and follow-ups, emitted with a single
# print call (one argument per output line; "" yields a blank line).
print(
    "=" * 60,
    "KEY FINDINGS",
    "=" * 60,
    "",
    "‚úì RPT-1 can be used as an encoder",
    "‚úì Requires fitting on a supervised task first",
    "‚úì Embeddings accessible via get_tokenized_data()",
    "‚úì text_embeddings shape: (n_materials, n_columns, embedding_dim)",
    "‚úì Can extract material-level embeddings via pooling",
    "",
    "NEXT STEPS:",
    "1. Create SAPRPT1Encoder class",
    "2. Integrate with MultimodalMaterialEmbeddings",
    "3. Compare: Current encoders vs RPT-1",
    "=" * 60,
    sep="\n",
)



KEY FINDINGS

‚úì RPT-1 can be used as an encoder
‚úì Requires fitting on a supervised task first
‚úì Embeddings accessible via get_tokenized_data()
‚úì text_embeddings shape: (n_materials, n_columns, embedding_dim)
‚úì Can extract material-level embeddings via pooling

NEXT STEPS:
1. Create SAPRPT1Encoder class
2. Integrate with MultimodalMaterialEmbeddings
3. Compare: Current encoders vs RPT-1


## Appendix: Helper Functions

In [30]:
# %% [markdown]
## ## Appendix: Helper Functions (FINAL)

# %%
def extract_rpt1_embeddings(classifier, df, pooling='mean', target_col='MATKL'):
    """
    Extract one pooled embedding per row of ``df`` from a fitted RPT-1 classifier.

    The frame is cleaned the same way as the training data in this notebook
    (target column dropped, duplicate column names removed, index reset), passed
    to ``classifier.get_tokenized_data`` and the resulting per-column text
    embeddings are pooled over the column axis.

    Args:
        classifier: Fitted SAP_RPT_OSS_Classifier (anything exposing
            ``get_tokenized_data(df, bagging_index=...)`` with the embeddings
            under ``['data']['text_embeddings']``).
        df: DataFrame with materials (can include the target column). It is
            not modified.
        pooling: 'mean' averages over the column axis; 'cls' takes the last
            column's embedding.
        target_col: Name of the target column to drop when present. Pass
            ``None`` to keep every column. Defaults to 'MATKL'.

    Returns:
        torch.Tensor of shape (n_materials, embedding_dim), where
        n_materials == len(df); an empty frame yields zero rows.

    Raises:
        ValueError: If ``pooling`` is neither 'mean' nor 'cls'. Raised before
            any tokenization work is done.
    """
    # Fail fast on a bad option instead of after the expensive tokenization call
    if pooling not in ('mean', 'cls'):
        raise ValueError(f"Unknown pooling: {pooling}")

    # Every step below returns a new frame, so the caller's df is never mutated
    df_clean = df

    # Remove target column if it exists
    if target_col is not None and target_col in df_clean.columns:
        df_clean = df_clean.drop(columns=[target_col])

    # Remove duplicate columns (keep the first occurrence of each name)
    if not df_clean.columns.is_unique:
        df_clean = df_clean.loc[:, ~df_clean.columns.duplicated()]

    # Reset index
    df_clean = df_clean.reset_index(drop=True)

    # Get tokenized data
    tokenized = classifier.get_tokenized_data(df_clean, bagging_index=0)

    # Extract text embeddings (inside 'data' dict)
    text_emb = tokenized['data']['text_embeddings']

    # Apply pooling over columns (dim=1)
    if pooling == 'mean':
        pooled = text_emb.mean(dim=1)  # (n_train + n_test, embedding_dim)
    else:  # 'cls'
        pooled = text_emb[:, -1, :]

    # CRITICAL: Extract only test samples (last n_materials rows).
    # RPT-1 concatenates [train, test], we only want test.
    # An explicit start index is used because pooled[-0:] would return ALL rows
    # (context included) for an empty input frame.
    n_materials = len(df_clean)
    start = max(pooled.shape[0] - n_materials, 0)
    embeddings_test = pooled[start:]

    return embeddings_test

# Smoke test: run the helper on the toy materials, but only when a fitted
# classifier (one exposing X_) exists in this session.
print("Testing FINAL helper function...")
if 'classifier' not in locals() or not hasattr(classifier, 'X_'):
    print("‚ö†Ô∏è Classifier not fitted")
else:
    embeddings = extract_rpt1_embeddings(classifier, df_test, pooling='mean')

    print("‚úÖ Helper function works!")
    print(f"  Input materials: {len(df_test)}")
    print(f"  Output shape: {embeddings.shape}")
    print(f"  Embedding dimension: {embeddings.shape[1]}")
    print("\n  First material embedding (first 10 dims):")
    print(f"    {embeddings[0, :10].detach().numpy()}")

    # One embedding row per input material, no context rows leaking through
    assert embeddings.shape[0] == len(df_test), "Sample count mismatch!"
    print(f"\n‚úì Sample count verified: {embeddings.shape[0]} materials")

Testing FINAL helper function...
‚úÖ Helper function works!
  Input materials: 3
  Output shape: torch.Size([3, 384])
  Embedding dimension: 384

  First material embedding (first 10 dims):
    [-0.2426   -0.0096   -0.1543   -0.00938  -0.001734 -0.03326   0.1707
  0.0841    0.012146 -0.1013  ]

‚úì Sample count verified: 3 materials
