# Install + Hugging Face login



In [9]:
# Install the official SAP RPT-1 OSS library + dependencies
!pip install -q git+https://github.com/SAP-samples/sap-rpt-1-oss.git
!pip install -q transformers torch pandas scikit-learn huggingface_hub python-dotenv

from huggingface_hub import login
import os
from dotenv import load_dotenv
from pathlib import Path

print("üìç Using ABSOLUTE path for .env")

# üî• ESPECIFICAR RUTA ABSOLUTA
env_path = "/Users/antonio/Documents/Herramientas SAP/ML/RPT-1/rpt1-sap-playground/.env"
load_dotenv(env_path)

HF_TOKEN = os.getenv('HF_TOKEN')

if HF_TOKEN:
    print(f"‚úÖ Token loaded: {HF_TOKEN[:10]}...")
    try:
        login(token=HF_TOKEN)
        print("üéâ Hugging Face login successful!")
        print("üöÄ Ready to download RPT-1 OSS model!")
    except Exception as e:
        print(f"‚ùå Login failed: {e}")
else:
    print("‚ùå HF_TOKEN still not found")
    print(f"üìÅ Checking .env at: {env_path}")
    print(f"   File exists: {Path(env_path).exists()}")

  Preparing metadata (setup.py) ... [?25l[?25hdone
üìç Using ABSOLUTE path for .env
‚ùå HF_TOKEN still not found
üìÅ Checking .env at: /Users/antonio/Documents/Herramientas SAP/ML/RPT-1/rpt1-sap-playground/.env
   File exists: False


In [None]:
# Install the official SAP RPT-1 OSS library + dependencies
!pip install -q git+https://github.com/SAP-samples/sap-rpt-1-oss.git
!pip install -q transformers torch pandas scikit-learn huggingface_hub

from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()

def detect_real_environment():
    """Detect if we're truly in Colab UI or just using Colab kernel"""
    try:
        # Esta es la forma m√°s confiable de detectar Colab real
        import google.colab
        try:
            # Intentar acceso que solo funciona en Colab UI real
            from google.colab import _message
            _message.blocking_request('get_env', request='', timeout_sec=1)
            return "colab_ui"
        except:
            return "colab_kernel_external"  # Kernel de Colab desde VS Code
    except ImportError:
        return "vscode_local"  # VS Code puro

environment = detect_real_environment()
print(f"üîç Environment: {environment}")

# Estrategia basada en entorno real
if environment == "colab_ui":
    # Solo aqu√≠ intentar usar Colab secrets
    try:
        from google.colab import userdata
        HF_TOKEN = userdata.get('HF_TOKEN')
        print("‚úÖ Token from Colab secrets")
    except:
        HF_TOKEN = os.getenv('HF_TOKEN')
else:
    # En cualquier otro caso, usar solo variables de entorno
    HF_TOKEN = os.getenv('HF_TOKEN')

# Login
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("üéâ Login successful!")
else:
    print("‚ùå No HF_TOKEN found")
    print("üí° Use .env file: HF_TOKEN=your_token")

  Preparing metadata (setup.py) ... [?25l[?25hdone
üîç Environment: colab_ui
‚ùå No HF_TOKEN found
üí° Use .env file: HF_TOKEN=your_token


In [7]:
# Install the official SAP RPT-1 OSS library + dependencies
!pip install -q git+https://github.com/SAP-samples/sap-rpt-1-oss.git
!pip install -q transformers torch pandas scikit-learn huggingface_hub

from huggingface_hub import login
import os

HF_TOKEN = None

# Try different sources for the token
try:
    # 1.Try from Colab
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("‚úì Token obtained from Colab secrets")
except:
    try:
        # 2. Try from environment variable
        HF_TOKEN = os.getenv('HF_TOKEN')
        if HF_TOKEN:
            print("‚úì Token obtained from environment variable")
    except:
        pass

if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Login successful! Ready to download RPT-1 OSS")
else:
    print("Error: Unable to obtain HF_TOKEN")
    print("\nSolution:")
    print("- In Colab: Go to Secrets ‚Üí Add 'HF_TOKEN' with your token")
    print("- In VS Code: Create an .env file with: HF_TOKEN=your_token")

  Preparing metadata (setup.py) ... [?25l[?25hdone
Error: Unable to obtain HF_TOKEN

Solution:
- In Colab: Go to Secrets ‚Üí Add 'HF_TOKEN' with your token
- In VS Code: Create an .env file with: HF_TOKEN=your_token


## Generation of Sysnthetic Data simulating serveral S/4HANA CDS Views.
1. MATERIAL MASTER DATA (I_ProductPlantBasic) 
2. CURRENT STOCK (I_MaterialStock)
3. CONSUMPTION PATTERNS (I_MaterialDocumentItem aggregated)
4. DEMAND TREND (Last 30 days vs previous 30)
5. PLANNING PARAMETERS (I_ProductPlantBasic)
6. SUPPLIER DATA (I_Supplier + I_PurchasingInfoRecord)
7. OPEN PURCHASE ORDERS (I_PurchaseOrderItemAPI01)
8. ADDITIONAL RISK SIGNALS
9. TARGET VARIABLE: STOCKOUT_RISK_14D

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

np.random.seed(42)

# Configuraci√≥n realista
n_materials = 600
n_plants = 3
n_suppliers = 8

print("Generating realistic SAP S/4HANA stockout risk dataset...")

# ============================================
# 1. MATERIAL MASTER DATA (I_ProductPlantBasic)
# ============================================

plants = ['1010', '1020', '1030']
material_types = ['ROH', 'HALB', 'FERT']  # Raw, Semi-finished, Finished
abc_classes = ['A', 'B', 'C']
mrp_types = ['PD', 'VB', 'ND']  # MRP, Reorder point, No planning

data = {
    'MATNR': [f'MAT{i:06d}' for i in range(1, n_materials+1)],
    'WERKS': np.random.choice(plants, n_materials),
    'MTART': np.random.choice(material_types, n_materials, p=[0.4, 0.3, 0.3]),
    'MATKL': np.random.choice(['MECH', 'ELEC', 'CHEM', 'PACK'], n_materials),
    'ABC_IND': np.random.choice(abc_classes, n_materials, p=[0.15, 0.35, 0.50]),  # Pareto
    'MRP_TYPE': np.random.choice(mrp_types, n_materials, p=[0.6, 0.3, 0.1]),
}

df = pd.DataFrame(data)

# ============================================
# 2. CURRENT STOCK (I_MaterialStock)
# ============================================

# Stock realista seg√∫n tipo y ABC
def generate_stock(row):
    if row['ABC_IND'] == 'A':
        base_stock = np.random.randint(500, 3000)
    elif row['ABC_IND'] == 'B':
        base_stock = np.random.randint(200, 1500)
    else:  # C
        base_stock = np.random.randint(50, 800)

    # 10% de materiales con stock muy bajo (situaci√≥n cr√≠tica)
    if np.random.random() < 0.10:
        base_stock = int(base_stock * 0.15)

    return base_stock

df['LABST'] = df.apply(generate_stock, axis=1)

# Blocked stock (5-15% del total en algunos casos)
df['SPEME'] = np.where(
    np.random.random(n_materials) < 0.15,  # 15% tienen stock bloqueado
    (df['LABST'] * np.random.uniform(0.05, 0.15, n_materials)).astype(int),
    0
)

# ============================================
# 3. CONSUMPTION PATTERNS (I_MaterialDocumentItem aggregated)
# ============================================

def generate_consumption(row):
    base_consumption = {
        'A': (50, 250),   # High runners
        'B': (20, 100),   # Medium
        'C': (5, 30)      # Slow movers
    }

    min_cons, max_cons = base_consumption[row['ABC_IND']]
    avg = np.random.uniform(min_cons, max_cons)

    # Std dev es ~30% del promedio (volatilidad realista)
    std = avg * np.random.uniform(0.2, 0.4)

    # Max es ~2-3x el promedio (picos de demanda)
    max_val = avg * np.random.uniform(2.0, 3.5)

    return pd.Series({
        'avg': round(avg, 2),
        'std': round(std, 2),
        'max': round(max_val, 2)
    })

consumption = df.apply(generate_consumption, axis=1)
df['AVG_DAILY_CONSUMPTION'] = consumption['avg']
df['STD_DEV_CONSUMPTION'] = consumption['std']
df['MAX_DAILY_CONSUMPTION'] = consumption['max']

# D√≠as de cobertura actual (key metric!)
df['DAYS_OF_SUPPLY'] = np.round(df['LABST'] / df['AVG_DAILY_CONSUMPTION'].replace(0, 1), 1)

# ============================================
# 4. DEMAND TREND (Last 30 days vs previous 30)
# ============================================

def assign_trend(days_of_supply, abc):
    if days_of_supply < 10:
        # Si ya est√° bajo, probablemente hay tendencia creciente
        return np.random.choice(['Increasing', 'Stable', 'Decreasing'], p=[0.6, 0.3, 0.1])
    elif abc == 'A':
        return np.random.choice(['Increasing', 'Stable', 'Decreasing'], p=[0.3, 0.5, 0.2])
    else:
        return np.random.choice(['Increasing', 'Stable', 'Decreasing'], p=[0.2, 0.6, 0.2])

df['DEMAND_TREND_30D'] = df.apply(lambda x: assign_trend(x['DAYS_OF_SUPPLY'], x['ABC_IND']), axis=1)

# ============================================
# 5. PLANNING PARAMETERS (I_ProductPlantBasic)
# ============================================

# Safety stock = ~7-14 d√≠as de consumo promedio
df['SAFETY_STOCK'] = (df['AVG_DAILY_CONSUMPTION'] * np.random.uniform(7, 14, n_materials)).astype(int)

# Reorder point = safety stock + lead time demand
lead_time_days_options = [7, 10, 14, 21, 30, 45, 60]
df['LEAD_TIME_DAYS'] = np.random.choice(
    lead_time_days_options,
    n_materials,
    p=[0.05, 0.15, 0.25, 0.25, 0.15, 0.10, 0.05]  # Most common: 14-21 days
)

df['REORDER_POINT'] = (
    df['SAFETY_STOCK'] +
    (df['AVG_DAILY_CONSUMPTION'] * df['LEAD_TIME_DAYS'])
).astype(int)

# ============================================
# 6. SUPPLIER DATA (I_Supplier + I_PurchasingInfoRecord)
# ============================================

suppliers = [f'V{i:04d}' for i in range(1, n_suppliers+1)]
df['LIFNR'] = np.random.choice(suppliers, n_materials)

# Supplier reliability (based en on-time historicaldelivery)
supplier_reliability_map = {
    'V0001': 'High',    # 95%+ on-time
    'V0002': 'High',    # 93%+
    'V0003': 'Medium',  # 85-90%
    'V0004': 'Medium',  # 80-85%
    'V0005': 'Medium',  # 85-90%
    'V0006': 'Low',     # 70-75%
    'V0007': 'Low',     # 65-70%
    'V0008': 'High',    # 90%+
}

df['SUPPLIER_RELIABILITY'] = df['LIFNR'].map(supplier_reliability_map)

# Actual lead time vs planned (supplier performance)
df['ACTUAL_VS_PLANNED_LT'] = df['SUPPLIER_RELIABILITY'].map({
    'High': np.random.uniform(0.95, 1.05, n_materials),    # ¬±5%
    'Medium': np.random.uniform(1.0, 1.20, n_materials),   # +0-20%
    'Low': np.random.uniform(1.10, 1.40, n_materials)      # +10-40%
})

# ============================================
# 7. OPEN PURCHASE ORDERS (I_PurchaseOrderItemAPI01)
# ============================================

# 60% of materials have open PO
has_open_po = np.random.random(n_materials) < 0.60

df['OPEN_PO_QTY'] = np.where(
    has_open_po,
    (df['AVG_DAILY_CONSUMPTION'] * np.random.uniform(10, 30, n_materials)).astype(int),
    0
)

# Delivery date of open PO (days from today)
def calculate_po_delivery(row):
    if row['OPEN_PO_QTY'] == 0:
        return -999  # No open PO

    # Realistic: POs usually arrive within 7-30 days.
    # But some are delayed (>lead time)
    if row['SUPPLIER_RELIABILITY'] == 'Low' and np.random.random() < 0.3:
        # 30% of purchase orders from poor suppliers are delayed.
        return np.random.randint(row['LEAD_TIME_DAYS'] + 5, row['LEAD_TIME_DAYS'] + 20)
    else:
        return np.random.randint(3, min(row['LEAD_TIME_DAYS'] + 5, 30))

df['OPEN_PO_DELIVERY_DAYS'] = df.apply(calculate_po_delivery, axis=1)

# ============================================
# 8. ADDITIONAL RISK SIGNALS
# ============================================

# Ratio: stock actual vs reorder point
df['STOCK_VS_REORDER_RATIO'] = np.round(df['LABST'] / df['REORDER_POINT'].replace(0, 1), 2)

# Ratio: stock actual vs safety stock
df['STOCK_VS_SAFETY_RATIO'] = np.round(df['LABST'] / df['SAFETY_STOCK'].replace(0, 1), 2)

# Coverage until next instalment (if there is a PO)
df['COVERAGE_UNTIL_NEXT_PO'] = np.where(
    df['OPEN_PO_DELIVERY_DAYS'] > 0,
    df['DAYS_OF_SUPPLY'] - df['OPEN_PO_DELIVERY_DAYS'],
    -999
)

# ============================================
# 9. TARGET VARIABLE: STOCKOUT_RISK_14D
# ============================================

# Realistic logic for determining risk
def calculate_stockout_risk(row):
    """
    Calculate the risk of stockouts in the next 14 days
    Based on multiple factors, as an expert planner would do
    """
    risk_score = 0

    # Factor 1: Days of coverage
    if row['DAYS_OF_SUPPLY'] < 7:
        risk_score += 4
    elif row['DAYS_OF_SUPPLY'] < 14:
        risk_score += 3
    elif row['DAYS_OF_SUPPLY'] < 21:
        risk_score += 1

    # Factor 2: Stock vs safety stock
    if row['STOCK_VS_SAFETY_RATIO'] < 0.5:
        risk_score += 3
    elif row['STOCK_VS_SAFETY_RATIO'] < 1.0:
        risk_score += 2
    elif row['STOCK_VS_SAFETY_RATIO'] < 1.5:
        risk_score += 1

    # Factor 3: PO coverage
    if row['OPEN_PO_DELIVERY_DAYS'] == -999:  # No PO
        risk_score += 3
    elif row['OPEN_PO_DELIVERY_DAYS'] > row['DAYS_OF_SUPPLY']:
        risk_score += 2  # PO llega despu√©s de stockout
    elif row['OPEN_PO_DELIVERY_DAYS'] > 14:
        risk_score += 1  # PO lejana

    # Factor 4: Supplier reliability
    if row['SUPPLIER_RELIABILITY'] == 'Low':
        risk_score += 2
    elif row['SUPPLIER_RELIABILITY'] == 'Medium':
        risk_score += 1

    # Factor 5: Demand trend
    if row['DEMAND_TREND_30D'] == 'Increasing':
        risk_score += 2
    elif row['DEMAND_TREND_30D'] == 'Decreasing':
        risk_score -= 1

    # Factor 6: Criticality (ABC)
    if row['ABC_IND'] == 'A':
        risk_score += 1  # More sensitive to stockouts

    # Factor 7: Demand volatility
    cv = row['STD_DEV_CONSUMPTION'] / row['AVG_DAILY_CONSUMPTION']  # Coefficient of variation
    if cv > 0.5:  # High volatility
        risk_score += 2
    elif cv > 0.3:
        risk_score += 1

    # Final standings
    if risk_score >= 8:
        return 'HIGH'
    elif risk_score >= 5:
        return 'MEDIUM'
    else:
        return 'LOW'

# Create target for subset of materials (few-shot learning scenario)
df['STOCKOUT_RISK_14D'] = pd.NA
df['STOCKOUT_RISK_14D'] = df['STOCKOUT_RISK_14D'].astype('string')

# Select 80 materials with known labels (mixed by criticality)
high_risk_candidates = df[df.apply(calculate_stockout_risk, axis=1) == 'HIGH'].sample(n=min(25, len(df)), random_state=42)
medium_risk_candidates = df[df.apply(calculate_stockout_risk, axis=1) == 'MEDIUM'].sample(n=min(35, len(df)), random_state=42)
low_risk_candidates = df[df.apply(calculate_stockout_risk, axis=1) == 'LOW'].sample(n=min(20, len(df)), random_state=42)

known_indices = pd.concat([high_risk_candidates, medium_risk_candidates, low_risk_candidates]).index

for idx in known_indices:
    df.loc[idx, 'STOCKOUT_RISK_14D'] = calculate_stockout_risk(df.loc[idx])

# ============================================
# 10. FINAL DATASET
# ============================================

# Reorder columns to make it more readable
column_order = [
    'MATNR', 'WERKS', 'MTART', 'MATKL', 'ABC_IND', 'MRP_TYPE',
    'LABST', 'SPEME',
    'AVG_DAILY_CONSUMPTION', 'STD_DEV_CONSUMPTION', 'MAX_DAILY_CONSUMPTION',
    'DAYS_OF_SUPPLY', 'DEMAND_TREND_30D',
    'SAFETY_STOCK', 'REORDER_POINT', 'LEAD_TIME_DAYS',
    'STOCK_VS_REORDER_RATIO', 'STOCK_VS_SAFETY_RATIO',
    'LIFNR', 'SUPPLIER_RELIABILITY', 'ACTUAL_VS_PLANNED_LT',
    'OPEN_PO_QTY', 'OPEN_PO_DELIVERY_DAYS', 'COVERAGE_UNTIL_NEXT_PO',
    'STOCKOUT_RISK_14D'
]

df = df[column_order]

# Save to CSV
df.to_csv('realistic_stockout_risk_600_materials.csv', index=False)

# ============================================
# SUMMARY STATS
# ============================================

print("\n" + "="*60)
print("DATASET GENERATED SUCCESSFULLY")
print("="*60)

print(f"\nTotal materials: {len(df)}")
print(f"Plants: {df['WERKS'].nunique()} ‚Üí {df['WERKS'].unique().tolist()}")
print(f"Suppliers: {df['LIFNR'].nunique()}")

print(f"\n--- ABC Distribution ---")
print(df['ABC_IND'].value_counts().sort_index())

print(f"\n--- Known Risk Labels (few-shot examples) ---")
known_df = df[df['STOCKOUT_RISK_14D'].notna()]
print(f"Total labeled: {len(known_df)} materials ({len(known_df)/len(df)*100:.1f}%)")
print(known_df['STOCKOUT_RISK_14D'].value_counts())

print(f"\n--- Materials to predict ---")
print(f"Unlabeled: {df['STOCKOUT_RISK_14D'].isna().sum()} materials")

print(f"\n--- Stock Coverage Stats ---")
print(f"Days of supply - Mean: {df['DAYS_OF_SUPPLY'].mean():.1f}")
print(f"Days of supply - Median: {df['DAYS_OF_SUPPLY'].median():.1f}")
print(f"Materials with <14 days coverage: {(df['DAYS_OF_SUPPLY'] < 14).sum()} ({(df['DAYS_OF_SUPPLY'] < 14).sum()/len(df)*100:.1f}%)")
print(f"Materials with <7 days coverage: {(df['DAYS_OF_SUPPLY'] < 7).sum()} ({(df['DAYS_OF_SUPPLY'] < 7).sum()/len(df)*100:.1f}%)")

print(f"\n--- Open POs ---")
print(f"Materials with open PO: {(df['OPEN_PO_QTY'] > 0).sum()} ({(df['OPEN_PO_QTY'] > 0).sum()/len(df)*100:.1f}%)")
print(f"Materials without PO: {(df['OPEN_PO_QTY'] == 0).sum()}")

print(f"\n--- Supplier Reliability ---")
print(df['SUPPLIER_RELIABILITY'].value_counts())

print("\n" + "="*60)
print("File saved: realistic_stockout_risk_600_materials.csv")
print("="*60)

# Show sample
print("\nSample of HIGH RISK materials (known labels):")
sample_high = df[(df['STOCKOUT_RISK_14D'] == 'HIGH')].head(5)
print(sample_high[['MATNR', 'WERKS', 'LABST', 'DAYS_OF_SUPPLY', 'OPEN_PO_DELIVERY_DAYS', 'SUPPLIER_RELIABILITY', 'STOCKOUT_RISK_14D']])

Generating realistic SAP S/4HANA stockout risk dataset...

DATASET GENERATED SUCCESSFULLY

Total materials: 600
Plants: 3 ‚Üí ['1030', '1010', '1020']
Suppliers: 8

--- ABC Distribution ---
ABC_IND
A     91
B    202
C    307
Name: count, dtype: int64

--- Known Risk Labels (few-shot examples) ---
Total labeled: 80 materials (13.3%)
STOCKOUT_RISK_14D
MEDIUM    35
HIGH      25
LOW       20
Name: count, dtype: Int64

--- Materials to predict ---
Unlabeled: 520 materials

--- Stock Coverage Stats ---
Days of supply - Mean: 21.5
Days of supply - Median: 16.2
Materials with <14 days coverage: 264 (44.0%)
Materials with <7 days coverage: 128 (21.3%)

--- Open POs ---
Materials with open PO: 372 (62.0%)
Materials without PO: 228

--- Supplier Reliability ---
SUPPLIER_RELIABILITY
Medium    254
High      203
Low       143
Name: count, dtype: int64

File saved: realistic_stockout_risk_600_materials.csv

Sample of HIGH RISK materials (known labels):
        MATNR WERKS  LABST  DAYS_OF_SUPPLY  OPEN

In [1]:
from google.colab import drive
drive.mount('/content/drive')

df.to_csv('/content/drive/MyDrive/realistic_stockout_risk_600_materials.csv', index=False)


ValueError: mount failed

## Predict using SAP RPT-1

In [3]:
from sap_rpt_oss import SAP_RPT_OSS_Classifier
import time

# Load data
df = pd.read_csv('realistic_stockout_risk_600_materials.csv')

# Split
train_df = df[df['STOCKOUT_RISK_14D'].notna()].copy()
test_df = df[df['STOCKOUT_RISK_14D'].isna()].copy()

print(f"Few-shot examples: {len(train_df)} materials")
print(f"Materials to assess: {len(test_df)} materials")
print(f"\nRisk distribution in training set:")
print(train_df['STOCKOUT_RISK_14D'].value_counts())

# Initialize classifier
clf = SAP_RPT_OSS_Classifier(max_context_size=8192, bagging=8)

# Fit (in-context learning)
start = time.time()
clf.fit(
    train_df.drop('STOCKOUT_RISK_14D', axis=1),
    train_df['STOCKOUT_RISK_14D']
)
fit_time = time.time() - start
print(f"\nFit completed in {fit_time:.2f} seconds")

# Predict
start = time.time()
predictions = clf.predict(test_df.drop('STOCKOUT_RISK_14D', axis=1))
probabilities = clf.predict_proba(test_df.drop('STOCKOUT_RISK_14D', axis=1))
prediction_time = time.time() - start

print(f"Prediction on {len(test_df)} materials took {prediction_time:.3f} seconds")

# Add results
test_df = test_df.copy()
test_df['PREDICTED_RISK'] = predictions
test_df['CONFIDENCE'] = [max(p) for p in probabilities]

# Distribution of predictions
print(f"\n--- Prediction Distribution ---")
print(test_df['PREDICTED_RISK'].value_counts())

# Filter HIGH risk materials for action
high_risk = test_df[test_df['PREDICTED_RISK'] == 'HIGH'].sort_values('CONFIDENCE', ascending=False)

print(f"\n*** ALERT: {len(high_risk)} HIGH RISK materials identified ***")
print("\nTop 15 materials requiring immediate action:")

action_list = high_risk[[
    'MATNR', 'WERKS', 'ABC_IND',
    'LABST', 'DAYS_OF_SUPPLY', 'STOCK_VS_SAFETY_RATIO',
    'LEAD_TIME_DAYS', 'OPEN_PO_QTY', 'OPEN_PO_DELIVERY_DAYS',
    'LIFNR', 'SUPPLIER_RELIABILITY',
    'PREDICTED_RISK', 'CONFIDENCE'
]].head(15)

display(action_list)

# Save for procurement team
high_risk.to_csv('HIGH_RISK_materials_action_required.csv', index=False)
print("\n>>> Action list saved: HIGH_RISK_materials_action_required.csv")

# Summary by plant
print("\n--- HIGH RISK materials by Plant ---")
print(high_risk['WERKS'].value_counts())

# Summary by supplier
print("\n--- HIGH RISK materials by Supplier ---")
print(high_risk['LIFNR'].value_counts().head(5))

Few-shot examples: 80 materials
Materials to assess: 520 materials

Risk distribution in training set:
STOCKOUT_RISK_14D
MEDIUM    35
HIGH      25
LOW       20
Name: count, dtype: int64


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


2025-11-04_sap-rpt-one-oss.pt:   0%|          | 0.00/64.6M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Fit completed in 0.00 seconds
Prediction on 520 materials took 187.028 seconds

--- Prediction Distribution ---
PREDICTED_RISK
HIGH      192
MEDIUM    190
LOW       138
Name: count, dtype: int64

*** ALERT: 192 HIGH RISK materials identified ***

Top 15 materials requiring immediate action:


Unnamed: 0,MATNR,WERKS,ABC_IND,LABST,DAYS_OF_SUPPLY,STOCK_VS_SAFETY_RATIO,LEAD_TIME_DAYS,OPEN_PO_QTY,OPEN_PO_DELIVERY_DAYS,LIFNR,SUPPLIER_RELIABILITY,PREDICTED_RISK,CONFIDENCE
150,MAT000151,1020,C,40,1.7,0.21,30,0,-999,V0004,Medium,HIGH,0.998293
593,MAT000594,1030,C,63,2.8,0.21,21,0,-999,V0008,High,HIGH,0.997942
255,MAT000256,1030,B,144,3.6,0.3,14,1074,18,V0005,Medium,HIGH,0.997785
94,MAT000095,1010,A,235,2.7,0.24,21,0,-999,V0005,Medium,HIGH,0.99777
312,MAT000313,1030,B,62,0.7,0.07,30,0,-999,V0005,Medium,HIGH,0.997765
528,MAT000529,1020,B,267,3.2,0.28,21,2302,6,V0002,High,HIGH,0.997353
508,MAT000509,1020,C,41,2.8,0.22,21,200,22,V0003,Medium,HIGH,0.997278
332,MAT000333,1010,B,282,5.0,0.43,14,906,10,V0008,High,HIGH,0.997227
271,MAT000272,1020,C,32,1.5,0.17,21,335,18,V0004,Medium,HIGH,0.997191
434,MAT000435,1020,B,205,2.2,0.15,14,1206,5,V0005,Medium,HIGH,0.997186



>>> Action list saved: HIGH_RISK_materials_action_required.csv

--- HIGH RISK materials by Plant ---
WERKS
1010    74
1020    64
1030    54
Name: count, dtype: int64

--- HIGH RISK materials by Supplier ---
LIFNR
V0004    33
V0005    26
V0008    25
V0007    23
V0006    23
Name: count, dtype: int64


## Ground Truth for validation - Calculation
Calculate the ground truth for all test materials.
We use the same risk function that generated the training labels.

In [4]:
# Calculate the ground truth for all test materials.
# We use the same risk function that generated the training labels.

def calculate_stockout_risk(row):
    """
    Calcula riesgo de stockout en pr√≥ximos 14 d√≠as
    Basado en m√∫ltiples factores como lo har√≠a un planificador experto
    """
    risk_score = 0

    # Factor 1: Days of coverage
    if row['DAYS_OF_SUPPLY'] < 7:
        risk_score += 4
    elif row['DAYS_OF_SUPPLY'] < 14:
        risk_score += 3
    elif row['DAYS_OF_SUPPLY'] < 21:
        risk_score += 1

    # Factor 2: Stock vs safety stock
    if row['STOCK_VS_SAFETY_RATIO'] < 0.5:
        risk_score += 3
    elif row['STOCK_VS_SAFETY_RATIO'] < 1.0:
        risk_score += 2
    elif row['STOCK_VS_SAFETY_RATIO'] < 1.5:
        risk_score += 1

    # Factor 3: PO coverage
    if row['OPEN_PO_DELIVERY_DAYS'] == -999:  # No PO
        risk_score += 3
    elif row['OPEN_PO_DELIVERY_DAYS'] > row['DAYS_OF_SUPPLY']:
        risk_score += 2  # PO arrives after stockout
    elif row['OPEN_PO_DELIVERY_DAYS'] > 14:
        risk_score += 1  # distant PO

    # Factor 4: Supplier reliability
    if row['SUPPLIER_RELIABILITY'] == 'Low':
        risk_score += 2
    elif row['SUPPLIER_RELIABILITY'] == 'Medium':
        risk_score += 1

    # Factor 5: Demand trend
    if row['DEMAND_TREND_30D'] == 'Increasing':
        risk_score += 2
    elif row['DEMAND_TREND_30D'] == 'Decreasing':
        risk_score -= 1

    # Factor 6: Criticality (ABC)
    if row['ABC_IND'] == 'A':
        risk_score += 1  # More sensitive to stockouts

    # Factor 7: Demand volatility
    cv = row['STD_DEV_CONSUMPTION'] / row['AVG_DAILY_CONSUMPTION']  # Coefficient of variation
    if cv > 0.5:  # High volatility
        risk_score += 2
    elif cv > 0.3:
        risk_score += 1

    # Final standings
    if risk_score >= 8:
        return 'HIGH'
    elif risk_score >= 5:
        return 'MEDIUM'
    else:
        return 'LOW'

# Calcular ground truth para test set
print("Calculating ground truth for validation...")
test_df['TRUE_RISK'] = test_df.apply(calculate_stockout_risk, axis=1)

print("\n--- Ground Truth Distribution ---")
print(test_df['TRUE_RISK'].value_counts())

print("\n--- Predicted Distribution ---")
print(test_df['PREDICTED_RISK'].value_counts())

Calculating ground truth for validation...

--- Ground Truth Distribution ---
TRUE_RISK
HIGH      217
LOW       207
MEDIUM     96
Name: count, dtype: int64

--- Predicted Distribution ---
PREDICTED_RISK
HIGH      192
MEDIUM    190
LOW       138
Name: count, dtype: int64


# Complete ranking metrics

In [5]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

print("="*70)
print("CLASSIFICATION PERFORMANCE REPORT")
print("="*70)

# Overall accuracy
accuracy = accuracy_score(test_df['TRUE_RISK'], test_df['PREDICTED_RISK'])
print(f"\nOverall Accuracy: {accuracy:.2%}")

# Classification report detallado
print("\n--- Classification Report ---")
report = classification_report(
    test_df['TRUE_RISK'],
    test_df['PREDICTED_RISK'],
    labels=['HIGH', 'MEDIUM', 'LOW'],
    target_names=['HIGH', 'MEDIUM', 'LOW'],
    digits=3
)
print(report)

# Confusion Matrix
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(
    test_df['TRUE_RISK'],
    test_df['PREDICTED_RISK'],
    labels=['HIGH', 'MEDIUM', 'LOW']
)

cm_df = pd.DataFrame(
    cm,
    index=['True HIGH', 'True MEDIUM', 'True LOW'],
    columns=['Pred HIGH', 'Pred MEDIUM', 'Pred LOW']
)

print(cm_df)

# Interpretaci√≥n de la confusion matrix
print("\n--- Key Metrics Interpretation ---")
print(f"HIGH precision: {cm[0,0] / cm[:,0].sum():.2%} (when model says HIGH, how often is it right?)")
print(f"HIGH recall: {cm[0,0] / cm[0,:].sum():.2%} (of all true HIGH risk, how many did we catch?)")
print(f"HIGH ‚Üí MEDIUM misclassification: {cm[0,1]} materials (acceptable)")
print(f"HIGH ‚Üí LOW misclassification: {cm[0,2]} materials (CRITICAL if >0!)")

# False negatives cr√≠ticos (predijo LOW pero es HIGH)
false_negatives_critical = test_df[
    (test_df['TRUE_RISK'] == 'HIGH') &
    (test_df['PREDICTED_RISK'] == 'LOW')
]

if len(false_negatives_critical) > 0:
    print(f"\n‚ö†Ô∏è  WARNING: {len(false_negatives_critical)} critical false negatives (missed HIGH risk)")
    print("These materials need manual review:")
    print(false_negatives_critical[['MATNR', 'WERKS', 'DAYS_OF_SUPPLY', 'PREDICTED_RISK', 'TRUE_RISK', 'CONFIDENCE']].head())
else:
    print("\n‚úÖ No critical false negatives (no HIGH risk materials missed)")

CLASSIFICATION PERFORMANCE REPORT

Overall Accuracy: 75.77%

--- Classification Report ---
              precision    recall  f1-score   support

        HIGH      0.953     0.843     0.895       217
      MEDIUM      0.426     0.844     0.566        96
         LOW      0.942     0.628     0.754       207

    accuracy                          0.758       520
   macro avg      0.774     0.772     0.738       520
weighted avg      0.851     0.758     0.778       520


--- Confusion Matrix ---
             Pred HIGH  Pred MEDIUM  Pred LOW
True HIGH          183           34         0
True MEDIUM          7           81         8
True LOW             2           75       130

--- Key Metrics Interpretation ---
HIGH precision: 95.31% (when model says HIGH, how often is it right?)
HIGH recall: 84.33% (of all true HIGH risk, how many did we catch?)
HIGH ‚Üí MEDIUM misclassification: 34 materials (acceptable)
HIGH ‚Üí LOW misclassification: 0 materials (CRITICAL if >0!)

‚úÖ No critical fals

# Confidence analysis

In [6]:
print("="*70)
print("CONFIDENCE ANALYSIS")
print("="*70)

# Confidence stats por clase predicha
print("\n--- Confidence by Predicted Risk Level ---")
confidence_stats = test_df.groupby('PREDICTED_RISK')['CONFIDENCE'].describe()
print(confidence_stats)

# Simple visualisation (without matplotlib)
print("\n--- Confidence Distribution Summary ---")
for risk_level in ['HIGH', 'MEDIUM', 'LOW']:
    subset = test_df[test_df['PREDICTED_RISK'] == risk_level]
    if len(subset) > 0:
        mean_conf = subset['CONFIDENCE'].mean()
        min_conf = subset['CONFIDENCE'].min()
        max_conf = subset['CONFIDENCE'].max()
        print(f"{risk_level:6s}: Mean={mean_conf:.3f}, Min={min_conf:.3f}, Max={max_conf:.3f}")

# HIGH risk materials with low confidence (review manually)
high_risk_low_conf = test_df[
    (test_df['PREDICTED_RISK'] == 'HIGH') &
    (test_df['CONFIDENCE'] < 0.6)
]

if len(high_risk_low_conf) > 0:
    print(f"\n‚ö†Ô∏è  {len(high_risk_low_conf)} HIGH risk materials with LOW confidence (<0.6)")
    print("Recommend manual review for these:")
    print(high_risk_low_conf[['MATNR', 'WERKS', 'DAYS_OF_SUPPLY', 'CONFIDENCE', 'TRUE_RISK']].head(10))
else:
    print("\n‚úÖ All HIGH risk predictions have confidence ‚â•0.6")

# Accuracy stratified by confidence
print("\n--- Accuracy by Confidence Threshold ---")
for threshold in [0.5, 0.6, 0.7, 0.8]:
    high_conf = test_df[test_df['CONFIDENCE'] >= threshold]
    if len(high_conf) > 0:
        acc = accuracy_score(high_conf['TRUE_RISK'], high_conf['PREDICTED_RISK'])
        print(f"Confidence ‚â•{threshold}: {len(high_conf):3d} materials ({len(high_conf)/len(test_df)*100:5.1f}%) ‚Üí Accuracy: {acc:.2%}")

CONFIDENCE ANALYSIS

--- Confidence by Predicted Risk Level ---
                count      mean       std       min       25%       50%  \
PREDICTED_RISK                                                            
HIGH            192.0  0.889053  0.157195  0.487721  0.847547  0.975744   
LOW             138.0  0.804636  0.138522  0.513351  0.673703  0.852638   
MEDIUM          190.0  0.729020  0.111187  0.493695  0.654650  0.723618   

                     75%       max  
PREDICTED_RISK                      
HIGH            0.993458  0.998293  
LOW             0.920137  0.985059  
MEDIUM          0.820758  0.910633  

--- Confidence Distribution Summary ---
HIGH  : Mean=0.889, Min=0.488, Max=0.998
MEDIUM: Mean=0.729, Min=0.494, Max=0.911
LOW   : Mean=0.805, Min=0.513, Max=0.985

‚ö†Ô∏è  22 HIGH risk materials with LOW confidence (<0.6)
Recommend manual review for these:
         MATNR  WERKS  DAYS_OF_SUPPLY  CONFIDENCE TRUE_RISK
30   MAT000031   1020             9.7    0.598068      HI

# Business Sanity Checks

In [7]:
print("="*70)
print("BUSINESS VALIDATION - SANITY CHECKS")
print("="*70)

# Check 1: HIGH risk should have low coverage days
print("\n--- Check 1: Days of Supply by Risk Level ---")
for risk_level in ['HIGH', 'MEDIUM', 'LOW']:
    subset = test_df[test_df['PREDICTED_RISK'] == risk_level]
    if len(subset) > 0:
        mean_dos = subset['DAYS_OF_SUPPLY'].mean()
        median_dos = subset['DAYS_OF_SUPPLY'].median()
        print(f"{risk_level:6s}: Mean={mean_dos:5.1f} days, Median={median_dos:5.1f} days")

# Check 2: HIGH risk should have more materials without PO
print("\n--- Check 2: Open PO Coverage by Risk Level ---")
for risk_level in ['HIGH', 'MEDIUM', 'LOW']:
    subset = test_df[test_df['PREDICTED_RISK'] == risk_level]
    if len(subset) > 0:
        no_po_pct = (subset['OPEN_PO_QTY'] == 0).sum() / len(subset) * 100
        print(f"{risk_level:6s}: {no_po_pct:5.1f}% without open PO")

# Check 3: HIGH risk should have more Low reliability suppliers
print("\n--- Check 3: Supplier Reliability by Risk Level ---")
for risk_level in ['HIGH', 'MEDIUM', 'LOW']:
    subset = test_df[test_df['PREDICTED_RISK'] == risk_level]
    if len(subset) > 0:
        low_reliability_pct = (subset['SUPPLIER_RELIABILITY'] == 'Low').sum() / len(subset) * 100
        print(f"{risk_level:6s}: {low_reliability_pct:5.1f}% with Low reliability suppliers")

# Check 4: HIGH risk should have low stock vs safety stock
print("\n--- Check 4: Stock vs Safety Stock Ratio by Risk Level ---")
for risk_level in ['HIGH', 'MEDIUM', 'LOW']:
    subset = test_df[test_df['PREDICTED_RISK'] == risk_level]
    if len(subset) > 0:
        mean_ratio = subset['STOCK_VS_SAFETY_RATIO'].mean()
        below_safety = (subset['STOCK_VS_SAFETY_RATIO'] < 1.0).sum() / len(subset) * 100
        print(f"{risk_level:6s}: Mean ratio={mean_ratio:.2f}, {below_safety:5.1f}% below safety stock")

# Check 5: Class A materials should be more represented in HIGH risk.
print("\n--- Check 5: ABC Distribution by Risk Level ---")
for risk_level in ['HIGH', 'MEDIUM', 'LOW']:
    subset = test_df[test_df['PREDICTED_RISK'] == risk_level]
    if len(subset) > 0:
        abc_dist = subset['ABC_IND'].value_counts(normalize=True) * 100
        print(f"\n{risk_level}:")
        print(abc_dist.to_string())

print("\n" + "="*70)
print("‚úÖ Business validation complete")
print("="*70)

BUSINESS VALIDATION - SANITY CHECKS

--- Check 1: Days of Supply by Risk Level ---
HIGH  : Mean=  6.3 days, Median=  6.1 days
MEDIUM: Mean= 20.6 days, Median= 18.6 days
LOW   : Mean= 43.2 days, Median= 37.6 days

--- Check 2: Open PO Coverage by Risk Level ---
HIGH  :  40.1% without open PO
MEDIUM:  50.0% without open PO
LOW   :  17.4% without open PO

--- Check 3: Supplier Reliability by Risk Level ---
HIGH  :  24.0% with Low reliability suppliers
MEDIUM:  28.9% with Low reliability suppliers
LOW   :  18.8% with Low reliability suppliers

--- Check 4: Stock vs Safety Stock Ratio by Risk Level ---
HIGH  : Mean ratio=0.61,  87.0% below safety stock
MEDIUM: Mean ratio=2.04,   8.9% below safety stock
LOW   : Mean ratio=4.31,   0.0% below safety stock

--- Check 5: ABC Distribution by Risk Level ---

HIGH:
ABC_IND
B    42.708333
C    34.895833
A    22.395833

MEDIUM:
ABC_IND
C    50.526316
B    34.736842
A    14.736842

LOW:
ABC_IND
C    71.014493
B    21.739130
A     7.246377

‚úÖ Busines

# Train vs Test Comparison (Distribution Shift)

In [8]:
print("="*70)
print("TRAIN vs TEST DISTRIBUTION CHECK")
print("="*70)

# Compare distributions of key features
key_features = ['DAYS_OF_SUPPLY', 'STOCK_VS_SAFETY_RATIO', 'LEAD_TIME_DAYS']

print("\n--- Feature Distribution Comparison ---")
for feature in key_features:
    print(f"\n{feature}:")
    print(f"  Train: Mean={train_df[feature].mean():6.2f}, Std={train_df[feature].std():6.2f}")
    print(f"  Test:  Mean={test_df[feature].mean():6.2f}, Std={test_df[feature].std():6.2f}")

# Compare target distribution
print("\n--- Risk Distribution Comparison ---")
print("\nTraining set:")
print(train_df['STOCKOUT_RISK_14D'].value_counts(normalize=True).apply(lambda x: f"{x:.1%}"))

print("\nTest set (Ground Truth):")
print(test_df['TRUE_RISK'].value_counts(normalize=True).apply(lambda x: f"{x:.1%}"))

print("\nTest set (Predictions):")
print(test_df['PREDICTED_RISK'].value_counts(normalize=True).apply(lambda x: f"{x:.1%}"))

# Check: ¬øLas distribuciones son similares?
train_high_pct = (train_df['STOCKOUT_RISK_14D'] == 'HIGH').sum() / len(train_df)
test_high_pct = (test_df['TRUE_RISK'] == 'HIGH').sum() / len(test_df)
pred_high_pct = (test_df['PREDICTED_RISK'] == 'HIGH').sum() / len(test_df)

print(f"\nHIGH risk percentage:")
print(f"  Training:   {train_high_pct:.1%}")
print(f"  Test (true): {test_high_pct:.1%}")
print(f"  Test (pred): {pred_high_pct:.1%}")

if abs(train_high_pct - test_high_pct) < 0.05:
    print("\n‚úÖ Train and test distributions are similar (good)")
else:
    print("\n‚ö†Ô∏è  Distribution shift detected between train and test")

TRAIN vs TEST DISTRIBUTION CHECK

--- Feature Distribution Comparison ---

DAYS_OF_SUPPLY:
  Train: Mean= 22.46, Std= 18.67
  Test:  Mean= 21.33, Std= 19.26

STOCK_VS_SAFETY_RATIO:
  Train: Mean=  2.16, Std=  1.89
  Test:  Mean=  2.12, Std=  1.99

LEAD_TIME_DAYS:
  Train: Mean= 22.88, Std= 13.10
  Test:  Mean= 22.71, Std= 13.83

--- Risk Distribution Comparison ---

Training set:
STOCKOUT_RISK_14D
MEDIUM    43.8%
HIGH      31.2%
LOW       25.0%
Name: proportion, dtype: object

Test set (Ground Truth):
TRUE_RISK
HIGH      41.7%
LOW       39.8%
MEDIUM    18.5%
Name: proportion, dtype: object

Test set (Predictions):
PREDICTED_RISK
HIGH      36.9%
MEDIUM    36.5%
LOW       26.5%
Name: proportion, dtype: object

HIGH risk percentage:
  Training:   31.2%
  Test (true): 41.7%
  Test (pred): 36.9%

‚ö†Ô∏è  Distribution shift detected between train and test


# Complete export of results

In [9]:
print("="*70)
print("EXPORTING RESULTS")
print("="*70)

# 1. Export HIGH risk with all columns + prediction
high_risk_full = test_df[test_df['PREDICTED_RISK'] == 'HIGH'].copy()
high_risk_full = high_risk_full.sort_values('CONFIDENCE', ascending=False)
high_risk_full.to_csv('HIGH_RISK_materials_full_analysis.csv', index=False)
print(f"\n‚úÖ Exported {len(high_risk_full)} HIGH risk materials")
print("   File: HIGH_RISK_materials_full_analysis.csv")

# 2. Export validation report (predicci√≥n vs ground truth)
validation_df = test_df[['MATNR', 'WERKS', 'ABC_IND', 'DAYS_OF_SUPPLY',
                          'OPEN_PO_DELIVERY_DAYS', 'SUPPLIER_RELIABILITY',
                          'TRUE_RISK', 'PREDICTED_RISK', 'CONFIDENCE']].copy()
validation_df['MATCH'] = validation_df['TRUE_RISK'] == validation_df['PREDICTED_RISK']
validation_df.to_csv('validation_results.csv', index=False)
print(f"\n‚úÖ Exported validation results for all {len(validation_df)} test materials")
print("   File: validation_results.csv")

# 3. Export summary statistics
summary_stats = {
    'Total Materials': len(test_df),
    'Accuracy': accuracy_score(test_df['TRUE_RISK'], test_df['PREDICTED_RISK']),
    'HIGH Precision': cm[0,0] / cm[:,0].sum() if cm[:,0].sum() > 0 else 0,
    'HIGH Recall': cm[0,0] / cm[0,:].sum() if cm[0,:].sum() > 0 else 0,
    'HIGH Predicted': (test_df['PREDICTED_RISK'] == 'HIGH').sum(),
    'HIGH True': (test_df['TRUE_RISK'] == 'HIGH').sum(),
    'Avg Confidence HIGH': test_df[test_df['PREDICTED_RISK'] == 'HIGH']['CONFIDENCE'].mean(),
    'Prediction Time (sec)': prediction_time,
    'Throughput (materials/sec)': len(test_df) / prediction_time
}

summary_df = pd.DataFrame([summary_stats])
summary_df.to_csv('model_performance_summary.csv', index=False)
print(f"\n‚úÖ Exported performance summary")
print("   File: model_performance_summary.csv")

print("\n" + "="*70)
print("ALL EXPORTS COMPLETE")
print("="*70)

# Display summary
print("\n--- Final Summary ---")
for key, value in summary_stats.items():
    if 'Time' in key or 'Throughput' in key:
        print(f"{key:30s}: {value:.3f}")
    elif 'Precision' in key or 'Recall' in key or 'Accuracy' in key or 'Confidence' in key:
        print(f"{key:30s}: {value:.2%}")
    else:
        print(f"{key:30s}: {value}")

EXPORTING RESULTS

‚úÖ Exported 192 HIGH risk materials
   File: HIGH_RISK_materials_full_analysis.csv

‚úÖ Exported validation results for all 520 test materials
   File: validation_results.csv

‚úÖ Exported performance summary
   File: model_performance_summary.csv

ALL EXPORTS COMPLETE

--- Final Summary ---
Total Materials               : 520
Accuracy                      : 75.77%
HIGH Precision                : 95.31%
HIGH Recall                   : 84.33%
HIGH Predicted                : 192
HIGH True                     : 217
Avg Confidence HIGH           : 88.91%
Prediction Time (sec)         : 187.028
Throughput (materials/sec)    : 2.780
