In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)
print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## Step 1: Load Drug Profile Data

We'll load substance data including:
- Neurotransmitter mechanism buckets (GABA, serotonin, dopamine, etc.)
- Categories (stimulant, depressant, psychedelic, etc.)
- Pharmacokinetic properties (half-life, duration)
- Known safety flags (MAOI, respiratory depression, seizure risk)

In [2]:
def load_drug_profiles(data_path='../../data_collector/processed/drug_profiles.json'):
    """
    Load drug profile data from JSON or CSV.
    
    Expected structure:
    {
        "substance_name": {
            "categories": ["stimulant", "entactogen"],
            "neuro_buckets": {
                "serotonin_release": 0.9,
                "dopamine_release": 0.3,
                "stimulant": 0.6
            },
            "half_life_hours": 8,
            "duration_hours": 6,
            "safety_flags": ["serotonergic"],
            "known_tolerance_model": true
        }
    }
    """
    try:
        with open(data_path, 'r') as f:
            profiles = json.load(f)
        print(f"✓ Loaded {len(profiles)} drug profiles")
        return profiles
    except FileNotFoundError:
        print(f"⚠ File not found: {data_path}")
        print("Creating synthetic sample data for demonstration...")
        return create_sample_data()

def create_sample_data():
    """Create sample drug profiles for demonstration"""
    return {
        "MDMA": {
            "categories": ["stimulant", "entactogen"],
            "neuro_buckets": {"serotonin_release": 0.9, "dopamine_release": 0.3, "stimulant": 0.6},
            "half_life_hours": 8,
            "duration_hours": 6,
            "safety_flags": ["serotonergic"]
        },
        "LSD": {
            "categories": ["psychedelic"],
            "neuro_buckets": {"serotonin_agonist": 0.95, "dopamine_modulation": 0.2},
            "half_life_hours": 5,
            "duration_hours": 12,
            "safety_flags": ["serotonergic"]
        },
        "Alcohol": {
            "categories": ["depressant"],
            "neuro_buckets": {"gaba_positive": 0.8, "nmda_antagonist": 0.3},
            "half_life_hours": 1,
            "duration_hours": 4,
            "safety_flags": ["respiratory_depressant", "gaba"]
        },
        "Cannabis": {
            "categories": ["cannabinoid"],
            "neuro_buckets": {"cb1_agonist": 0.9},
            "half_life_hours": 20,
            "duration_hours": 4,
            "safety_flags": []
        },
        "Cocaine": {
            "categories": ["stimulant"],
            "neuro_buckets": {"dopamine_reuptake": 0.9, "stimulant": 0.95},
            "half_life_hours": 1,
            "duration_hours": 2,
            "safety_flags": ["cardiovascular"]
        }
    }

# Load data
drug_profiles = load_drug_profiles()
print(f"\nSample drugs: {list(drug_profiles.keys())[:5]}")

⚠ File not found: ../../data_collector/processed/drug_profiles.json
Creating synthetic sample data for demonstration...

Sample drugs: ['MDMA', 'LSD', 'Alcohol', 'Cannabis', 'Cocaine']


## Step 2: Extract Features from Drug Profiles

Convert drug profiles into numerical feature vectors for ML processing.

In [3]:
def extract_drug_features(drug_profile):
    """
    Extract numerical features from a drug profile.
    Returns a feature vector suitable for similarity comparison.
    """
    features = {}
    
    # Neurotransmitter bucket weights
    neuro_buckets = drug_profile.get('neuro_buckets', {})
    for bucket in ['serotonin_release', 'serotonin_agonist', 'dopamine_release', 
                   'dopamine_reuptake', 'gaba_positive', 'nmda_antagonist', 
                   'stimulant', 'cb1_agonist', 'opioid', 'dopamine_modulation']:
        features[bucket] = neuro_buckets.get(bucket, 0.0)
    
    # Pharmacokinetics (normalized)
    features['half_life_norm'] = min(drug_profile.get('half_life_hours', 0) / 24, 1.0)
    features['duration_norm'] = min(drug_profile.get('duration_hours', 0) / 24, 1.0)
    
    # Category indicators (binary)
    categories = drug_profile.get('categories', [])
    for cat in ['stimulant', 'depressant', 'psychedelic', 'entactogen', 'dissociative', 'cannabinoid']:
        features[f'cat_{cat}'] = 1 if cat in categories else 0
    
    # Safety flag indicators
    safety_flags = drug_profile.get('safety_flags', [])
    for flag in ['serotonergic', 'respiratory_depressant', 'gaba', 'maoi', 'cardiovascular', 'seizure_risk']:
        features[f'flag_{flag}'] = 1 if flag in safety_flags else 0
    
    return features

# Extract features for all drugs
drug_features = {}
for drug_name, profile in drug_profiles.items():
    drug_features[drug_name] = extract_drug_features(profile)

# Convert to DataFrame
features_df = pd.DataFrame(drug_features).T
print(f"✓ Extracted features for {len(features_df)} drugs")
print(f"Feature dimensions: {features_df.shape}")
print("\nFeature columns:")
print(features_df.columns.tolist())
print("\nSample features:")
features_df.head()

✓ Extracted features for 5 drugs
Feature dimensions: (5, 24)

Feature columns:
['serotonin_release', 'serotonin_agonist', 'dopamine_release', 'dopamine_reuptake', 'gaba_positive', 'nmda_antagonist', 'stimulant', 'cb1_agonist', 'opioid', 'dopamine_modulation', 'half_life_norm', 'duration_norm', 'cat_stimulant', 'cat_depressant', 'cat_psychedelic', 'cat_entactogen', 'cat_dissociative', 'cat_cannabinoid', 'flag_serotonergic', 'flag_respiratory_depressant', 'flag_gaba', 'flag_maoi', 'flag_cardiovascular', 'flag_seizure_risk']

Sample features:


Unnamed: 0,serotonin_release,serotonin_agonist,dopamine_release,dopamine_reuptake,gaba_positive,nmda_antagonist,stimulant,cb1_agonist,opioid,dopamine_modulation,...,cat_psychedelic,cat_entactogen,cat_dissociative,cat_cannabinoid,flag_serotonergic,flag_respiratory_depressant,flag_gaba,flag_maoi,flag_cardiovascular,flag_seizure_risk
MDMA,0.9,0.0,0.3,0.0,0.0,0.0,0.6,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
LSD,0.0,0.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Alcohol,0.0,0.0,0.0,0.0,0.8,0.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
Cannabis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Cocaine,0.0,0.0,0.0,0.9,0.0,0.0,0.95,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Step 3: Load Known Interaction Data

Load documented drug combinations with their risk classifications.

Risk categories (from lowest to highest):
- **Low Risk**: Generally safe, minimal interaction
- **Caution**: Some interaction potential, requires monitoring
- **Unsafe**: Significant risks, not recommended
- **Dangerous**: Life-threatening combination, never mix

In [4]:
def load_known_interactions(data_path='../../data_collector/processed/known_interactions.json'):
    """
    Load documented drug interaction classifications.
    
    Expected structure:
    [
        {
            "drug_a": "MDMA",
            "drug_b": "Alcohol",
            "classification": "Dangerous",
            "reasoning": ["Serotonergic + depressant", "Dehydration risk"]
        }
    ]
    """
    try:
        with open(data_path, 'r') as f:
            interactions = json.load(f)
        print(f"✓ Loaded {len(interactions)} known interactions")
        return interactions
    except FileNotFoundError:
        print(f"⚠ File not found: {data_path}")
        print("Creating synthetic sample interaction data...")
        return create_sample_interactions()

def create_sample_interactions():
    """Create sample known interactions for demonstration"""
    return [
        {"drug_a": "MDMA", "drug_b": "Alcohol", "classification": "Dangerous", 
         "reasoning": ["Serotonergic + depressant", "Dehydration risk", "Liver toxicity"]},
        {"drug_a": "LSD", "drug_b": "Cannabis", "classification": "Caution",
         "reasoning": ["Can intensify psychedelic effects", "Anxiety risk in some users"]},
        {"drug_a": "Cocaine", "drug_b": "Alcohol", "classification": "Dangerous",
         "reasoning": ["Forms cocaethylene", "Cardiotoxic", "Additive cardiovascular stress"]},
        {"drug_a": "MDMA", "drug_b": "LSD", "classification": "Caution",
         "reasoning": ["Both serotonergic", "Intensified effects", "Extended duration"]},
        {"drug_a": "Cannabis", "drug_b": "Alcohol", "classification": "Caution",
         "reasoning": ["Increased intoxication", "Nausea risk", "Impaired coordination"]}
    ]

# Load known interactions
known_interactions = load_known_interactions()
print(f"\nSample interactions: {len(known_interactions)}")
for interaction in known_interactions[:3]:
    print(f"  {interaction['drug_a']} + {interaction['drug_b']} = {interaction['classification']}")

⚠ File not found: ../../data_collector/processed/known_interactions.json
Creating synthetic sample interaction data...

Sample interactions: 5
  MDMA + Alcohol = Dangerous
  LSD + Cannabis = Caution
  Cocaine + Alcohol = Dangerous


## Step 4: Create Combination Features

For each drug pair, create a combined feature vector representing the interaction profile.

In [5]:
def create_combination_features(drug_a_name, drug_b_name, features_df):
    """
    Create feature vector for a drug combination.
    
    Strategy:
    - Element-wise addition for mechanism overlap
    - Maximum for safety flags (conservative)
    - Difference metrics for pharmacokinetic mismatch
    """
    if drug_a_name not in features_df.index or drug_b_name not in features_df.index:
        return None
    
    feat_a = features_df.loc[drug_a_name]
    feat_b = features_df.loc[drug_b_name]
    
    combo_features = {}
    
    # Mechanism overlap (additive load)
    neuro_cols = [col for col in features_df.columns if not col.startswith(('cat_', 'flag_', 'half_life', 'duration'))]
    for col in neuro_cols:
        combo_features[f'{col}_sum'] = feat_a[col] + feat_b[col]
        combo_features[f'{col}_max'] = max(feat_a[col], feat_b[col])
    
    # Category overlap
    cat_cols = [col for col in features_df.columns if col.startswith('cat_')]
    for col in cat_cols:
        combo_features[f'{col}_both'] = 1 if (feat_a[col] == 1 and feat_b[col] == 1) else 0
        combo_features[f'{col}_either'] = 1 if (feat_a[col] == 1 or feat_b[col] == 1) else 0
    
    # Safety flag overlap (conservative - max)
    flag_cols = [col for col in features_df.columns if col.startswith('flag_')]
    for col in flag_cols:
        combo_features[f'{col}_any'] = max(feat_a[col], feat_b[col])
    
    # Pharmacokinetic mismatch
    combo_features['half_life_diff'] = abs(feat_a['half_life_norm'] - feat_b['half_life_norm'])
    combo_features['duration_diff'] = abs(feat_a['duration_norm'] - feat_b['duration_norm'])
    
    return combo_features

# Create training dataset from known interactions
X_train = []
y_train = []
interaction_pairs = []

for interaction in known_interactions:
    combo_feat = create_combination_features(
        interaction['drug_a'], 
        interaction['drug_b'], 
        features_df
    )
    if combo_feat is not None:
        X_train.append(combo_feat)
        y_train.append(interaction['classification'])
        interaction_pairs.append((interaction['drug_a'], interaction['drug_b']))

X_train_df = pd.DataFrame(X_train)
y_train_series = pd.Series(y_train)

print(f"✓ Created {len(X_train_df)} training samples")
print(f"Feature dimensions: {X_train_df.shape}")
print(f"\nClassification distribution:")
print(y_train_series.value_counts())

✓ Created 5 training samples
Feature dimensions: (5, 40)

Classification distribution:
Caution      3
Dangerous    2
Name: count, dtype: int64


## Step 5: Train Similarity-Based Classifier

Train a conservative classifier that uses mechanism similarity to infer risk categories.

**Key constraint**: The model is not allowed to output "Low Risk" for undocumented combinations.

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Encode target labels (ordered by severity)
risk_order = ['Low Risk', 'Caution', 'Unsafe', 'Dangerous']
label_encoder = {label: idx for idx, label in enumerate(risk_order)}
label_decoder = {idx: label for label, idx in label_encoder.items()}

y_train_encoded = y_train_series.map(label_encoder)

# Train Random Forest classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    class_weight='balanced'  # Handle imbalanced classes
)

rf_model.fit(X_train_df, y_train_encoded)

print("✓ Model trained successfully")
print(f"Feature importances (top 10):")
feature_importance = pd.DataFrame({
    'feature': X_train_df.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10).to_string(index=False))

✓ Model trained successfully
Feature importances (top 10):
                        feature  importance
flag_respiratory_depressant_any        0.50
            nmda_antagonist_max        0.25
            nmda_antagonist_sum        0.25
          serotonin_release_sum        0.00
           dopamine_release_sum        0.00
          serotonin_release_max        0.00
          serotonin_agonist_sum        0.00
          serotonin_agonist_max        0.00
          dopamine_reuptake_max        0.00
          dopamine_reuptake_sum        0.00


## Step 6: Implement Conservative Inference Function

Create a function that:
1. Compares unknown combination to known ones
2. Outputs minimum "Caution" for undocumented pairs
3. Provides uncertainty score
4. Explains reasoning

In [7]:
def predict_interaction_risk(drug_a_name, drug_b_name, 
                            features_df, model, 
                            known_interactions, 
                            min_confidence_threshold=0.6):
    """
    Predict interaction risk for a drug combination.
    
    Returns:
    {
        "classification": str,
        "confidence": str (Low/Medium/High),
        "confidence_score": float,
        "reasoning": list[str],
        "similar_combinations": list[dict],
        "note": str
    }
    """
    # Check if combination is already documented
    for known in known_interactions:
        if {known['drug_a'], known['drug_b']} == {drug_a_name, drug_b_name}:
            return {
                "classification": known['classification'],
                "confidence": "High",
                "confidence_score": 1.0,
                "reasoning": known['reasoning'],
                "similar_combinations": [],
                "note": "This is a documented interaction from expert-curated data."
            }
    
    # Extract combination features
    combo_feat = create_combination_features(drug_a_name, drug_b_name, features_df)
    if combo_feat is None:
        return {
            "classification": "Unknown",
            "confidence": "N/A",
            "confidence_score": 0.0,
            "reasoning": ["One or both substances not in database"],
            "similar_combinations": [],
            "note": "Cannot assess - insufficient data."
        }
    
    # Predict using model
    X_pred = pd.DataFrame([combo_feat])
    pred_proba = model.predict_proba(X_pred)[0]
    pred_class_idx = model.predict(X_pred)[0]
    pred_class = label_decoder[pred_class_idx]
    confidence_score = pred_proba[pred_class_idx]
    
    # Enforce minimum "Caution" for undocumented combinations
    if pred_class == "Low Risk":
        pred_class = "Caution"
        confidence_score = max(0.5, confidence_score * 0.8)  # Reduce confidence
    
    # Determine confidence level
    if confidence_score >= 0.75:
        confidence = "High"
    elif confidence_score >= 0.5:
        confidence = "Medium"
    else:
        confidence = "Low"
    
    # Generate reasoning based on features
    reasoning = []
    drug_a_profile = drug_profiles.get(drug_a_name, {})
    drug_b_profile = drug_profiles.get(drug_b_name, {})
    
    # Check mechanism overlaps
    neuro_a = drug_a_profile.get('neuro_buckets', {})
    neuro_b = drug_b_profile.get('neuro_buckets', {})
    
    shared_mechanisms = []
    for mechanism in neuro_a:
        if mechanism in neuro_b and neuro_a[mechanism] > 0.3 and neuro_b[mechanism] > 0.3:
            shared_mechanisms.append(mechanism.replace('_', ' ').title())
    
    if shared_mechanisms:
        reasoning.append(f"Shared mechanisms: {', '.join(shared_mechanisms)}")
    
    # Check safety flags
    flags_a = set(drug_a_profile.get('safety_flags', []))
    flags_b = set(drug_b_profile.get('safety_flags', []))
    shared_flags = flags_a & flags_b
    
    if shared_flags:
        reasoning.append(f"Both substances affect: {', '.join(shared_flags)}")
    
    # Check category interactions
    cats_a = set(drug_a_profile.get('categories', []))
    cats_b = set(drug_b_profile.get('categories', []))
    
    if 'stimulant' in cats_a and 'depressant' in cats_b:
        reasoning.append("Stimulant + depressant combination")
    elif 'stimulant' in cats_b and 'depressant' in cats_a:
        reasoning.append("Stimulant + depressant combination")
    
    # Find similar documented combinations
    similar_combos = find_similar_combinations(
        drug_a_name, drug_b_name, 
        features_df, known_interactions,
        top_k=3
    )
    
    if not reasoning:
        reasoning.append("Limited mechanistic overlap detected")
        reasoning.append("Insufficient human data for this specific combination")
    
    return {
        "classification": pred_class,
        "confidence": confidence,
        "confidence_score": float(confidence_score),
        "reasoning": reasoning,
        "similar_combinations": similar_combos,
        "note": "This classification is inferred from pharmacological similarity and should be treated as a conservative guideline, not a statement of safety."
    }

def find_similar_combinations(drug_a_name, drug_b_name, 
                             features_df, known_interactions, 
                             top_k=3):
    """Find known combinations with similar mechanism profiles"""
    target_feat = create_combination_features(drug_a_name, drug_b_name, features_df)
    if target_feat is None:
        return []
    
    similarities = []
    for known in known_interactions:
        known_feat = create_combination_features(known['drug_a'], known['drug_b'], features_df)
        if known_feat is None:
            continue
        
        # Compute cosine similarity
        target_vec = np.array(list(target_feat.values())).reshape(1, -1)
        known_vec = np.array(list(known_feat.values())).reshape(1, -1)
        
        similarity = cosine_similarity(target_vec, known_vec)[0][0]
        
        similarities.append({
            'drug_a': known['drug_a'],
            'drug_b': known['drug_b'],
            'classification': known['classification'],
            'similarity': float(similarity)
        })
    
    similarities.sort(key=lambda x: x['similarity'], reverse=True)
    return similarities[:top_k]

print("✓ Inference function ready")

✓ Inference function ready


## Step 7: Test the Model

Test predictions on sample combinations, including both documented and undocumented pairs.

In [8]:
# Test on documented combinations (should match known classification)
print("=" * 60)
print("Testing on DOCUMENTED combinations:")
print("=" * 60)

test_pairs = [
    ("MDMA", "Alcohol"),
    ("LSD", "Cannabis"),
    ("Cocaine", "Alcohol")
]

for drug_a, drug_b in test_pairs:
    result = predict_interaction_risk(
        drug_a, drug_b, 
        features_df, rf_model, 
        known_interactions
    )
    print(f"\n{drug_a} + {drug_b}")
    print(f"  Classification: {result['classification']} ({result['confidence']} confidence)")
    print(f"  Confidence Score: {result['confidence_score']:.2f}")
    print(f"  Reasoning:")
    for reason in result['reasoning']:
        print(f"    - {reason}")

# Test on undocumented combinations
print("\n" + "=" * 60)
print("Testing on UNDOCUMENTED combinations:")
print("=" * 60)

undocumented_pairs = [
    ("MDMA", "Cannabis"),
    ("LSD", "Cocaine")
]

for drug_a, drug_b in undocumented_pairs:
    result = predict_interaction_risk(
        drug_a, drug_b, 
        features_df, rf_model, 
        known_interactions
    )
    print(f"\n{drug_a} + {drug_b}")
    print(f"  Classification: {result['classification']} ({result['confidence']} confidence)")
    print(f"  Confidence Score: {result['confidence_score']:.2f}")
    print(f"  Reasoning:")
    for reason in result['reasoning']:
        print(f"    - {reason}")
    print(f"  Similar documented combinations:")
    for sim in result['similar_combinations']:
        print(f"    - {sim['drug_a']} + {sim['drug_b']}: {sim['classification']} (similarity: {sim['similarity']:.2f})")
    print(f"  Note: {result['note']}")

Testing on DOCUMENTED combinations:

MDMA + Alcohol
  Classification: Dangerous (High confidence)
  Confidence Score: 1.00
  Reasoning:
    - Serotonergic + depressant
    - Dehydration risk
    - Liver toxicity

LSD + Cannabis
  Classification: Caution (High confidence)
  Confidence Score: 1.00
  Reasoning:
    - Can intensify psychedelic effects
    - Anxiety risk in some users

Cocaine + Alcohol
  Classification: Dangerous (High confidence)
  Confidence Score: 1.00
  Reasoning:
    - Forms cocaethylene
    - Cardiotoxic
    - Additive cardiovascular stress

Testing on UNDOCUMENTED combinations:

MDMA + Cannabis
  Classification: Caution (Low confidence)
  Confidence Score: 0.47
  Reasoning:
    - Limited mechanistic overlap detected
    - Insufficient human data for this specific combination
  Similar documented combinations:
    - MDMA + LSD: Caution (similarity: 0.66)
    - MDMA + Alcohol: Dangerous (similarity: 0.62)
    - LSD + Cannabis: Caution (similarity: 0.52)
  Note: This c

## Step 8: Save Model and Feature Extractors

Export the trained model and feature extraction logic for use in the API service.

In [9]:
import pickle
from pathlib import Path

# Create models directory
models_dir = Path('models')
models_dir.mkdir(exist_ok=True)

# Save the trained model
model_path = models_dir / 'interaction_classifier.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(rf_model, f)
print(f"✓ Model saved to {model_path}")

# Save label encoding
encoding_path = models_dir / 'label_encoding.json'
with open(encoding_path, 'w') as f:
    json.dump({
        'encoder': label_encoder,
        'decoder': {str(k): v for k, v in label_decoder.items()},
        'risk_order': risk_order
    }, f, indent=2)
print(f"✓ Label encoding saved to {encoding_path}")

# Save feature columns for consistency
feature_cols_path = models_dir / 'feature_columns.json'
with open(feature_cols_path, 'w') as f:
    json.dump(X_train_df.columns.tolist(), f, indent=2)
print(f"✓ Feature columns saved to {feature_cols_path}")

print("\n✓ Model export complete. Ready for API integration.")

✓ Model saved to models\interaction_classifier.pkl
✓ Label encoding saved to models\label_encoding.json
✓ Feature columns saved to models\feature_columns.json

✓ Model export complete. Ready for API integration.


## Summary

This notebook implements a conservative drug interaction inference model:

✅ **Never predicts "Low Risk"** for undocumented combinations  
✅ **Provides mechanism-based reasoning** for all predictions  
✅ **Outputs uncertainty scores** to communicate confidence  
✅ **Respects documented interactions** from expert sources  
✅ **Conservative by design** - prefers caution over reassurance  

### Next Steps

1. Integrate with `interaction_service.py` in the backend services
2. Add more documented interactions to training data
3. Implement rule-based overrides for known dangerous patterns (MAOI + serotonergic, dual respiratory depressants, etc.)
4. Create API endpoint for real-time inference
5. Build testing suite with edge cases

**Important**: This model is for research and harm reduction guidance only. Not medical advice.