# AnnoMI Cognitive Action Analysis - Comprehensive Meta-Analysis

This notebook analyzes the AnnoMI therapeutic conversation dataset using universal cognitive action probe inference.

**Features:**
- ✅ Universal probe inference across all layers (21-30)
- ✅ Cognitive action detection for all therapist/client utterances
- ✅ Client vs Therapist cognitive pattern analysis
- ✅ Topic-specific cognitive action distribution
- ✅ Temporal distribution analysis (10 conversation stages)
- ✅ Comprehensive visualizations

**Requirements:**
- Google Colab with GPU (T4 or better)
- ~15 GB VRAM
- Runtime: ~2-4 hours for full dataset analysis

## 1️⃣ Check GPU and Setup

In [None]:
# Check GPU availability
!nvidia-smi

import torch
print("\n" + "="*60)
print("GPU INFORMATION")
print("="*60)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️  WARNING: No GPU detected! This will be very slow on CPU.")
print("="*60)

## 2️⃣ Clone Repository and Install Dependencies

In [None]:
import os
import sys

# Clone the repository
repo_url = "https://github.com/ChuloIva/brije.git"
repo_name = "brije"

if not os.path.exists(repo_name):
    print("📥 Cloning Brije repository...")
    !git clone {repo_url}
    print("✅ Repository cloned successfully!")
else:
    print("✅ Repository already exists")
    print("🔄 Pulling latest changes...")
    !cd {repo_name} && git pull

# Change to repo directory
os.chdir(repo_name)
print(f"\n📁 Current directory: {os.getcwd()}")

In [None]:
# Install dependencies
print("📦 Installing dependencies...\n")

# Core dependencies
print("Installing core packages...")
!pip install -q torch transformers h5py scikit-learn tqdm matplotlib seaborn pandas

# Clone and install nnsight
nnsight_dir = "third_party/nnsight"
nnsight_repo = "https://github.com/ndif-team/nnsight"

print("\n📦 Setting up nnsight...")
if not os.path.exists(nnsight_dir) or not os.listdir(nnsight_dir):
    print("   Cloning nnsight repository...")
    os.makedirs("third_party", exist_ok=True)
    !git clone {nnsight_repo} {nnsight_dir}
    print("   ✅ nnsight repository cloned")
else:
    print("   ✅ nnsight repository already exists")

# Install nnsight
print("   Installing nnsight...")
!pip install -q -e {nnsight_dir}

print("\n✅ All dependencies installed!")

## 3️⃣ Verify Pre-trained Probes

The pre-trained probes are included in the repository. Let's verify they exist.

In [None]:
import glob

# Check for probes in the repository
local_probes_dir = 'data/probes_binary'

# How many layer directories to list individually before summarising the rest
MAX_LAYERS_SHOWN = 5


def probe_layer_sort_key(probe_dir):
    """Return a sort key that orders ``layer_<N>`` directories by numeric N.

    A plain string sort puts ``layer_10`` before ``layer_9``. Directories
    whose suffix is not a number sort after all numeric ones, alphabetically.
    """
    suffix = os.path.basename(probe_dir).replace('layer_', '')
    if suffix.isdecimal():
        return (0, int(suffix), suffix)
    return (1, 0, suffix)


probe_dirs = sorted(
    glob.glob(f'{local_probes_dir}/layer_*'), key=probe_layer_sort_key
)

if probe_dirs:
    print("✅ Found pre-trained probes in repository")
    print(f"\n📊 Available probe layers: {len(probe_dirs)}")

    for probe_dir in probe_dirs[:MAX_LAYERS_SHOWN]:
        layer_num = os.path.basename(probe_dir).replace('layer_', '')
        probe_files = glob.glob(f"{probe_dir}/probe_*.pth")
        print(f"   Layer {layer_num}: {len(probe_files)} probes")

    if len(probe_dirs) > MAX_LAYERS_SHOWN:
        print(f"   ... and {len(probe_dirs) - MAX_LAYERS_SHOWN} more layers")
else:
    print("⚠️  No probes found in:", local_probes_dir)
    print("\n💡 Please train probes first using:")
    print("   Brije_Full_Pipeline_Colab.ipynb")

## 4️⃣ Log in to Hugging Face (for Gemma 3 4B)

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with Hugging Face. Per the section
# heading this is for the Gemma 3 4B model ('google/gemma-3-4b-it') that the
# inference engine is configured with later in the notebook.
notebook_login()

## 5️⃣ Load and Explore AnnoMI Dataset

In [None]:
import pandas as pd
import numpy as np

# Path of the AnnoMI-simple CSV inside the repository checkout
annomi_path = 'third_party/AnnoMI/AnnoMI-simple.csv'

print("📊 Loading AnnoMI dataset...")
df = pd.read_csv(annomi_path)

# Headline counts
n_transcripts = df['transcript_id'].nunique()
n_topics = df['topic'].nunique()
therapist_rows = df[df['interlocutor'] == 'therapist']
client_rows = df[df['interlocutor'] == 'client']

print(f"\n✅ Loaded {len(df)} utterances")
print(f"   Transcripts: {n_transcripts}")
print(f"   Topics: {n_topics}")
print(f"   Therapist utterances: {len(therapist_rows)}")
print(f"   Client utterances: {len(client_rows)}")

section_rule = "=" * 80

# First ten rows, restricted to the columns that are easy to read
preview = df[['interlocutor', 'utterance_text', 'topic']].head(10)
print("\n" + section_rule)
print("SAMPLE UTTERANCES")
print(section_rule)
print(preview.to_string(index=False))

# Number of distinct transcripts per topic, largest first
print("\n" + section_rule)
print("TOPIC DISTRIBUTION")
print(section_rule)
topic_counts = (
    df.groupby('topic')['transcript_id']
    .nunique()
    .sort_values(ascending=False)
)
for topic, count in topic_counts.items():
    print(f"  {topic:40s}: {count:3d} transcripts")

## 6️⃣ Initialize Universal Probe Inference Engine

In [None]:
from pathlib import Path

# The probe modules live under src/probes; put that directory first on the
# import path so the engine module below can be imported by its bare name.
probes_src_dir = os.path.join(os.getcwd(), 'src', 'probes')
sys.path.insert(0, probes_src_dir)

from universal_multi_layer_inference import UniversalMultiLayerInferenceEngine

print("🚀 Initializing Universal Probe Inference Engine...\n")

# Layers 21-30 (adjust based on your trained probes)
engine_settings = {
    'probes_base_dir': Path('data/probes_binary'),
    'model_name': 'google/gemma-3-4b-it',
    'layer_range': (21, 30),
}
engine = UniversalMultiLayerInferenceEngine(**engine_settings)

print("\n✅ Engine initialized and ready!")

## 7️⃣ Run Cognitive Action Inference on All Utterances

This will analyze all utterances in the dataset and detect cognitive actions.

**⏰ Expected time:** ~2-4 hours for full dataset (~3,000 utterances)

In [None]:
import json
import time
from collections import defaultdict

from tqdm import tqdm

# Number of processed rows between on-disk checkpoints
CHECKPOINT_EVERY = 100

print("🧠 Running cognitive action inference on all utterances...")
print(f"   Total utterances: {len(df)}")
print(f"   This may take 2-4 hours...\n")

# Create output directory
output_dir = Path('output/annomi_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Store predictions
predictions_list = []
start_time = time.time()

# Process each utterance.
# `processed` counts rows by position. `idx` from iterrows() is the index
# *label*, which only equals the position for a default RangeIndex, so it is
# used for error messages only and not for checkpoint/ETA arithmetic.
for processed, (idx, row) in enumerate(
    tqdm(df.iterrows(), total=len(df), desc="Processing utterances"),
    start=1,
):
    utterance_text = row['utterance_text']

    try:
        # Run universal inference (grouped by action)
        action_preds = engine.predict_by_action(
            utterance_text,
            threshold=0.1,
            aggregation="max"
        )

        # Get all layers where each action was detected
        all_layer_preds = engine.predict_all(
            utterance_text,
            threshold=0.1
        )

        # Group predictions by action with layer info
        action_layer_map = defaultdict(list)
        for pred in all_layer_preds:
            if pred.is_active:
                action_layer_map[pred.action_name].append({
                    'layer': pred.layer,
                    'confidence': pred.confidence
                })

        # Store prediction with metadata
        predictions_list.append({
            'utterance_id': row['utterance_id'],
            'transcript_id': row['transcript_id'],
            'interlocutor': row['interlocutor'],
            'topic': row['topic'],
            'utterance_text': utterance_text,
            'mi_quality': row['mi_quality'],
            'main_therapist_behaviour': row.get('main_therapist_behaviour', None),
            'client_talk_type': row.get('client_talk_type', None),
            'predictions': action_preds,
            'action_layer_details': dict(action_layer_map)
        })

    except Exception as e:
        # Deliberately best-effort: one bad utterance must not abort a
        # multi-hour run. No `continue` here, so a failure on a checkpoint
        # row still falls through to the checkpoint below.
        print(f"\n⚠️  Error processing utterance {idx}: {e}")

    # Save checkpoint every CHECKPOINT_EVERY utterances
    if processed % CHECKPOINT_EVERY == 0:
        checkpoint_file = output_dir / f'predictions_checkpoint_{processed}.json'
        with open(checkpoint_file, 'w') as f:
            json.dump(predictions_list, f, indent=2, default=str)

        elapsed = time.time() - start_time
        avg_time = elapsed / processed
        remaining = avg_time * (len(df) - processed)
        print(f"\n✓ Checkpoint saved. ETA: {remaining/3600:.1f} hours")

elapsed_time = time.time() - start_time

print(f"\n✅ Inference complete!")
print(f"   Time elapsed: {elapsed_time/3600:.2f} hours")
# max(..., 1) keeps the report from crashing on an empty dataframe
print(f"   Average time per utterance: {elapsed_time/max(len(df), 1):.2f} seconds")
print(f"   Utterances with predictions: {len(predictions_list)}/{len(df)}")

# Save final predictions
predictions_file = output_dir / 'all_predictions.json'
with open(predictions_file, 'w') as f:
    json.dump(predictions_list, f, indent=2, default=str)

print(f"\n💾 Predictions saved to: {predictions_file}")

## 7️⃣.1 Filter High-Confidence Cognitive Actions

Filter predictions to keep only cognitive actions that:
- Are detected on **more than 2 layers**, OR
- Reach a **100% confidence score** on at least one layer

This removes noise and focuses on robust cognitive action detections.

In [None]:
print("🔍 Filtering cognitive actions for high confidence...")
print(f"   Total predictions before filtering: {len(predictions_list)}")

# Save original predictions for comparison
all_predictions = predictions_list.copy()


def count_active_actions(prediction_entries):
    """Return how many actions are flagged ``is_active`` across all entries."""
    return sum(
        1
        for entry in prediction_entries
        for info in entry['predictions'].values()
        if info.get('is_active', False)
    )


# Filter each prediction to keep only high-confidence actions
filtered_predictions_list = []

for pred_data in predictions_list:
    action_layer_details = pred_data.get('action_layer_details', {})

    filtered_predictions = {}
    for action_name, action_info in pred_data['predictions'].items():
        if not action_info.get('is_active', False):
            continue

        # Per-layer detections recorded for this action (may be absent)
        layer_hits = action_layer_details.get(action_name, [])
        num_layers = len(layer_hits)
        max_confidence = max((hit['confidence'] for hit in layer_hits), default=0)

        # Keep if: appears on >2 layers OR has 100% confidence
        if num_layers > 2 or max_confidence >= 1.0:
            filtered_predictions[action_name] = action_info

    # Same entry, with the predictions replaced by the filtered subset
    filtered_predictions_list.append(
        {**pred_data, 'predictions': filtered_predictions}
    )

# Count statistics
total_actions_before = count_active_actions(all_predictions)
total_actions_after = count_active_actions(filtered_predictions_list)
removed_actions = total_actions_before - total_actions_after
# Guard the percentage: with no active actions at all there is nothing to
# divide by.
removed_pct = (
    removed_actions / total_actions_before * 100 if total_actions_before else 0.0
)

print(f"   Total predictions after filtering: {len(filtered_predictions_list)}")
print(f"   Active actions before: {total_actions_before}")
print(f"   Active actions after: {total_actions_after}")
print(f"   Removed: {removed_actions} actions ({removed_pct:.1f}%)")

# Replace predictions_list with filtered version
predictions_list = filtered_predictions_list

# Save filtered predictions
filtered_predictions_file = output_dir / 'all_predictions_filtered.json'
with open(filtered_predictions_file, 'w') as f:
    json.dump(predictions_list, f, indent=2, default=str)

# The original statement was split across two lines inside the string
# literal, which is a syntax error; the newline is now an escape.
print(f"\n✅ Filtered predictions saved to: {filtered_predictions_file}")

## 8️⃣ Meta-Analysis 1: Client vs Therapist Cognitive Patterns

In [None]:
from collections import Counter


def _print_ranked_actions(heading, tally, limit=20):
    """Print the `limit` highest-scoring actions of `tally` with a text bar."""
    print("\n" + "="*80)
    print(heading)
    print("="*80)
    for action, score in tally.most_common(limit):
        bar = "█" * int(score / 10)
        print(f"{action:35s} {score:7.2f} {bar}")


def _print_unique_actions(heading, names):
    """Print `names` alphabetically under `heading`; print nothing if empty."""
    if not names:
        return
    print("\n" + "="*80)
    print(heading)
    print("="*80)
    for action in sorted(names):
        print(f"  • {action}")


print("="*80)
print("META-ANALYSIS 1: CLIENT VS THERAPIST COGNITIVE PATTERNS")
print("="*80)

# One confidence-weighted tally per speaker role
therapist_actions = Counter()
client_actions = Counter()

for pred_data in predictions_list:
    # Anything that is not a therapist utterance is counted as client
    if pred_data['interlocutor'] == 'therapist':
        role_tally = therapist_actions
    else:
        role_tally = client_actions

    for action_name, data in pred_data['predictions'].items():
        if not data.get('is_active', False):
            continue
        # Weight by aggregate confidence
        role_tally[action_name] += data['aggregate']

_print_ranked_actions("TOP 20 COGNITIVE ACTIONS: THERAPIST", therapist_actions)
_print_ranked_actions("TOP 20 COGNITIVE ACTIONS: CLIENT", client_actions)

# Actions that were only ever seen for one of the two roles
therapist_only = therapist_actions.keys() - client_actions.keys()
client_only = client_actions.keys() - therapist_actions.keys()

_print_unique_actions("COGNITIVE ACTIONS UNIQUE TO THERAPIST", therapist_only)
_print_unique_actions("COGNITIVE ACTIONS UNIQUE TO CLIENT", client_only)

# Persist the summary next to the predictions
analysis_1 = {
    'therapist_top_20': dict(therapist_actions.most_common(20)),
    'client_top_20': dict(client_actions.most_common(20)),
    'therapist_unique': list(therapist_only),
    'client_unique': list(client_only)
}

with open(output_dir / 'analysis_1_client_vs_therapist.json', 'w') as f:
    json.dump(analysis_1, f, indent=2)

print("\n✅ Analysis saved to: output/annomi_analysis/analysis_1_client_vs_therapist.json")

## 9️⃣ Meta-Analysis 2: Topic-Specific Cognitive Action Distribution

In [None]:
print("="*80)
print("META-ANALYSIS 2: TOPIC-SPECIFIC COGNITIVE ACTION DISTRIBUTION")
print("="*80)

# topic -> role -> confidence-weighted action tally
topic_actions = defaultdict(lambda: {'therapist': Counter(), 'client': Counter()})

for pred_data in predictions_list:
    role_tally = topic_actions[pred_data['topic']][pred_data['interlocutor']]
    for action_name, data in pred_data['predictions'].items():
        if not data.get('is_active', False):
            continue
        role_tally[action_name] += data['aggregate']

# Per-topic report, topics in alphabetical order
for topic in sorted(topic_actions.keys()):
    print(f"\n{'='*80}")
    print(f"TOPIC: {topic.upper()}")
    print(f"{'='*80}")

    for role_label, role_key in (("Therapist", 'therapist'), ("Client", 'client')):
        print(f"\n  Top 10 {role_label} Actions:")
        for action, score in topic_actions[topic][role_key].most_common(10):
            print(f"    • {action:35s} {score:7.2f}")

# Persist the top-10 lists per topic
analysis_2 = {
    topic: {
        'therapist_top_10': dict(data['therapist'].most_common(10)),
        'client_top_10': dict(data['client'].most_common(10))
    }
    for topic, data in topic_actions.items()
}

with open(output_dir / 'analysis_2_topic_specific.json', 'w') as f:
    json.dump(analysis_2, f, indent=2)

print("\n✅ Analysis saved to: output/annomi_analysis/analysis_2_topic_specific.json")

## 🔟 Meta-Analysis 3: Temporal Distribution (10 Conversation Stages)

In [None]:
print("="*80)
print("META-ANALYSIS 3: TEMPORAL DISTRIBUTION (10 CONVERSATION STAGES)")
print("="*80)

# transcript -> stage (0-9) -> role -> confidence-weighted action tally
transcript_stages = defaultdict(lambda: {i: {'therapist': Counter(), 'client': Counter()} for i in range(10)})

# Collect the utterances belonging to each transcript
transcript_groups = defaultdict(list)
for pred_data in predictions_list:
    transcript_groups[pred_data['transcript_id']].append(pred_data)

for transcript_id, utterances in transcript_groups.items():
    # Conversation order is given by utterance_id
    ordered = sorted(utterances, key=lambda x: x['utterance_id'])

    # Each stage covers one tenth of the transcript (fractional width)
    stage_size = len(ordered) / 10

    for position, utterance in enumerate(ordered):
        active = [
            (name, info['aggregate'])
            for name, info in utterance['predictions'].items()
            if info.get('is_active', False)
        ]
        if not active:
            continue

        stage = min(int(position / stage_size), 9)  # clamp to 0-9
        role_tally = transcript_stages[transcript_id][stage][utterance['interlocutor']]
        for action_name, confidence in active:
            role_tally[action_name] += confidence

# Sum the per-transcript tallies into one tally per stage and role
stage_aggregates = {i: {'therapist': Counter(), 'client': Counter()} for i in range(10)}

for per_transcript in transcript_stages.values():
    for stage, by_role in per_transcript.items():
        for role in ('therapist', 'client'):
            stage_aggregates[stage][role].update(by_role[role])

# Report the five strongest actions per role for every stage
for stage in range(10):
    print(f"\n{'='*80}")
    print(f"STAGE {stage + 1}/10 ({stage*10}%-{(stage+1)*10}% through conversation)")
    print(f"{'='*80}")

    for role_label, role_key in (("Therapist", 'therapist'), ("Client", 'client')):
        print(f"\n  Top 5 {role_label} Actions:")
        for action, score in stage_aggregates[stage][role_key].most_common(5):
            print(f"    • {action:35s} {score:7.2f}")

# Persist the top-10 lists per stage
analysis_3 = {
    f'stage_{stage+1}': {
        'stage_range': f"{stage*10}%-{(stage+1)*10}%",
        'therapist_top_10': dict(data['therapist'].most_common(10)),
        'client_top_10': dict(data['client'].most_common(10))
    }
    for stage, data in stage_aggregates.items()
}

with open(output_dir / 'analysis_3_temporal_distribution.json', 'w') as f:
    json.dump(analysis_3, f, indent=2)

print("\n✅ Analysis saved to: output/annomi_analysis/analysis_3_temporal_distribution.json")

## 1️⃣1️⃣ Visualizations: Client vs Therapist Patterns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (20, 10)

# Floor applied to scores before taking the log ratio, so an action that one
# role never produced does not lead to log2(0) or a division by zero.
RATIO_FLOOR = 0.1

# Get top actions
top_therapist = dict(therapist_actions.most_common(15))
top_client = dict(client_actions.most_common(15))

# The charted actions are the union of both top-15 lists. Their scores are
# read from the FULL counters: an action that is in one role's top 15 but
# ranked lower for the other role still has a real total there, and looking
# it up in the truncated dict would wrongly show it as zero.
all_actions = set(top_therapist) | set(top_client)
actions = sorted(
    all_actions,
    # Highest score first; the name breaks ties so the order is deterministic
    key=lambda a: (-max(therapist_actions.get(a, 0), client_actions.get(a, 0)), a),
)

therapist_scores = [therapist_actions.get(a, 0) for a in actions]
client_scores = [client_actions.get(a, 0) for a in actions]

# Create figure
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

# Plot 1: Comparison bar chart
x = np.arange(len(actions))
width = 0.35

axes[0].barh(x - width/2, therapist_scores, width, label='Therapist', color='steelblue', alpha=0.8)
axes[0].barh(x + width/2, client_scores, width, label='Client', color='coral', alpha=0.8)
axes[0].set_yticks(x)
axes[0].set_yticklabels(actions, fontsize=10)
axes[0].set_xlabel('Aggregate Confidence Score', fontsize=12)
axes[0].set_title('Cognitive Action Comparison:\nTherapist vs Client', fontsize=14, fontweight='bold')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3, axis='x')
axes[0].invert_yaxis()

# Plot 2: Ratio comparison (Therapist / Client)
ratio_actions = list(actions)
ratios = []
for t_raw, c_raw in zip(therapist_scores, client_scores):
    t_score = max(t_raw, RATIO_FLOOR)
    c_score = max(c_raw, RATIO_FLOOR)
    ratios.append(np.log2(t_score / c_score))

colors = ['steelblue' if r > 0 else 'coral' for r in ratios]
axes[1].barh(range(len(ratio_actions)), ratios, color=colors, alpha=0.8)
axes[1].set_yticks(range(len(ratio_actions)))
axes[1].set_yticklabels(ratio_actions, fontsize=10)
axes[1].set_xlabel('Log2(Therapist/Client)', fontsize=12)
axes[1].set_title('Cognitive Action Bias:\nTherapist-Dominant (blue) vs Client-Dominant (red)', fontsize=14, fontweight='bold')
axes[1].axvline(x=0, color='black', linestyle='--', linewidth=1)
axes[1].grid(True, alpha=0.3, axis='x')
axes[1].invert_yaxis()

plt.tight_layout()
plt.savefig(output_dir / 'viz_1_client_vs_therapist.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved to: output/annomi_analysis/viz_1_client_vs_therapist.png")

## 1️⃣2️⃣ Visualizations: Temporal Distribution Heatmap

In [None]:
# Heatmap of the 20 overall strongest actions (rows) against the 10
# conversation stages (columns), one panel per speaker role.
all_actions_combined = therapist_actions + client_actions
top_actions_for_heatmap = [name for name, _ in all_actions_combined.most_common(20)]

n_rows = len(top_actions_for_heatmap)
therapist_heatmap = np.zeros((n_rows, 10))
client_heatmap = np.zeros((n_rows, 10))

# Fill one matrix row per action, across all stages at once
for row_idx, action in enumerate(top_actions_for_heatmap):
    therapist_heatmap[row_idx, :] = [
        stage_aggregates[stage]['therapist'].get(action, 0) for stage in range(10)
    ]
    client_heatmap[row_idx, :] = [
        stage_aggregates[stage]['client'].get(action, 0) for stage in range(10)
    ]


def _draw_stage_heatmap(ax, matrix, cmap, title):
    """Render `matrix` on `ax` with stage/action tick labels and a colorbar."""
    image = ax.imshow(matrix, aspect='auto', cmap=cmap, interpolation='nearest')
    ax.set_xticks(range(10))
    ax.set_xticklabels([f'Stage {i+1}\n({i*10}-{(i+1)*10}%)' for i in range(10)], fontsize=10)
    ax.set_yticks(range(len(top_actions_for_heatmap)))
    ax.set_yticklabels(top_actions_for_heatmap, fontsize=10)
    ax.set_title(title, fontsize=14, fontweight='bold')
    plt.colorbar(image, ax=ax, label='Aggregate Confidence Score')
    return image


# Two stacked panels: therapist on top, client below
fig, axes = plt.subplots(2, 1, figsize=(16, 18))

im1 = _draw_stage_heatmap(
    axes[0], therapist_heatmap, 'Blues',
    'Therapist Cognitive Actions Across Conversation Stages',
)
im2 = _draw_stage_heatmap(
    axes[1], client_heatmap, 'Oranges',
    'Client Cognitive Actions Across Conversation Stages',
)

plt.tight_layout()
plt.savefig(output_dir / 'viz_2_temporal_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved to: output/annomi_analysis/viz_2_temporal_heatmap.png")

## 1️⃣3️⃣ Visualizations: Topic-Specific Cognitive Patterns

In [None]:
# Grouped bar charts of the strongest actions for the top topics.
# "Top" is defined here as the largest total aggregate confidence over both
# roles; previously the first six topics in insertion order were plotted,
# which did not match the stated intent.
MAX_TOPIC_PANELS = 6


def topic_total_score(topic):
    """Return the summed confidence of all actions recorded for `topic`."""
    roles = topic_actions[topic]
    return sum(roles['therapist'].values()) + sum(roles['client'].values())


topics_to_plot = sorted(
    topic_actions,
    # Largest total first; the name breaks ties so the layout is deterministic
    key=lambda t: (-topic_total_score(t), str(t)),
)[:MAX_TOPIC_PANELS]

fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.flatten()

for idx, topic in enumerate(topics_to_plot):
    ax = axes[idx]

    # Get top 10 actions for this topic (combined)
    combined = topic_actions[topic]['therapist'] + topic_actions[topic]['client']
    top_10 = [action for action, _ in combined.most_common(10)]

    therapist_vals = [topic_actions[topic]['therapist'].get(a, 0) for a in top_10]
    client_vals = [topic_actions[topic]['client'].get(a, 0) for a in top_10]

    x = np.arange(len(top_10))
    width = 0.35

    ax.bar(x - width/2, therapist_vals, width, label='Therapist', color='steelblue', alpha=0.8)
    ax.bar(x + width/2, client_vals, width, label='Client', color='coral', alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels(top_10, rotation=45, ha='right', fontsize=8)
    ax.set_ylabel('Confidence Score', fontsize=10)
    ax.set_title(f'{topic}', fontsize=11, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3, axis='y')

# With fewer topics than panels, hide the leftover empty axes
for spare_ax in axes[len(topics_to_plot):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.savefig(output_dir / 'viz_3_topic_specific.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved to: output/annomi_analysis/viz_3_topic_specific.png")

## 1️⃣4️⃣ Summary Report

In [None]:
def _combine_stages(stage_numbers):
    """Add up the therapist and client tallies of the given stages, in order."""
    tallies = [
        stage_aggregates[stage][role]
        for stage in stage_numbers
        for role in ('therapist', 'client')
    ]
    combined = tallies[0]
    for tally in tallies[1:]:
        combined = combined + tally
    return combined


print("="*80)
print("COMPREHENSIVE ANALYSIS SUMMARY")
print("="*80)

# Role counts come from the analysed predictions, not from the raw dataframe
n_therapist = len([p for p in predictions_list if p['interlocutor'] == 'therapist'])
n_client = len([p for p in predictions_list if p['interlocutor'] == 'client'])

print(f"\n📊 Dataset Statistics:")
print(f"   Total utterances analyzed: {len(predictions_list)}")
print(f"   Transcripts: {df['transcript_id'].nunique()}")
print(f"   Topics: {df['topic'].nunique()}")
print(f"   Therapist utterances: {n_therapist}")
print(f"   Client utterances: {n_client}")

print(f"\n🧠 Key Findings:")

# Five strongest actions per role
role_sections = (
    (f"\n1️⃣ Most Common Therapist Cognitive Actions:", therapist_actions),
    (f"\n2️⃣ Most Common Client Cognitive Actions:", client_actions),
)
for section_title, tally in role_sections:
    print(section_title)
    for i, (action, score) in enumerate(tally.most_common(5), 1):
        print(f"   {i}. {action:35s} (score: {score:.2f})")

# Strongest actions in the first three and the last three stages
print(f"\n3️⃣ Temporal Patterns:")
print(f"   Early stage (0-30%) dominant actions:")
early_combined = _combine_stages((0, 1, 2))
for action, score in early_combined.most_common(3):
    print(f"      • {action}")

print(f"\n   Late stage (70-100%) dominant actions:")
late_combined = _combine_stages((7, 8, 9))
for action, score in late_combined.most_common(3):
    print(f"      • {action}")

# Files written by the earlier cells
print(f"\n📁 Output Files:")
output_file_notes = (
    ("all_predictions.json", "Full prediction data"),
    ("analysis_1_client_vs_therapist.json", "Client vs Therapist analysis"),
    ("analysis_2_topic_specific.json", "Topic-specific analysis"),
    ("analysis_3_temporal_distribution.json", "Temporal analysis"),
    ("viz_1_client_vs_therapist.png", "Client vs Therapist visualization"),
    ("viz_2_temporal_heatmap.png", "Temporal heatmap"),
    ("viz_3_topic_specific.png", "Topic-specific visualization"),
)
for file_name, note in output_file_notes:
    print(f"   • {file_name} - {note}")

print(f"\n✅ All analyses complete!")
print(f"\n📂 All outputs saved to: {output_dir}")
print("="*80)

## 1️⃣5️⃣ Backup to Google Drive

In [None]:
# Back up all outputs to Google Drive.
#
# This notebook never mounts Drive itself. Without a mount, creating the
# target path would silently make an ordinary local directory that is lost
# when the runtime is recycled, while still reporting a successful backup.
# So the copy only runs when the Drive mount point actually exists.
import shutil

drive_root = '/content/drive/MyDrive'
drive_output_dir = '/content/drive/MyDrive/brije_outputs/annomi_analysis'

if not os.path.isdir(drive_root):
    print("⚠️  Google Drive is not mounted; skipping backup.")
    print("   Mount it first, then re-run this cell:")
    print("   from google.colab import drive; drive.mount('/content/drive')")
else:
    print("📥 Backing up results to Google Drive...")
    # Copies the contents of output_dir into the target, creating it if
    # needed and overwriting files left by an earlier backup.
    shutil.copytree(output_dir, drive_output_dir, dirs_exist_ok=True)

    print(f"✅ Backup complete! Results saved to: {drive_output_dir}")