# AnnoMI Comprehensive Analysis - Cognitive Actions × MI Annotations × Therapy Topics

This notebook provides a **unified analysis** combining the best insights from all previous analyses while incorporating the **therapy topic dimension** to understand how different therapeutic contexts shape cognitive patterns and MI quality.

**Dataset Composition:**
- 🎯 **44 unique therapy topics** (alcohol reduction, smoking cessation, diabetes management, etc.)
- 🧠 **45 cognitive actions** from multi-layer neural predictions
- 💬 **133 therapy transcripts** with MI quality labels
- 📊 **Rich annotations** (questions, reflections, therapist inputs)

**Integrated Analyses:**
1. 🌍 **Topic Distribution & Characteristics** - Understanding the dataset composition
2. 🎯 **Topic-Specific Cognitive Signatures** - How cognitive patterns vary by therapy topic
3. 🔄 **Cognitive Synchrony by Topic** - Therapist-client alignment across different topics
4. 💡 **Change Talk Patterns by Topic** - Which topics facilitate client change commitment
5. 👥 **Therapist Styles Across Topics** - Clustering therapeutic approaches by topic
6. 🎭 **MI Techniques × Topics × Cognitive Actions** - Three-way interaction analysis
7. 📈 **Topic-Aware Predictive Modeling** - Predicting MI quality with topic features
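8. 🕸️ **Network Analysis by Topic Cluster** - Graph-based insights segmented by topic type

> **Note:** The sections below are numbered 1–7 and do not map one-to-one onto this list: items 5 (Therapist Styles) and 8 (Network Analysis) have no dedicated section in this notebook, and the final section is a summary/export step.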

## Setup and Data Loading

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from itertools import combinations
from scipy import stats
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn grid theme, default figure size, base font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (18, 10),
    'font.size': 10,
})

print("✅ Libraries loaded")

In [None]:
# Load AnnoMI-full.csv (with rich annotations)
print("📥 Loading AnnoMI-full.csv...")
df_full = pd.read_csv('third_party/AnnoMI/AnnoMI-full.csv')

# Take first annotation per utterance.
# GroupBy.first() is deliberately avoided: it returns the first NON-NULL value
# of every column independently, so one output row could combine fields from
# different annotation rows of the same utterance. Sorting by the keys and
# dropping later duplicates keeps one whole row per utterance instead.
utterance_keys = ['transcript_id', 'utterance_id']
df_annotations = (
    df_full
    .sort_values(utterance_keys, kind='mergesort')  # stable: ties keep file order
    .drop_duplicates(subset=utterance_keys, keep='first')
    .reset_index(drop=True)
)
print(f"✅ Loaded {len(df_annotations)} annotated utterances")

# Load cognitive action predictions
print("\n📥 Loading cognitive action predictions...")
with open('output/analysis_AnnoMI/all_predictions.json', 'r', encoding='utf-8') as f:
    all_predictions = json.load(f)

df_predictions = pd.DataFrame(all_predictions)
print(f"✅ Loaded {len(df_predictions)} cognitive predictions")

# Merge datasets: the inner join keeps only utterances present in BOTH the
# annotations and the predictions.
print("\n🔗 Merging annotations with cognitive predictions...")
df = df_annotations.merge(
    df_predictions[['transcript_id', 'utterance_id', 'predictions', 'action_layer_details']],
    on=utterance_keys,
    how='inner'
)

print(f"✅ Merged dataset: {len(df)} utterances")
print(f"\n📊 Dataset Overview:")
print(f"   Transcripts: {df['transcript_id'].nunique()}")
print(f"   Topics: {df['topic'].nunique()}")
print(f"   Therapist utterances: {len(df[df['interlocutor'] == 'therapist'])}")
print(f"   Client utterances: {len(df[df['interlocutor'] == 'client'])}")

In [None]:
# Filter short utterances and extract high-confidence cognitive actions
MIN_LENGTH = 10
df['utterance_length'] = df['utterance_text'].str.len()
# Keep only rows whose text has at least MIN_LENGTH characters
df = df.loc[df['utterance_length'].ge(MIN_LENGTH)].copy()

print(f"🔄 Filtered to {len(df)} utterances (>= {MIN_LENGTH} chars)")

# Extract high-confidence cognitive actions (>2 layers OR 100% confidence)
def extract_high_confidence_actions(row):
    """Return the high-confidence cognitive actions of one utterance row.

    An action is kept when its prediction entry is flagged ``is_active`` and
    it either has more than two entries in ``action_layer_details`` or at
    least one of those entries reports a confidence of 1.0 or higher.

    Args:
        row: Mapping-like utterance record (a DataFrame row or a plain dict).
            Must contain ``predictions`` (action name -> prediction dict) and
            may contain ``action_layer_details`` (action name -> list of
            per-layer dicts, each carrying a ``confidence`` value).

    Returns:
        A list of dicts with the keys ``action``, ``confidence`` (the
        prediction's ``aggregate`` value) and ``num_layers``. The list is
        empty when the row carries no usable predictions.
    """
    predictions = row['predictions']
    # Missing values (NaN/None) or any other non-mapping payload mean
    # "no predictions" rather than an error.
    if not isinstance(predictions, dict):
        return []

    action_layer_details = row.get('action_layer_details', {})
    # The details field can itself be missing/NaN; treat it as "no layer info"
    # instead of failing on ``.get``.
    if not isinstance(action_layer_details, dict):
        action_layer_details = {}

    active = []
    for action, data in predictions.items():
        if not data.get('is_active', False):
            continue

        # Look the layer list up once; a stored None counts as "no layers".
        layers = action_layer_details.get(action) or []
        num_layers = len(layers)
        max_confidence = max((layer['confidence'] for layer in layers), default=0)

        if num_layers > 2 or max_confidence >= 1.0:
            active.append({
                'action': action,
                'confidence': data['aggregate'],
                'num_layers': num_layers,
            })
    return active

# Attach the extracted actions to every utterance, plus two derived columns:
# the number of actions and their bare names.
df['active_actions'] = df.apply(extract_high_confidence_actions, axis=1)
df['num_active_actions'] = df['active_actions'].map(len)
df['action_names'] = df['active_actions'].map(
    lambda entries: [entry['action'] for entry in entries]
)

# Get all unique actions (sorted so vector positions are reproducible)
all_actions = sorted({name for names in df['action_names'] for name in names})

print(f"\n🧠 Cognitive Actions:")
print(f"   Unique actions: {len(all_actions)}")
print(f"   Mean per utterance: {df['num_active_actions'].mean():.2f}")
print(f"   Utterances with actions: {df['num_active_actions'].gt(0).sum()} ({df['num_active_actions'].gt(0).sum()/len(df)*100:.1f}%)")

## 1️⃣ Topic Distribution & Characteristics

Understanding the composition of therapy topics in the dataset.

In [None]:
print("=" * 80, "THERAPY TOPIC ANALYSIS", "=" * 80, sep="\n")

# Topic distribution: utterance counts per topic, most frequent first
topic_counts = df['topic'].value_counts()
print(f"\n📊 Total unique topics: {len(topic_counts)}")
print(f"\nTop 20 topics by utterance count:")
print("=" * 80)
for rank, (topic_name, n_utterances) in enumerate(topic_counts.head(20).items(), start=1):
    share = n_utterances / len(df) * 100
    # One bar character per whole percentage point
    print(f"{rank:2d}. {topic_name[:50]:50s} {n_utterances:5d} ({share:5.1f}%) {'█' * int(share)}")

# Group topics into categories
def categorize_topic(topic):
    """Map a raw topic label onto one of the broader theme names.

    The label is stringified and lower-cased, then checked against keyword
    groups in a fixed priority order; the first group with any keyword
    contained in the label decides the theme. A label that matches no group
    is reported as 'Other'.
    """
    label = str(topic).lower()

    # Order matters: earlier groups win when a label matches several of them.
    keyword_groups = (
        (('alcohol',), 'Substance Use - Alcohol'),
        (('drug', 'substance'), 'Substance Use - Drugs'),
        (('smok',), 'Substance Use - Smoking'),
        (('gambl',), 'Behavioral - Gambling'),
        (('weight', 'diet', 'exercise', 'activity'), 'Health - Weight/Exercise'),
        (('diabetes', 'asthma', 'medicine', 'medical'), 'Health - Disease Management'),
        (('recidivism', 'violence', 'school'), 'Behavioral - Justice/School'),
        (('anxiety', 'depression', 'self-harm'), 'Mental Health'),
    )

    for keywords, theme in keyword_groups:
        if any(keyword in label for keyword in keywords):
            return theme
    return 'Other'

df['topic_category'] = df['topic'].apply(categorize_topic)

category_counts = df['topic_category'].value_counts()
print(f"\n🏷️  Topic Categories:")
print("="*80)
for category, count in category_counts.items():
    pct = count / len(df) * 100
    print(f"   {category:40s} {count:5d} ({pct:5.1f}%)")

# MI quality by topic category
print(f"\n📈 MI Quality by Topic Category:")
print("="*80)
for category in category_counts.index:
    category_df = df[df['topic_category'] == category]
    # The ratio is reported per transcript, so both sides must count
    # transcripts. Counting 'high' utterances against a transcript total
    # (as before) produced values that were not percentages at all.
    # One row per transcript -> one quality label per transcript.
    transcript_quality = category_df.drop_duplicates('transcript_id')['mi_quality']
    high_quality = int((transcript_quality == 'high').sum())
    total = len(transcript_quality)
    if total > 0:
        pct = high_quality / total * 100
        print(f"   {category:40s} {high_quality:3d}/{total:3d} high quality ({pct:.1f}%)")

In [None]:
# Visualize topic distribution
# Builds a 2x2 figure (top topics, category pie, MI quality per category,
# mean active actions per category) and saves it as a PNG.
fig, axes = plt.subplots(2, 2, figsize=(20, 14))

# Plot 1: Top 15 topics
top_topics = topic_counts.head(15)
axes[0, 0].barh(range(len(top_topics)), top_topics.values, color='steelblue', alpha=0.7)
axes[0, 0].set_yticks(range(len(top_topics)))
# Labels are cut to 40 characters to keep the axis readable
axes[0, 0].set_yticklabels([t[:40] for t in top_topics.index], fontsize=9)
axes[0, 0].set_xlabel('Number of Utterances', fontsize=11)
axes[0, 0].set_title('Top 15 Therapy Topics by Utterance Count', fontsize=12, fontweight='bold')
axes[0, 0].grid(True, alpha=0.3, axis='x')
# Put the first (largest) entry at the top of the horizontal bar chart
axes[0, 0].invert_yaxis()

# Plot 2: Topic categories
colors = plt.cm.Set3(np.linspace(0, 1, len(category_counts)))
axes[0, 1].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%',
               colors=colors, startangle=90)
axes[0, 1].set_title('Distribution of Topic Categories', fontsize=12, fontweight='bold')

# Plot 3: MI quality by category
# Collect one (category, mi_quality) record per transcript, using the
# mi_quality of the transcript's first row.
category_quality = []
for category in category_counts.index:
    category_transcripts = df[df['topic_category'] == category]['transcript_id'].unique()
    for tid in category_transcripts:
        mi_qual = df[df['transcript_id'] == tid]['mi_quality'].iloc[0]
        category_quality.append({'category': category, 'mi_quality': mi_qual})

qual_df = pd.DataFrame(category_quality)
qual_pivot = qual_df.groupby(['category', 'mi_quality']).size().unstack(fill_value=0)
# Convert counts to row percentages so each category sums to 100
qual_pivot = qual_pivot.div(qual_pivot.sum(axis=1), axis=0) * 100

# The stacked bars are drawn only when both quality labels are present
if 'high' in qual_pivot.columns and 'low' in qual_pivot.columns:
    x = np.arange(len(qual_pivot))
    width = 0.6
    
    axes[1, 0].barh(x, qual_pivot['high'], width, label='High Quality', color='green', alpha=0.7)
    # The low-quality segment starts where the high-quality segment ends
    axes[1, 0].barh(x, qual_pivot['low'], width, left=qual_pivot['high'], 
                    label='Low Quality', color='red', alpha=0.7)
    axes[1, 0].set_yticks(x)
    axes[1, 0].set_yticklabels(qual_pivot.index, fontsize=9)
    axes[1, 0].set_xlabel('Percentage', fontsize=11)
    axes[1, 0].set_title('MI Quality Distribution by Topic Category', fontsize=12, fontweight='bold')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3, axis='x')
    axes[1, 0].invert_yaxis()

# Plot 4: Cognitive complexity by topic category
# "Complexity" here is the mean number of active actions per utterance
category_complexity = df.groupby('topic_category')['num_active_actions'].mean().sort_values(ascending=False)
axes[1, 1].barh(range(len(category_complexity)), category_complexity.values, color='coral', alpha=0.7)
axes[1, 1].set_yticks(range(len(category_complexity)))
axes[1, 1].set_yticklabels(category_complexity.index, fontsize=9)
axes[1, 1].set_xlabel('Mean Cognitive Actions per Utterance', fontsize=11)
axes[1, 1].set_title('Cognitive Complexity by Topic Category', fontsize=12, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3, axis='x')
axes[1, 1].invert_yaxis()

plt.tight_layout()
plt.savefig('output/analysis_AnnoMI/comprehensive_1_topic_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved: output/analysis_AnnoMI/comprehensive_1_topic_distribution.png")

## 2️⃣ Topic-Specific Cognitive Signatures

Analyzing how cognitive patterns differ across therapy topics.

In [None]:
print("=" * 80, "TOPIC-SPECIFIC COGNITIVE SIGNATURES", "=" * 80, sep="\n")

# Analyze top topics (with enough data): the ten topics with most utterances
top_10_topics = topic_counts.head(10).index

topic_signatures = {}

for topic in top_10_topics:
    topic_df = df.loc[df['topic'] == topic]
    n_utterances = len(topic_df)
    n_transcripts = topic_df['transcript_id'].nunique()

    # How many utterances of this topic each action appears in
    action_counts = Counter(
        action for names in topic_df['action_names'] for action in names
    )

    # Share of the topic's transcripts whose (first) quality label is 'high'
    quality_per_transcript = topic_df.groupby('transcript_id')['mi_quality'].first()
    high_pct = (quality_per_transcript == 'high').sum() / n_transcripts * 100

    topic_signatures[topic] = {
        'utterances': n_utterances,
        'transcripts': n_transcripts,
        'mi_quality_high_pct': high_pct,
        'top_actions': action_counts.most_common(10),
        # Signature = action frequency normalized by the topic's utterance count
        'signature': {name: hits / n_utterances for name, hits in action_counts.items()},
    }

print(f"\n📊 Analyzed {len(topic_signatures)} topics\n")

# Print the detailed breakdown for the first three topics only
for topic, data in list(topic_signatures.items())[:3]:
    print("=" * 80)
    print(f"{topic.upper()[:70]}")
    print("=" * 80)
    print(f"Utterances: {data['utterances']}, Transcripts: {data['transcripts']}, "
          f"High Quality: {data['mi_quality_high_pct']:.1f}%")
    print(f"\nTop 10 cognitive actions:")
    for action, count in data['top_actions']:
        per_utt = count / data['utterances']
        print(f"   {action:35s} {count:5d} ({per_utt:.3f} per utterance)")
    print()

In [None]:
# Compare cognitive signatures across topics
# Create a heatmap of top topics × top actions

# Get top 15 actions overall
# all_action_counts tallies, over every utterance in df, how often each action
# name occurs; the 15 most common become the heatmap columns.
all_action_counts = Counter()
for actions in df['action_names']:
    for action in actions:
        all_action_counts[action] += 1
top_15_actions = [a for a, _ in all_action_counts.most_common(15)]

# Build matrix
# One row per topic, one column per top action; actions absent from a topic's
# signature contribute 0.
topic_action_matrix = []
topic_labels = []

for topic in top_10_topics:
    signature = topic_signatures[topic]['signature']
    row = [signature.get(action, 0) for action in top_15_actions]
    topic_action_matrix.append(row)
    topic_labels.append(topic[:40])  # Truncate for display

topic_action_matrix = np.array(topic_action_matrix)

# Visualize
fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(topic_action_matrix, cmap='YlOrRd', annot=True, fmt='.2f',
            xticklabels=top_15_actions, yticklabels=topic_labels,
            cbar_kws={'label': 'Actions per Utterance'},
            ax=ax, linewidths=0.5)
ax.set_title('Cognitive Action Signatures Across Top 10 Therapy Topics',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Cognitive Action', fontsize=12)
ax.set_ylabel('Therapy Topic', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()
plt.savefig('output/analysis_AnnoMI/comprehensive_2_topic_signatures.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved: output/analysis_AnnoMI/comprehensive_2_topic_signatures.png")

## 3️⃣ Cognitive Synchrony by Topic

Measuring therapist-client cognitive alignment across different therapy topics.

In [None]:
# Section banner
print("=" * 80, "COGNITIVE SYNCHRONY ANALYSIS BY TOPIC", "=" * 80, sep="\n")

# Create action vectors
def create_action_vector(action_names, all_actions):
    """Create binary vector for cognitive actions.

    Args:
        action_names: Iterable of the action names active in one utterance.
        all_actions: Ordered sequence of known action names; position ``i``
            of the result corresponds to ``all_actions[i]``.

    Returns:
        A float NumPy array of length ``len(all_actions)`` holding 1 at the
        position of every known action in ``action_names`` and 0 elsewhere.
        Names that are not in ``all_actions`` are ignored.
    """
    # Build name -> position once instead of scanning the list twice
    # (``in`` + ``.index``) for every action. ``setdefault`` keeps the FIRST
    # position of a repeated name, which is what ``list.index`` returns.
    position_of = {}
    for position, name in enumerate(all_actions):
        position_of.setdefault(name, position)

    vector = np.zeros(len(all_actions))
    for action in action_names:
        position = position_of.get(action)
        if position is not None:
            vector[position] = 1
    return vector

# One binary vector per utterance, positions following all_actions
df['action_vector'] = df['action_names'].map(
    lambda names: create_action_vector(names, all_actions)
)

# Calculate synchrony for each transcript
def calculate_synchrony(transcript_df, window_size=5):
    """Average therapist-client cosine similarity over sliding windows.

    A window of ``window_size`` consecutive rows slides over the transcript
    one row at a time. Within each window the therapist's and the client's
    ``action_vector`` values are averaged separately and compared by cosine
    similarity. Windows lacking either speaker, or in which either averaged
    vector has zero norm, contribute nothing.

    Args:
        transcript_df: DataFrame with ``action_vector`` and ``interlocutor``
            columns; rows are used in the order given.
        window_size: Number of consecutive rows per window.

    Returns:
        The mean similarity across contributing windows, or ``None`` when no
        window contributed.
    """
    vectors = list(transcript_df['action_vector'])
    speakers = list(transcript_df['interlocutor'])

    similarities = []
    for start in range(len(vectors) - window_size + 1):
        stop = start + window_size
        window = list(zip(vectors[start:stop], speakers[start:stop]))

        therapist_vectors = [vec for vec, who in window if who == 'therapist']
        client_vectors = [vec for vec, who in window if who == 'client']
        if not therapist_vectors or not client_vectors:
            continue

        therapist_mean = np.mean(therapist_vectors, axis=0)
        client_mean = np.mean(client_vectors, axis=0)
        therapist_norm = np.linalg.norm(therapist_mean)
        client_norm = np.linalg.norm(client_mean)

        # Cosine similarity is undefined unless both norms are positive
        if not (therapist_norm > 0 and client_norm > 0):
            continue

        similarities.append(
            np.dot(therapist_mean, client_mean) / (therapist_norm * client_norm)
        )

    if similarities:
        return np.mean(similarities)
    return None

# Compute synchrony by topic category
category_synchrony = defaultdict(list)

# Visit transcripts in order of first appearance
for transcript_id, transcript_rows in df.groupby('transcript_id', sort=False):
    # Transcripts with fewer than 10 utterances are skipped
    if len(transcript_rows) < 10:
        continue

    transcript_df = transcript_rows.sort_values('utterance_id')
    sync_score = calculate_synchrony(transcript_df)
    if sync_score is None:
        continue

    # Category and quality are read from the transcript's first utterance
    category_synchrony[transcript_df['topic_category'].iloc[0]].append({
        'synchrony': sync_score,
        'mi_quality': transcript_df['mi_quality'].iloc[0],
    })

print(f"\n📊 Synchrony by Topic Category:")
print("=" * 80)

for category, data in sorted(category_synchrony.items()):
    # Categories with fewer than three scored transcripts are not reported
    if len(data) < 3:
        continue

    all_sync = [entry['synchrony'] for entry in data]
    high_sync = [entry['synchrony'] for entry in data if entry['mi_quality'] == 'high']
    low_sync = [entry['synchrony'] for entry in data if entry['mi_quality'] == 'low']

    print(f"\n{category}:")
    print(f"   Overall: {np.mean(all_sync):.3f} (±{np.std(all_sync):.3f}, n={len(all_sync)})")
    if high_sync:
        print(f"   High quality: {np.mean(high_sync):.3f} (±{np.std(high_sync):.3f}, n={len(high_sync)})")
    if low_sync:
        print(f"   Low quality:  {np.mean(low_sync):.3f} (±{np.std(low_sync):.3f}, n={len(low_sync)})")

    # Statistical test: needs more than two scores in each quality group
    if len(high_sync) > 2 and len(low_sync) > 2:
        p_val = stats.ttest_ind(high_sync, low_sync).pvalue
        if p_val < 0.05:
            print(f"   ✅ Significant difference (p={p_val:.4f})")

In [None]:
# Visualize synchrony by topic category and quality
# Flatten category_synchrony into one record per scored transcript so seaborn
# can group by category (and by MI quality in the second panel).
sync_data = []
for category, data in category_synchrony.items():
    for item in data:
        sync_data.append({
            'category': category,
            'synchrony': item['synchrony'],
            'mi_quality': item['mi_quality']
        })

sync_df = pd.DataFrame(sync_data)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Plot 1: Boxplot by category
sns.boxplot(data=sync_df, x='category', y='synchrony', ax=axes[0], palette='Set2')
axes[0].set_xlabel('Topic Category', fontsize=11)
axes[0].set_ylabel('Cognitive Synchrony', fontsize=11)
axes[0].set_title('Cognitive Synchrony by Topic Category', fontsize=12, fontweight='bold')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3, axis='y')

# Plot 2: By category and quality
# The palette maps only 'high' and 'low'.
# NOTE(review): any other mi_quality value would have no color here — confirm
# the column only ever holds these two labels.
sns.boxplot(data=sync_df, x='category', y='synchrony', hue='mi_quality', 
            ax=axes[1], palette={'high': 'green', 'low': 'red'})
axes[1].set_xlabel('Topic Category', fontsize=11)
axes[1].set_ylabel('Cognitive Synchrony', fontsize=11)
axes[1].set_title('Cognitive Synchrony by Topic Category and MI Quality', fontsize=12, fontweight='bold')
axes[1].tick_params(axis='x', rotation=45)
axes[1].legend(title='MI Quality', fontsize=10)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('output/analysis_AnnoMI/comprehensive_3_synchrony_by_topic.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved: output/analysis_AnnoMI/comprehensive_3_synchrony_by_topic.png")

## 4️⃣ Change Talk Patterns by Topic

Analyzing which therapy topics most effectively elicit client change commitment.

In [None]:
print("="*80)
print("CHANGE TALK PATTERNS BY TOPIC")
print("="*80)

# Analyze change talk by topic category
# Only client utterances that carry a talk-type label are considered
client_df = df[df['interlocutor'] == 'client'].copy()
client_df = client_df[client_df['client_talk_type'].notna()].copy()

topic_change_stats = {}

for category in df['topic_category'].unique():
    cat_client = client_df[client_df['topic_category'] == category]

    # Skip categories with too few labelled client utterances
    if len(cat_client) < 10:
        continue

    talk_type = cat_client['client_talk_type']
    change_count = (talk_type == 'change').sum()
    sustain_count = (talk_type == 'sustain').sum()
    neutral_count = (talk_type == 'neutral').sum()
    total = len(cat_client)

    # Cognitive actions in change talk (iterate the column directly rather
    # than materializing every row with iterrows)
    change_actions = Counter(
        action
        for names in cat_client.loc[talk_type == 'change', 'action_names']
        for action in names
    )

    topic_change_stats[category] = {
        'total': total,
        'change_count': change_count,
        'sustain_count': sustain_count,
        'neutral_count': neutral_count,
        'change_pct': change_count / total * 100,
        'sustain_pct': sustain_count / total * 100,
        'top_change_actions': change_actions.most_common(5)
    }

# Sort by change percentage
sorted_topics = sorted(topic_change_stats.items(), key=lambda x: x[1]['change_pct'], reverse=True)

print(f"\n📊 Change Talk by Topic Category:")
print("="*80)
print(f"{'Category':40s} {'Change':>8s} {'Sustain':>8s} {'Neutral':>8s} {'Total':>8s}")
print("="*80)

# The loop variable is named cat_stats, NOT stats: a top-level loop variable
# called `stats` would rebind the imported scipy `stats` module and break
# `stats.ttest_ind` if the synchrony cell is run again afterwards.
for category, cat_stats in sorted_topics:
    print(f"{category:40s} {cat_stats['change_pct']:7.1f}% {cat_stats['sustain_pct']:7.1f}% "
          f"{cat_stats['neutral_count']/cat_stats['total']*100:7.1f}% {cat_stats['total']:7d}")

print(f"\n🎯 Top cognitive actions in change talk by topic category:")
print("="*80)

for category, cat_stats in sorted_topics[:5]:
    print(f"\n{category}:")
    for action, count in cat_stats['top_change_actions']:
        print(f"   {action:35s} {count:4d}")

In [None]:
# Visualize change talk patterns
# Two stacked panels: talk-type composition per category, then the change-talk
# share alone. Categories follow the order of sorted_topics.
fig, axes = plt.subplots(2, 1, figsize=(18, 12))

# Plot 1: Stacked bar chart of talk types by category
categories = [cat for cat, _ in sorted_topics]
change_pcts = [stats['change_pct'] for _, stats in sorted_topics]
sustain_pcts = [stats['sustain_pct'] for _, stats in sorted_topics]
neutral_pcts = [stats['neutral_count']/stats['total']*100 for _, stats in sorted_topics]

x = np.arange(len(categories))
axes[0].barh(x, change_pcts, label='Change', color='green', alpha=0.7)
# Each segment starts where the previous one ends (see `left=`)
axes[0].barh(x, sustain_pcts, left=change_pcts, label='Sustain', color='red', alpha=0.7)
axes[0].barh(x, neutral_pcts, left=np.array(change_pcts)+np.array(sustain_pcts), 
             label='Neutral', color='gray', alpha=0.7)
axes[0].set_yticks(x)
axes[0].set_yticklabels(categories, fontsize=10)
axes[0].set_xlabel('Percentage', fontsize=11)
axes[0].set_title('Client Talk Type Distribution by Topic Category', fontsize=12, fontweight='bold')
axes[0].legend(fontsize=10)
axes[0].grid(True, alpha=0.3, axis='x')
axes[0].invert_yaxis()

# Plot 2: Change talk percentage comparison
axes[1].barh(x, change_pcts, color='green', alpha=0.7)
axes[1].set_yticks(x)
axes[1].set_yticklabels(categories, fontsize=10)
axes[1].set_xlabel('Change Talk Percentage', fontsize=11)
axes[1].set_title('Change Talk Percentage by Topic Category (Higher = More Change-Oriented)', 
                  fontsize=12, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='x')
axes[1].invert_yaxis()

# Add percentage labels
# Each label sits 0.5 data units to the right of the end of its bar
for i, pct in enumerate(change_pcts):
    axes[1].text(pct + 0.5, i, f'{pct:.1f}%', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('output/analysis_AnnoMI/comprehensive_4_change_talk_by_topic.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved: output/analysis_AnnoMI/comprehensive_4_change_talk_by_topic.png")

## 5️⃣ MI Techniques × Topics × Cognitive Actions

Three-way analysis: How MI techniques and cognitive patterns interact across different therapy topics.

In [None]:
print("=" * 80, "MI TECHNIQUES × TOPICS × COGNITIVE ACTIONS", "=" * 80, sep="\n")

therapist_df = df[df['interlocutor'] == 'therapist'].copy()

# Focus on top 5 topic categories and major MI techniques
top_5_categories = df['topic_category'].value_counts().head(5).index

# One report per technique: (heading, annotation column, value selecting it).
# Complex reflections are reported first, open questions second.
technique_reports = (
    ("\n📊 Complex Reflections - Top Cognitive Actions by Topic Category:",
     'reflection_subtype', 'complex'),
    ("\n\n📊 Open Questions - Top Cognitive Actions by Topic Category:",
     'question_subtype', 'open'),
)

for heading, column, wanted in technique_reports:
    print(heading)
    print("=" * 80)

    for category in top_5_categories:
        in_category = therapist_df['topic_category'] == category
        uses_technique = therapist_df[column] == wanted
        cat_therapist = therapist_df[in_category & uses_technique]

        # Categories with fewer than five matching utterances are not reported
        if len(cat_therapist) < 5:
            continue

        actions = Counter(
            action for names in cat_therapist['action_names'] for action in names
        )

        print(f"\n{category} (n={len(cat_therapist)}):")
        for action, count in actions.most_common(5):
            print(f"   {action:35s} {count:4d} ({count/len(cat_therapist):.2f} per utterance)")

## 6️⃣ Topic-Aware Predictive Modeling

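Predicting transcript-level MI quality from topic-category indicators combined with MI-technique rates (reflections, open questions), therapist cognitive-action rates, and the client change-talk rate.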

In [None]:
print("="*80)
print("TOPIC-AWARE MI QUALITY PREDICTION")
print("="*80)

# The ten most frequent actions overall define the cognitive feature columns.
# They are the same for every transcript, so compute them once up front.
top_10_actions = [a for a, _ in all_action_counts.most_common(10)]

# Build features for each transcript including topic information
transcript_features = []

for transcript_id in df['transcript_id'].unique():
    t_df = df[df['transcript_id'] == transcript_id]
    t_therapist = t_df[t_df['interlocutor'] == 'therapist']
    t_client = t_df[t_df['interlocutor'] == 'client']

    # Every rate below is per therapist utterance; skip transcripts that
    # have fewer than five of them.
    if len(t_therapist) < 5:
        continue

    n_therapist = len(t_therapist)

    # Topic features
    topic_category = t_df['topic_category'].iloc[0]

    # Annotation features
    reflection_rate = (t_therapist['reflection_exists'] == True).sum() / n_therapist
    complex_ref_rate = (t_therapist['reflection_subtype'] == 'complex').sum() / n_therapist
    open_q_rate = (t_therapist['question_subtype'] == 'open').sum() / n_therapist

    # Cognitive features: per-utterance rate of each top action (therapist only)
    action_counts = Counter(
        action for actions in t_therapist['action_names'] for action in actions
    )
    action_features = {f'action_{action}': action_counts.get(action, 0) / n_therapist
                      for action in top_10_actions}

    # Client features: share of labelled client utterances that are change talk
    client_change_rate = 0
    if len(t_client) > 0:
        client_with_talk = t_client[t_client['client_talk_type'].notna()]
        if len(client_with_talk) > 0:
            client_change_rate = (client_with_talk['client_talk_type'] == 'change').sum() / len(client_with_talk)

    # Combine all features
    features = {
        'transcript_id': transcript_id,
        'mi_quality': t_df['mi_quality'].iloc[0],
        'reflection_rate': reflection_rate,
        'complex_ref_rate': complex_ref_rate,
        'open_q_rate': open_q_rate,
        'client_change_rate': client_change_rate,
        **action_features
    }

    # Add topic one-hot features (one indicator per top-5 category; a
    # transcript outside the top 5 gets all zeros)
    for cat in top_5_categories:
        features[f'topic_{cat}'] = 1 if topic_category == cat else 0

    transcript_features.append(features)

features_df = pd.DataFrame(transcript_features)
print(f"\n✅ Created feature matrix: {len(features_df)} transcripts × {len(features_df.columns)-2} features")

# Prepare for modeling: everything except the id and the label is a feature
X = features_df.drop(['transcript_id', 'mi_quality'], axis=1).values
y = (features_df['mi_quality'] == 'high').astype(int).values

# Train model
rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=5)
scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')

print(f"\n📊 Random Forest Cross-Validation Results:")
print(f"   Mean Accuracy: {scores.mean():.3f} (±{scores.std():.3f})")
print(f"   Range: {scores.min():.3f} - {scores.max():.3f}")

# Feature importance (cross_val_score fits clones, so fit rf itself here)
rf.fit(X, y)
feature_names = features_df.drop(['transcript_id', 'mi_quality'], axis=1).columns
importances = pd.DataFrame({
    'feature': feature_names,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n📈 Top 15 Most Important Features:")
for _, row in importances.head(15).iterrows():
    feat_type = 'Topic' if 'topic_' in row['feature'] else ('Cognitive' if 'action_' in row['feature'] else 'MI Technique')
    print(f"   [{feat_type:12s}] {row['feature']:35s} {row['importance']:.4f}")

In [None]:
# Visualize feature importance
# Horizontal bar chart of the 20 highest-importance features, colored by the
# feature family inferred from the name prefix.
fig, ax = plt.subplots(figsize=(14, 10))

top_features = importances.head(20)
# 'topic_' -> purple, 'action_' -> steelblue, everything else -> coral
colors = []
for feat in top_features['feature']:
    if 'topic_' in feat:
        colors.append('purple')
    elif 'action_' in feat:
        colors.append('steelblue')
    else:
        colors.append('coral')

ax.barh(range(len(top_features)), top_features['importance'], color=colors, alpha=0.7)
ax.set_yticks(range(len(top_features)))
# Swap the internal prefixes for shorter display labels
ax.set_yticklabels(top_features['feature'].str.replace('action_', 'cog: ').str.replace('topic_', 'topic: '), 
                   fontsize=9)
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_title('Top 20 Features for Predicting MI Quality\n(Purple=Topic, Blue=Cognitive, Red=MI Technique)',
             fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
# Most important feature at the top
ax.invert_yaxis()

plt.tight_layout()
plt.savefig('output/analysis_AnnoMI/comprehensive_5_topic_aware_prediction.png', dpi=150, bbox_inches='tight')
plt.show()

print("✅ Visualization saved: output/analysis_AnnoMI/comprehensive_5_topic_aware_prediction.png")

## 7️⃣ Summary and Export

In [None]:
print("="*80)
print("COMPREHENSIVE TOPIC-AWARE ANALYSIS SUMMARY")
print("="*80)

# Every number placed in the summary is converted to a built-in int/float:
# the json module cannot encode NumPy integer scalars, and iterating a pandas
# Series (as dict(series) does) yields exactly those.
summary = {
    'dataset': {
        'total_utterances': len(df),
        'transcripts': int(df['transcript_id'].nunique()),
        'unique_topics': int(df['topic'].nunique()),
        'topic_categories': int(df['topic_category'].nunique()),
        'cognitive_actions': len(all_actions)
    },
    'topic_distribution': {
        'top_10_topics': {
            topic: int(count) for topic, count in topic_counts.head(10).items()
        },
        'category_distribution': {
            category: int(count) for category, count in category_counts.items()
        }
    },
    'topic_signatures': {
        topic: {
            'utterances': int(data['utterances']),
            'transcripts': int(data['transcripts']),
            'mi_quality_high_pct': float(data['mi_quality_high_pct']),
            'top_5_actions': [a for a, _ in data['top_actions'][:5]]
        }
        for topic, data in list(topic_signatures.items())[:5]
    },
    'change_talk_by_topic': {
        category: {
            'change_pct': float(cat_stats['change_pct']),
            'sustain_pct': float(cat_stats['sustain_pct']),
            'total': int(cat_stats['total'])
        }
        for category, cat_stats in sorted_topics
    },
    'predictive_modeling': {
        'cross_val_accuracy_mean': float(scores.mean()),
        'cross_val_accuracy_std': float(scores.std()),
        'top_15_features': importances.head(15)['feature'].tolist(),
        'topic_feature_importance': {
            feat: float(imp) for feat, imp in zip(importances['feature'], importances['importance'])
            if 'topic_' in feat
        }
    }
}

# Save summary
with open('output/analysis_AnnoMI/comprehensive_topic_analysis_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("\n✅ Summary saved: output/analysis_AnnoMI/comprehensive_topic_analysis_summary.json")

print("\n" + "="*80)
print("COMPREHENSIVE TOPIC-AWARE ANALYSIS COMPLETE!")
print("="*80)
print("\nGenerated visualizations:")
print("   1. comprehensive_1_topic_distribution.png")
print("   2. comprehensive_2_topic_signatures.png")
print("   3. comprehensive_3_synchrony_by_topic.png")
print("   4. comprehensive_4_change_talk_by_topic.png")
print("   5. comprehensive_5_topic_aware_prediction.png")
print("\nData exports:")
print("   - comprehensive_topic_analysis_summary.json")
print("\n🔍 Key Insights:")
print(f"   - Analyzed {len(df['topic'].unique())} unique therapy topics across {df['transcript_id'].nunique()} transcripts")
print(f"   - Topic-aware model accuracy: {scores.mean():.1%}")
print(f"   - Identified topic-specific cognitive signatures and MI technique patterns")
# sorted_topics is empty when no category had enough labelled client
# utterances; skip the headline instead of failing on sorted_topics[0].
if sorted_topics:
    top_category, top_stats = sorted_topics[0]
    print(f"   - Topic category with highest change talk: {top_category} ({top_stats['change_pct']:.1f}%)")