# Carl Rogers Therapy Transcripts - Cognitive Action Analysis

This notebook analyzes therapy transcripts from Carl Rogers using universal cognitive action probe inference.

**Features:**
- ✅ Load transcripts pre-parsed from the source PDF, with speaker identification (therapist/client)
- ✅ Universal probe inference for cognitive action detection
- ✅ Cognitive action network visualization per transcript
- ✅ Combined network analysis across all transcripts
- ✅ Separate therapist and client network analysis

**Requirements:**
- Google Colab with GPU (T4 or better)
- ~15 GB VRAM
- Runtime: ~2-3 hours

## 1️⃣ Check GPU and Setup

In [None]:
# Check GPU availability
!nvidia-smi

import torch
print("\n" + "="*60)
print("GPU INFORMATION")
print("="*60)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️  WARNING: No GPU detected! This will be very slow on CPU.")
print("="*60)

## 2️⃣ Clone Repository and Install Dependencies

In [None]:
import os
import sys

# Clone the repository
repo_url = "https://github.com/ChuloIva/brije.git"
repo_name = "brije"

if not os.path.exists(repo_name):
    print("📥 Cloning Brije repository...")
    !git clone {repo_url}
    print("✅ Repository cloned successfully!")
else:
    print("✅ Repository already exists")
    print("🔄 Pulling latest changes...")
    !cd {repo_name} && git pull

# Change to repo directory
os.chdir(repo_name)
print(f"\n📁 Current directory: {os.getcwd()}")

In [None]:
# Install dependencies
print("📦 Installing dependencies...\n")

# Core dependencies
print("Installing core packages...")
!pip install -q torch transformers h5py scikit-learn tqdm matplotlib seaborn pandas networkx pypdf2 pdfplumber

# Clone and install nnsight
nnsight_dir = "third_party/nnsight"
nnsight_repo = "https://github.com/ndif-team/nnsight"

print("\n📦 Setting up nnsight...")
if not os.path.exists(nnsight_dir) or not os.listdir(nnsight_dir):
    print("   Cloning nnsight repository...")
    os.makedirs("third_party", exist_ok=True)
    !git clone {nnsight_repo} {nnsight_dir}
    print("   ✅ nnsight repository cloned")
else:
    print("   ✅ nnsight repository already exists")

# Install nnsight
print("   Installing nnsight...")
!pip install -q -e {nnsight_dir}

print("\n✅ All dependencies installed!")

## 3️⃣ Verify Pre-trained Probes

In [None]:
import glob

# Locate the per-layer probe folders expected under the repository's data directory
local_probes_dir = 'data/probes_binary'
probe_dirs = glob.glob(local_probes_dir + '/layer_*')

preview_limit = 5  # only the first few layers (in sorted path order) are listed individually
hidden_layer_count = len(probe_dirs) - preview_limit

if not probe_dirs:
    print("⚠️  No probes found in:", local_probes_dir)
    print("\n💡 Please train probes first using:")
    print("   Brije_Full_Pipeline_Colab.ipynb")
else:
    print("✅ Found pre-trained probes in repository")
    print(f"\n📊 Available probe layers: {len(probe_dirs)}")

    for probe_dir in sorted(probe_dirs)[:preview_limit]:
        # Folder names look like "layer_<n>"; strip the prefix for display
        layer_num = os.path.basename(probe_dir).replace('layer_', '')
        probe_files = glob.glob(probe_dir + "/probe_*.pth")
        print(f"   Layer {layer_num}: {len(probe_files)} probes")

    if hidden_layer_count > 0:
        print(f"   ... and {hidden_layer_count} more layers")

## 4️⃣ Log in to Hugging Face (for Gemma 3 4B)

In [None]:
from huggingface_hub import notebook_login

# Start the Hugging Face login flow from inside the notebook. Run this before the
# engine cell below, which loads 'google/gemma-3-4b-it'.
# NOTE(review): presumably needed because that checkpoint requires an authenticated
# account — confirm.
notebook_login()

## 5️⃣ Load Pre-Parsed Transcripts

The transcripts have been pre-parsed from the raw PDF into clean CSV/JSON format.

**Available formats:**
- `all_sessions_combined.csv` - All sessions in flat CSV format
- `all_sessions_combined.json` - All sessions with full metadata
- Individual session files: `{session_name}_session.csv/json`

In [None]:
import json
import pandas as pd
from pathlib import Path

# Path to parsed transcripts
data_dir = Path('output/carl_rogers_analysis')
csv_file = data_dir / 'all_sessions_combined.csv'
json_file = data_dir / 'all_sessions_combined.json'

print("📥 Loading pre-parsed transcripts...")

# Load from CSV (flat format)
df = pd.read_csv(csv_file)

# read_csv parses empty cells as NaN (a float). Later cells call len() on and
# slice the 'text' value, which raises TypeError on a float, so make the column
# plain strings here. Empty texts are then simply treated as very short utterances.
df['text'] = df['text'].fillna('').astype(str)

print(f"✅ Loaded {len(df)} total utterances from {df['session'].nunique()} sessions")

# Statistics
total_utterances = len(df)
therapist_utterances = len(df[df['speaker'] == 'therapist'])
client_utterances = len(df[df['speaker'] == 'client'])

print(f"\n📊 Dataset Statistics:")
print(f"   Total utterances: {total_utterances}")
print(f"   Therapist (Rogers) utterances: {therapist_utterances}")
print(f"   Client utterances: {client_utterances}")
print(f"\n   Sessions: {', '.join(df['session'].unique())}")

# Also load JSON for detailed metadata (per-session name, client name and year
# are read from it when the transcript records are built)
with open(json_file, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

print(f"\n   Years covered: {sorted(set(s['year'] for s in json_data['sessions']))}")

In [None]:
# Banner for the preview that is printed at the end of this cell
print("=" * 80)
print("SAMPLE TRANSCRIPTS")
print("=" * 80)

# One transcript record per session listed in the JSON metadata; each session's
# utterances are pulled from the flat dataframe by matching the session name.
transcripts = []

for session_data in json_data['sessions']:
    session_name = session_data['session_name']
    session_df = df[df['session'] == session_name]

    # utterance_id is the row's index label in the combined dataframe
    utterances = [
        {
            'utterance_id': idx,
            'speaker': row['speaker'],
            'speaker_name': row['speaker_name'],
            'text': row['text'],
            'turn_number': row['turn_number'],
        }
        for idx, row in session_df.iterrows()
    ]

    transcripts.append({
        'transcript_id': session_name,
        'title': f"{session_data['client_name']} Session ({session_data['year']})",
        'session_name': session_name,
        'client_name': session_data['client_name'],
        'year': session_data['year'],
        'utterances': utterances
    })

# Preview the first three sessions
speaker_labels = {'therapist': "THERAPIST"}  # every other speaker is shown as CLIENT
for transcript in transcripts[:3]:
    turns = transcript['utterances']
    print(f"\n📄 {transcript['session_name']}: {transcript['title']}")
    print(f"   Total utterances: {len(turns)}")

    turn_speakers = [u['speaker'] for u in turns]
    therapist_count = turn_speakers.count('therapist')
    client_count = turn_speakers.count('client')
    print(f"   Therapist: {therapist_count}, Client: {client_count}")

    print(f"\n   First 5 utterances:")
    for utterance in turns[:5]:
        speaker = speaker_labels.get(utterance['speaker'], "CLIENT   ")
        # Truncate long turns to 100 characters for display
        if len(utterance['text']) > 100:
            text = utterance['text'][:100] + "..."
        else:
            text = utterance['text']
        print(f"      [{speaker}] {text}")

In [None]:
# Per-session utterance totals, split by speaker
print("\n" + "=" * 80)
print("TRANSCRIPT DISTRIBUTION")
print("=" * 80)

print(f"\nAll {len(transcripts)} sessions:")
for t in transcripts:
    session_df = df[df['session'] == t['session_name']]
    speaker_column = session_df['speaker']
    t_count = int((speaker_column == 'therapist').sum())
    c_count = int((speaker_column == 'client').sum())

    # Titles longer than 45 characters are cut and marked with an ellipsis
    full_title = t['title']
    if len(full_title) > 45:
        title_preview = full_title[:45] + "..."
    else:
        title_preview = full_title

    print(f"   {t['session_name']:15s} | {title_preview:48s} | Total: {len(session_df):3d} (T: {t_count:3d}, C: {c_count:3d})")

## 6️⃣ Optional: Quality Check

Verify the parsed data looks correct.

In [None]:
# Quality check - scan for utterance lengths that suggest a parsing problem
print("🔍 DATA QUALITY CHECK\n")

short_utterances = []  # (session, speaker, length, full text) for texts under 20 chars
long_utterances = []   # (session, speaker, length, first 100 chars) for texts over 1000 chars

for session, speaker, text in zip(df['session'], df['speaker'], df['text']):
    text_len = len(text)
    if text_len < 20:
        short_utterances.append((session, speaker, text_len, text))
    if text_len > 1000:
        long_utterances.append((session, speaker, text_len, text[:100]))

if short_utterances:
    print(f"⚠️  Found {len(short_utterances)} very short utterances (< 20 chars)")
    print("   First 5 examples:")
    for session, speaker, length, text in short_utterances[:5]:
        print(f"      Session {session} [{speaker}] ({length} chars): {text}")

if long_utterances:
    print(f"\n📝 Found {len(long_utterances)} very long utterances (> 1000 chars)")
    print("   These are usually complete therapy turns - normal for transcripts")

# Therapist-to-client turn ratio for every session (reported as 0 when a session
# has no client turns)
print(f"\n📊 Speaker balance per session:")
for session_name in df['session'].unique():
    session_df = df[df['session'] == session_name]
    speaker_column = session_df['speaker']
    t_count = int((speaker_column == 'therapist').sum())
    c_count = int((speaker_column == 'client').sum())
    if c_count > 0:
        ratio = t_count / c_count
    else:
        ratio = 0
    print(f"   {session_name:15s}: T/C ratio = {ratio:.2f} (T:{t_count:3d}, C:{c_count:3d})")

print(f"\n✅ Data quality check complete!")

## 7️⃣ Initialize Universal Probe Inference Engine

In [None]:
# Make the repository's probe modules importable. The path is built from the
# current working directory, so this must run from the repository root.
probes_src_dir = os.path.join(os.getcwd(), 'src', 'probes')
sys.path.insert(0, probes_src_dir)

from universal_multi_layer_inference import UniversalMultiLayerInferenceEngine
from pathlib import Path

# Engine settings gathered in one place
engine_config = dict(
    probes_base_dir=Path('data/probes_binary'),
    model_name='google/gemma-3-4b-it',
    layer_range=(21, 30),  # Layers 21-30
)

print("🚀 Initializing Universal Probe Inference Engine...\n")

engine = UniversalMultiLayerInferenceEngine(**engine_config)

print("\n✅ Engine initialized and ready!")

## 8️⃣ Run Cognitive Action Inference

Analyze all utterances and detect cognitive actions.

**Filtering:** An action is kept only if it is active on **more than 2 layers** OR reaches **100% confidence** on at least one layer.

In [None]:
import time
from tqdm import tqdm
import json
from collections import defaultdict

print("🧠 Running cognitive action inference...\n")

# Create output directory
output_dir = Path('output/carl_rogers_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

# Process each transcript
annotated_transcripts = []
start_time = time.time()

for transcript in tqdm(transcripts, desc="Processing transcripts"):
    annotated_utterances = []

    for utterance in tqdm(transcript['utterances'], desc=f"  {transcript['session_name']}", leave=False):
        utterance_text = utterance['text']

        # Skip very short utterances
        if len(utterance_text) < 10:
            continue

        try:
            # Per-action summary, aggregated across layers with "max"
            action_preds = engine.predict_by_action(
                utterance_text,
                threshold=0.1,
                aggregation="max"
            )

            # Per-layer predictions, needed for the layer-count filter below.
            # NOTE(review): this is a second engine call on the same text. If each
            # call does its own forward pass, inference time is doubled — check
            # whether one result can be derived from the other.
            all_layer_preds = engine.predict_all(
                utterance_text,
                threshold=0.1
            )

            # Group by action with layer info (active predictions only)
            action_layer_map = defaultdict(list)
            for pred in all_layer_preds:
                if pred.is_active:
                    action_layer_map[pred.action_name].append({
                        'layer': pred.layer,
                        'confidence': pred.confidence
                    })

            # FILTER: Keep only actions with >2 layers OR 100% confidence
            filtered_predictions = {}
            for action_name, action_info in action_preds.items():
                if not action_info.get('is_active', False):
                    continue

                # .get() rather than indexing: indexing a defaultdict would insert
                # an empty list for this action into action_layer_details.
                layer_hits = action_layer_map.get(action_name, [])
                num_layers = len(layer_hits)
                max_confidence = max(
                    (hit['confidence'] for hit in layer_hits),
                    default=0
                )

                if num_layers > 2 or max_confidence >= 1.0:
                    filtered_predictions[action_name] = action_info

            annotated_utterances.append({
                'utterance_id': utterance['utterance_id'],
                'speaker': utterance['speaker'],
                'text': utterance_text,
                'predictions': filtered_predictions,
                'action_layer_details': dict(action_layer_map)
            })

        except Exception as e:
            # Best effort: one failing utterance should not abort a multi-hour run,
            # but report which one failed so it can be investigated afterwards.
            print(
                f"\n⚠️  Error processing utterance {utterance['utterance_id']} "
                f"in {transcript['session_name']}: {e}"
            )
            continue

    annotated_transcripts.append({
        'transcript_id': transcript['transcript_id'],
        'session_name': transcript['session_name'],
        'title': transcript['title'],
        'utterances': annotated_utterances
    })

elapsed_time = time.time() - start_time

print(f"\n✅ Inference complete!")
print(f"   Time elapsed: {elapsed_time/3600:.2f} hours")
# Guard the average: with no transcripts the division would raise before the save below
if transcripts:
    print(f"   Average time per transcript: {elapsed_time/len(transcripts):.2f} seconds")

# Save annotated transcripts. default=str stringifies any value json cannot
# encode natively.
output_file = output_dir / 'annotated_transcripts.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(annotated_transcripts, f, indent=2, default=str)

print(f"\n💾 Saved to: {output_file}")

## 9️⃣ Cognitive Action Network - Per Transcript

Create network visualizations for each transcript (therapist and client separately).

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

sns.set_style('whitegrid')

def create_cognitive_action_network(utterances, speaker_filter=None, min_edge_weight=2):
    """
    Build a directed graph of cognitive actions from annotated utterances.

    Edge counts come from two sources that share a single counter:
      * co-occurrence - every pair of active actions inside one utterance,
        keyed by the sorted pair
      * transition - every (action in utterance i, action in utterance i+1)
        pair, keyed in that order

    Args:
        utterances: List of annotated utterances; each has a 'speaker' and a
            'predictions' dict mapping action name to a dict with 'is_active'
        speaker_filter: 'therapist' or 'client' to keep only that speaker's
            utterances, None for all
        min_edge_weight: Minimum combined count for an edge to be added to the graph

    Returns:
        Tuple (G, node_weights, edge_weights): the graph, a Counter of how many
        utterances each action was active in, and a Counter of all edge counts
        (not filtered by min_edge_weight)
    """
    if speaker_filter:
        utterances = [u for u in utterances if u['speaker'] == speaker_filter]

    def active_actions(utterance):
        # Names of the actions flagged active; a missing flag counts as inactive
        return [name for name, info in utterance['predictions'].items()
                if info.get('is_active', False)]

    node_weights = Counter()
    edge_weights = Counter()

    # Pass 1: node counts and within-utterance co-occurrence pairs
    for utterance in utterances:
        present = active_actions(utterance)
        node_weights.update(present)
        for position in range(len(present)):
            for partner in present[position + 1:]:
                pair = tuple(sorted((present[position], partner)))
                edge_weights[pair] += 1

    # Pass 2: transitions from each utterance to the one that follows it
    for earlier, later in zip(utterances, utterances[1:]):
        sources = active_actions(earlier)
        targets = active_actions(later)
        edge_weights.update((src, dst) for src in sources for dst in targets)

    # Assemble the graph: every counted action is a node, edges need enough weight
    G = nx.DiGraph()
    for action, count in node_weights.items():
        G.add_node(action, weight=count)
    for (src, dst), count in edge_weights.items():
        if count < min_edge_weight:
            continue
        G.add_edge(src, dst, weight=count)

    return G, node_weights, edge_weights

def visualize_network(G, title, output_path, top_n_nodes=20):
    """
    Draw the highest-degree part of a cognitive action network and save it as an image.

    Args:
        G: Graph whose edges carry a 'weight' attribute
        title: Figure title; also used in the warning messages
        output_path: Where the image is written (its directory must already exist)
        top_n_nodes: Number of highest-degree nodes to keep in the drawing

    Returns:
        None. Prints a warning and writes nothing when there are no nodes to draw.
    """
    if len(G.nodes()) == 0:
        print(f"⚠️  No nodes in graph for {title}")
        return

    # Get top N nodes by degree
    node_degrees = dict(G.degree())
    top_nodes = sorted(node_degrees.items(), key=lambda x: x[1], reverse=True)[:top_n_nodes]
    top_node_names = [node for node, _ in top_nodes]

    # Create subgraph
    G_sub = G.subgraph(top_node_names).copy()

    if len(G_sub.nodes()) == 0:
        print(f"⚠️  No nodes after filtering for {title}")
        return

    # Layout (fixed seed so repeated runs give the same picture)
    pos = nx.spring_layout(G_sub, k=2, iterations=50, seed=42)

    # Node sizes scale with degree inside the subgraph. A node whose edges were all
    # dropped has degree 0 and would be drawn with size 0 (a label with no marker),
    # so every node gets at least one unit of size.
    node_sizes = [max(G_sub.degree(node), 1) * 200 for node in G_sub.nodes()]

    # Edge widths based on weight, scaled so the heaviest edge is 5 wide
    edges = list(G_sub.edges())
    weights = [G_sub[u][v]['weight'] for u, v in edges]
    max_weight = max(weights) if weights else 1
    edge_widths = [w / max_weight * 5 for w in weights]

    # Draw on an explicit figure and always close it, even if drawing or saving
    # fails, so figures do not accumulate when this is called in a loop.
    fig, ax = plt.subplots(figsize=(16, 12))
    try:
        nx.draw_networkx_nodes(G_sub, pos, node_size=node_sizes,
                               node_color='lightblue', alpha=0.7,
                               edgecolors='black', linewidths=1.5, ax=ax)

        nx.draw_networkx_labels(G_sub, pos, font_size=9, font_weight='bold', ax=ax)

        nx.draw_networkx_edges(G_sub, pos, width=edge_widths, alpha=0.4,
                               arrows=True, arrowsize=15, edge_color='gray',
                               connectionstyle='arc3,rad=0.1', ax=ax)

        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.axis('off')
        fig.tight_layout()
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
    finally:
        plt.close(fig)

    print(f"✅ Saved: {output_path}")

print("📊 Creating cognitive action networks per transcript...\n")

# All per-session figures go into a 'networks' folder under the output directory
network_dir = output_dir / 'networks'
network_dir.mkdir(exist_ok=True)

for transcript in annotated_transcripts:
    session_name = transcript['session_name']
    title = transcript['title']
    utterances = transcript['utterances']

    print(f"\n{session_name}: {title}")

    # Therapist side: build the graph, draw it only if it has any nodes
    G_therapist, node_weights_t, edge_weights_t = create_cognitive_action_network(
        utterances,
        speaker_filter='therapist',
        min_edge_weight=1,
    )
    if G_therapist.number_of_nodes() > 0:
        therapist_png = network_dir / f"{session_name}_therapist.png"
        visualize_network(
            G_therapist,
            f"Therapist Cognitive Actions\n{session_name}",
            therapist_png,
        )

    # Client side: same steps with the client's utterances
    G_client, node_weights_c, edge_weights_c = create_cognitive_action_network(
        utterances,
        speaker_filter='client',
        min_edge_weight=1,
    )
    if G_client.number_of_nodes() > 0:
        client_png = network_dir / f"{session_name}_client.png"
        visualize_network(
            G_client,
            f"Client Cognitive Actions\n{session_name}",
            client_png,
        )

print("\n✅ Per-transcript networks complete!")

## 🔟 Combined Network Analysis

Aggregate cognitive action networks across all transcripts.

In [None]:
print("📊 Creating combined network analysis...\n")

# Flat list of every annotated utterance (used for the frequency counts and the
# summary further down)
all_utterances = []
for transcript in annotated_transcripts:
    all_utterances.extend(transcript['utterances'])


def build_combined_network(session_records, speaker_filter, min_edge_weight):
    """
    Aggregate per-session action networks into one graph.

    Counts are collected session by session and then summed. Passing the
    concatenated utterances of all sessions to create_cognitive_action_network
    would also count a "transition" from the last utterance of one session to the
    first utterance of the next, which is not a real conversational transition.

    Args:
        session_records: List of annotated transcripts, each with an 'utterances' list
        speaker_filter: 'therapist' or 'client'
        min_edge_weight: Minimum summed count for an edge to be added to the graph

    Returns:
        Tuple (G, node_weights, edge_weights) with the same meaning as the
        return value of create_cognitive_action_network
    """
    combined_nodes = Counter()
    combined_edges = Counter()
    for record in session_records:
        # Only the raw counters are needed here; the per-session graph is discarded
        _, session_nodes, session_edges = create_cognitive_action_network(
            record['utterances'], speaker_filter=speaker_filter, min_edge_weight=1
        )
        combined_nodes.update(session_nodes)
        combined_edges.update(session_edges)

    G = nx.DiGraph()
    for action, weight in combined_nodes.items():
        G.add_node(action, weight=weight)
    for (source, target), weight in combined_edges.items():
        if weight >= min_edge_weight:
            G.add_edge(source, target, weight=weight)

    return G, combined_nodes, combined_edges


# Combined therapist network
print("Creating combined therapist network...")
G_therapist_combined, node_weights_t, edge_weights_t = build_combined_network(
    annotated_transcripts, speaker_filter='therapist', min_edge_weight=3
)

visualize_network(
    G_therapist_combined,
    "Therapist Cognitive Actions (All Transcripts)",
    output_dir / "combined_therapist_network.png",
    top_n_nodes=25
)

# Combined client network
print("Creating combined client network...")
G_client_combined, node_weights_c, edge_weights_c = build_combined_network(
    annotated_transcripts, speaker_filter='client', min_edge_weight=3
)

visualize_network(
    G_client_combined,
    "Client Cognitive Actions (All Transcripts)",
    output_dir / "combined_client_network.png",
    top_n_nodes=25
)

print("\n✅ Combined networks complete!")

## 1️⃣1️⃣ Network Statistics & Analysis

In [None]:
print("="*80)
print("NETWORK ANALYSIS STATISTICS")
print("="*80)

def analyze_network(G, name):
    """
    Print summary statistics for a network.

    Always prints node count, edge count and density. For a non-empty graph it
    also prints the ten highest-scoring actions by degree centrality and by
    PageRank.

    Args:
        G: The graph to describe
        name: Heading printed above the statistics
    """
    print(f"\n{name}:")
    print(f"   Nodes: {G.number_of_nodes()}")
    print(f"   Edges: {G.number_of_edges()}")
    print(f"   Density: {nx.density(G):.3f}")

    if G.number_of_nodes() <= 0:
        return

    # Both measures are computed up front, then reported in the same format
    rankings = (
        ("Degree Centrality", nx.degree_centrality(G)),
        ("PageRank", nx.pagerank(G)),
    )
    for measure, scores in rankings:
        print(f"\n   Top 10 by {measure}:")
        best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        for action, score in best_first[:10]:
            print(f"      {action:30s} {score:.3f}")

analyze_network(G_therapist_combined, "THERAPIST (Combined)")
analyze_network(G_client_combined, "CLIENT (Combined)")

print("\n" + "="*80)

## 1️⃣2️⃣ Comparison: Therapist vs Client Cognitive Patterns

In [None]:
print("="*80)
print("THERAPIST VS CLIENT COMPARISON")
print("="*80)

# Tally how often each active action appears for each speaker. Any utterance not
# labelled 'therapist' is counted on the client side.
therapist_actions = Counter()
client_actions = Counter()

for utterance in all_utterances:
    actions = [name for name, info in utterance['predictions'].items()
               if info.get('is_active', False)]
    tally = therapist_actions if utterance['speaker'] == 'therapist' else client_actions
    tally.update(actions)

# Top actions, with one bar character per 10 occurrences
top_action_sections = (
    ("\nTop 15 Therapist Actions:", therapist_actions),
    ("\nTop 15 Client Actions:", client_actions),
)
for heading, tally in top_action_sections:
    print(heading)
    for action, count in tally.most_common(15):
        bar = "█" * (count // 10)
        print(f"   {action:30s} {count:4d} {bar}")

# Actions seen for one speaker but never for the other
therapist_only = set(therapist_actions) - set(client_actions)
client_only = set(client_actions) - set(therapist_actions)

exclusive_sections = (
    ("\nActions unique to therapist:", therapist_only),
    ("\nActions unique to client:", client_only),
)
for heading, exclusive in exclusive_sections:
    if not exclusive:
        continue
    print(heading)
    for action in sorted(exclusive):
        print(f"   • {action}")

## 1️⃣3️⃣ Export Summary

In [None]:
# Save summary: headline counts plus size, density and the 20 most frequent
# actions of each combined network
summary = {
    'transcripts_analyzed': len(annotated_transcripts),
    'total_utterances': len(all_utterances),
    'therapist_utterances': sum(1 for u in all_utterances if u['speaker'] == 'therapist'),
    'client_utterances': sum(1 for u in all_utterances if u['speaker'] == 'client'),
    'therapist_network': {
        'nodes': G_therapist_combined.number_of_nodes(),
        'edges': G_therapist_combined.number_of_edges(),
        'density': float(nx.density(G_therapist_combined)),
        'top_actions': [action for action, _ in therapist_actions.most_common(20)]
    },
    'client_network': {
        'nodes': G_client_combined.number_of_nodes(),
        'edges': G_client_combined.number_of_edges(),
        'density': float(nx.density(G_client_combined)),
        'top_actions': [action for action, _ in client_actions.most_common(20)]
    }
}

with open(output_dir / 'analysis_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print("="*80)
print("ANALYSIS COMPLETE!")
print("="*80)
print(f"\n📁 Output directory: {output_dir}")
print(f"\n📊 Files generated:")
print(f"   • annotated_transcripts.json")
print(f"   • analysis_summary.json")
print(f"   • combined_therapist_network.png")
print(f"   • combined_client_network.png")
# Per-session figures are named after the session, not "transcript_<n>"
print("   • networks/<session_name>_therapist.png (per transcript)")
print("   • networks/<session_name>_client.png (per transcript)")
print("\n✅ All done!")