# Open-Ended Coding Analysis

This notebook provides a comprehensive framework for analyzing open-ended qualitative data through:
- **Code Frames**: Systematic coding structures for categorizing data
- **Themes**: Identification and analysis of recurring patterns
- **Categorization**: Multi-level classification and organization of qualitative data

## Features
- Data loading from flat files (CSV, Excel) and databases (SQLite, PostgreSQL)
- Interactive visualizations
- Robust error handling
- Code quality checks via Makefile
- Comprehensive testing framework

## 1. Setup and Imports

In [None]:
# Standard library imports
import os
import sys
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
import logging

# Data manipulation
import pandas as pd
import numpy as np

# Database connections
import sqlite3
from sqlalchemy import create_engine

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# NLP and text analysis
from collections import Counter, defaultdict
import re

# Word cloud for text visualization
try:
    from wordcloud import WordCloud
    WORDCLOUD_AVAILABLE = True
except ImportError:
    WORDCLOUD_AVAILABLE = False
    print("Note: wordcloud not available. Install with: pip install wordcloud")

# Network analysis
try:
    import networkx as nx
    NETWORKX_AVAILABLE = True
except ImportError:
    NETWORKX_AVAILABLE = False
    print("Note: networkx not available. Install with: pip install networkx")

# Add src to path and import modules
sys.path.insert(0, os.path.abspath('.'))
from src.data_loader import DataLoader
from src.code_frame import CodeFrame
from src.theme_analyzer import ThemeAnalyzer
from src.category_manager import CategoryManager

# Configure logging: timestamped records at INFO level for the whole notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Suppress warnings
# NOTE(review): this silences *every* warning category, including deprecation
# warnings from pandas/plotly; narrow the filter if those should stay visible.
warnings.filterwarnings('ignore')

# Set visualization styles.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*"; older
# releases only provide "seaborn-darkgrid". Pick whichever is installed instead
# of failing with OSError on an unknown style name.
_preferred_styles = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_available_style = next(
    (name for name in _preferred_styles if name in plt.style.available),
    None,
)
if _available_style is not None:
    plt.style.use(_available_style)
else:
    logger.warning("No seaborn darkgrid Matplotlib style available; using the default style")
sns.set_palette("husl")

# Set Plotly default template
import plotly.io as pio
pio.templates.default = "plotly_white"

# Status report for the optional dependencies probed above.
print("✓ All imports successful")
print("✓ Imported from src: DataLoader, CodeFrame, ThemeAnalyzer, CategoryManager")
print(f"✓ WordCloud available: {WORDCLOUD_AVAILABLE}")
print(f"✓ NetworkX available: {NETWORKX_AVAILABLE}")

## 2. Data Loading and Helper Functions

Robust data loading from multiple sources with comprehensive error handling is provided by the `DataLoader` class imported from `src.data_loader` in the setup cell above.

### Visualization and Analysis Helper Functions

Reusable functions for efficient analysis and visualization.

In [None]:
def create_code_hierarchy_viz(code_frame: CodeFrame, title: str = "Code Hierarchy"):
    """
    Build a sunburst chart of the code frame's parent/child structure.

    A synthetic root node (id ``'root'``) labelled with the frame name carries
    the sum of all code counts. Every code hangs off its declared parent when
    that parent exists in the frame, otherwise off the root.

    Args:
        code_frame: CodeFrame instance; its ``name`` and ``codes`` mapping are read
        title: Chart title

    Returns:
        Plotly figure with the sunburst chart (height 600).
    """
    codes = code_frame.codes

    root_row = {
        'labels': code_frame.name,
        'parents': '',
        'values': sum(entry['count'] for entry in codes.values()),
        'ids': 'root',
    }

    def _row_for(code_id, entry):
        # Unknown or missing parents are attached directly to the root node.
        declared_parent = entry.get('parent', '')
        attach_to = declared_parent if declared_parent and declared_parent in codes else 'root'
        return {
            'labels': entry['label'],
            'parents': attach_to,
            # Zero-count codes get a value of 1 so they remain visible.
            'values': max(entry['count'], 1),
            'ids': code_id,
        }

    rows = [root_row]
    rows.extend(_row_for(code_id, entry) for code_id, entry in codes.items())

    hierarchy_df = pd.DataFrame(rows)
    figure = px.sunburst(
        hierarchy_df,
        ids='ids',
        names='labels',
        parents='parents',
        values='values',
        title=title,
    )
    figure.update_layout(height=600)
    return figure


def create_code_heatmap(df: pd.DataFrame, code_frame: CodeFrame, title: str = "Code Co-occurrence Heatmap"):
    """
    Create heatmap showing code co-occurrence patterns.

    Only codes whose count in the frame is greater than zero are included.
    Cell (i, j) holds the number of rows whose ``codes`` entry contains both
    code i and code j, so the diagonal is the per-code row frequency.

    Args:
        df: DataFrame with 'codes' column (each entry a container of code ids)
        code_frame: CodeFrame instance
        title: Chart title

    Returns:
        Plotly figure (height 600), or None when no code has a positive count.
    """
    # Get active codes (those with count > 0)
    active_codes = [code_id for code_id, info in code_frame.codes.items() if info['count'] > 0]

    if not active_codes:
        print("No active codes to visualize")
        return None

    # Build a rows x codes 0/1 indicator matrix with a single membership test
    # per (row, code) pair. The co-occurrence counts are then one matrix
    # product instead of a Python-level loop over every code pair for every row.
    n = len(active_codes)
    indicator = np.zeros((len(df), n))
    for row_idx, codes_list in enumerate(df['codes']):
        for col_idx, code in enumerate(active_codes):
            if code in codes_list:
                indicator[row_idx, col_idx] = 1.0

    cooccur = indicator.T @ indicator

    # Get labels
    labels = [code_frame.codes[c]['label'] for c in active_codes]

    # Create heatmap
    fig = px.imshow(
        cooccur,
        labels=dict(x="Code", y="Code", color="Co-occurrences"),
        x=labels,
        y=labels,
        title=title,
        color_continuous_scale='YlOrRd',
        aspect='auto'
    )
    fig.update_layout(height=600)
    return fig


def create_wordcloud_viz(df: pd.DataFrame, title: str = "Response Word Cloud"):
    """
    Create word cloud from responses.

    Missing responses are skipped rather than rendered as the literal word
    "nan". If no response text remains, a message is printed and no figure is
    produced.

    Args:
        df: DataFrame with 'response' column
        title: Chart title

    Returns:
        Matplotlib figure, or None when the wordcloud package is unavailable
        or there is no text to plot.
    """
    if not WORDCLOUD_AVAILABLE:
        print("WordCloud not available. Install with: pip install wordcloud")
        return None

    # Combine all responses; drop NaN/None first so astype(str) does not turn
    # them into "nan" tokens that would show up in the cloud.
    text = ' '.join(df['response'].dropna().astype(str))

    # Bail out instead of letting the generator fail on empty input.
    if not text.strip():
        print("No response text to visualize")
        return None

    # Create word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        max_words=100
    ).generate(text)

    # Display using matplotlib
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, fontweight='bold')
    plt.tight_layout()
    return fig


def create_network_graph(df: pd.DataFrame, code_frame: CodeFrame, title: str = "Code Network"):
    """
    Draw the code co-occurrence network as a Plotly figure.

    Nodes are the codes with a positive count in the frame. An edge joins two
    such codes when they appear together in at least one row of ``df['codes']``;
    its weight is the number of such rows and drives the line width. Node
    marker size grows with the code's count.

    Args:
        df: DataFrame with 'codes' column (each entry a sliceable sequence of code ids)
        code_frame: CodeFrame instance
        title: Chart title

    Returns:
        Plotly figure (height 600), or None when networkx is unavailable or
        no code has a positive count.
    """
    if not NETWORKX_AVAILABLE:
        print("NetworkX not available. Install with: pip install networkx")
        return None

    graph = nx.Graph()

    # One node per active code, carrying the label and count for display.
    for code_id, info in code_frame.codes.items():
        if info['count'] > 0:
            graph.add_node(code_id, label=info['label'], count=info['count'])

    # Tally unordered code pairs per row; sorting makes (a, b) and (b, a) one key.
    pair_counts = Counter(
        tuple(sorted([first, second]))
        for row_codes in df['codes']
        for position, first in enumerate(row_codes)
        for second in row_codes[position + 1:]
    )

    # Keep only pairs whose endpoints are both active codes.
    for (left, right), times_seen in pair_counts.items():
        if left in graph and right in graph:
            graph.add_edge(left, right, weight=times_seen)

    if graph.number_of_nodes() == 0:
        print("No active codes to visualize")
        return None

    layout = nx.spring_layout(graph, k=0.5, iterations=50)

    # One line trace per edge so each edge can have its own width.
    edge_traces = [
        go.Scatter(
            x=[layout[source][0], layout[target][0], None],
            y=[layout[source][1], layout[target][1], None],
            mode='lines',
            line=dict(width=edge_weight * 2, color='#888'),
            hoverinfo='none',
            showlegend=False,
        )
        for source, target, edge_weight in graph.edges(data='weight')
    ]

    # Single marker trace for all nodes.
    node_ids = list(graph.nodes())
    node_attrs = [graph.nodes[node_id] for node_id in node_ids]

    node_trace = go.Scatter(
        x=[layout[node_id][0] for node_id in node_ids],
        y=[layout[node_id][1] for node_id in node_ids],
        mode='markers+text',
        hovertext=[f"{attrs['label']}<br>Count: {attrs['count']}" for attrs in node_attrs],
        hoverinfo='text',
        marker=dict(
            size=[10 + attrs['count'] * 3 for attrs in node_attrs],
            color='lightblue',
            line=dict(width=2, color='darkblue'),
        ),
        # On-chart text is truncated to 10 characters; hover shows the full label.
        text=[attrs['label'][:10] for attrs in node_attrs],
        textposition='top center',
        showlegend=False,
    )

    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    figure = go.Figure(data=[*edge_traces, node_trace])
    figure.update_layout(
        title=title,
        showlegend=False,
        hovermode='closest',
        height=600,
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
    )

    return figure


def create_treemap(code_frame: CodeFrame, title: str = "Code Distribution Treemap"):
    """
    Create treemap visualization of code distribution.

    Codes with a positive count are shown together with all of their
    ancestors, even when an ancestor's own count is zero, so that no displayed
    code points at a parent that is missing from the chart. Nodes are keyed by
    code id rather than by label, so two codes sharing a label stay distinct.

    Args:
        code_frame: CodeFrame instance
        title: Chart title

    Returns:
        Plotly figure with the treemap (height 600).
    """
    codes = code_frame.codes
    root_id = '__root__'

    # Collect the active codes plus their ancestor chain. The
    # `current not in visible` test also stops on cyclic parent links.
    visible = set()
    for code_id, code_info in codes.items():
        if code_info['count'] > 0:
            current = code_id
            while current and current in codes and current not in visible:
                visible.add(current)
                current = codes[current].get('parent', '')

    # Root node carries no value of its own; it only anchors the hierarchy.
    treemap_data = [{
        'ids': root_id,
        'labels': code_frame.name,
        'parents': '',
        'values': 0
    }]

    for code_id, code_info in codes.items():
        if code_id not in visible:
            continue
        parent = code_info.get('parent', '')
        treemap_data.append({
            'ids': code_id,
            'labels': code_info['label'],
            # Codes without a known parent attach directly to the root.
            'parents': parent if parent and parent in codes else root_id,
            'values': code_info['count']
        })

    df_treemap = pd.DataFrame(treemap_data)

    fig = px.treemap(
        df_treemap,
        ids='ids',
        names='labels',
        parents='parents',
        values='values',
        title=title
    )
    fig.update_layout(height=600)
    return fig


def analyze_dataset(filepath: str, code_frame: CodeFrame, dataset_name: str, 
                   color_scheme: str = 'Blues'):
    """
    Complete analysis pipeline for a dataset.

    Loads the CSV, applies the code frame to every response
    (case-insensitively), prints and displays summaries, and shows six
    visualizations: bar chart, sunburst, treemap, co-occurrence heatmap,
    network graph and word cloud. The last three are skipped when their helper
    returns None.

    Args:
        filepath: Path to CSV file
        code_frame: Configured CodeFrame instance
        dataset_name: Name for display
        color_scheme: Plotly color scheme

    Returns:
        DataFrame with coded responses (a 'codes' column is added)

    Raises:
        KeyError: if the loaded data has no 'response' column.
    """
    # Load data
    loader = DataLoader()
    df = loader.load_csv(filepath)

    # Fail early with a message naming the file and the columns actually found.
    if 'response' not in df.columns:
        raise KeyError(
            f"'response' column not found in {filepath}; "
            f"available columns: {list(df.columns)}"
        )

    # Apply codes
    df['codes'] = df['response'].apply(
        lambda x: code_frame.apply_codes(x, case_sensitive=False)
    )

    # Display sample
    print(f"\n{'='*60}")
    print(f"{dataset_name} Analysis")
    print(f"{'='*60}")
    print(f"Loaded {len(df)} responses")
    print("\nSample coded responses:")
    display(df[['response', 'codes']].head(5))

    # Code summary (only codes that matched at least once)
    summary = code_frame.summary()
    active_summary = summary[summary['Count'] > 0]

    print(f"\nCode Summary (showing {len(active_summary)} active codes):")
    display(active_summary.head(10))

    # Visualizations
    print("\nGenerating visualizations...")

    # 1. Bar chart
    fig = px.bar(
        active_summary,
        x='Label',
        y='Count',
        color='Count',
        title=f'{dataset_name}: Code Distribution',
        color_continuous_scale=color_scheme
    )
    fig.update_layout(xaxis_tickangle=-45, height=500)
    fig.show()

    # 2. Hierarchical sunburst
    fig = create_code_hierarchy_viz(code_frame, f"{dataset_name}: Code Hierarchy")
    fig.show()

    # 3. Treemap
    fig = create_treemap(code_frame, f"{dataset_name}: Code Distribution Treemap")
    fig.show()

    # The remaining helpers return None when they have nothing to draw or an
    # optional dependency is missing; test for None explicitly rather than
    # relying on the truthiness of figure objects.

    # 4. Co-occurrence heatmap
    fig = create_code_heatmap(df, code_frame, f"{dataset_name}: Code Co-occurrence")
    if fig is not None:
        fig.show()

    # 5. Network graph
    fig = create_network_graph(df, code_frame, f"{dataset_name}: Code Network")
    if fig is not None:
        fig.show()

    # 6. Word cloud (a Matplotlib figure, hence plt.show())
    fig = create_wordcloud_viz(df, f"{dataset_name}: Word Cloud")
    if fig is not None:
        plt.show()

    return df


# Confirm that the helper cell ran and list what it defined.
print("✓ Helper functions defined")
for helper_entry in (
    "create_code_hierarchy_viz()",
    "create_code_heatmap()",
    "create_wordcloud_viz()",
    "create_network_graph()",
    "create_treemap()",
    "analyze_dataset() [Complete analysis pipeline]",
):
    print(f"  - {helper_entry}")

## 3. Initialize Data Loader

Create data loader instance for loading from various sources.

In [None]:
# Shared DataLoader instance used by the dataset-loading cells below.
data_loader = DataLoader()
print("✓ DataLoader initialized")

# List the source types announced for this loader.
print("\nReady to load data from:")
for source_kind in ("CSV files", "Excel files", "SQLite databases", "PostgreSQL databases"):
    print(f"  - {source_kind}")

## 4. Example: Remote Work Analysis

Code frames provide a structured approach to categorizing qualitative data. Define your coding scheme here.

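### 4.1 Load Data and Define Code Frame

> **Note:** the cell below only defines the code frame. The responses must be loaded into a DataFrame named `df` (with a `response` column), for example via `data_loader.load_csv(...)`, before running Section 4.2.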

In [None]:
# Code frame for analyzing remote work experiences.
remote_work_frame = CodeFrame(
    name="Remote Work Analysis",
    description="Coding frame for analyzing remote work experiences",
)

# Main categories (top-level codes): (code id, label, description).
_remote_top_level_codes = [
    ('POSITIVE', 'Positive Experiences', 'Positive aspects of remote work'),
    ('NEGATIVE', 'Negative Experiences', 'Challenges and negative aspects of remote work'),
    ('NEUTRAL', 'Neutral/Mixed', 'Neutral or mixed experiences'),
]
for _top_level_spec in _remote_top_level_codes:
    remote_work_frame.add_code(*_top_level_spec)

# Sub-codes: (code id, label, description, keywords, parent code id).
# Positive sub-codes come first, then negative ones, in registration order.
_remote_sub_codes = [
    ('POS_FLEX', 'Flexibility', 'Flexibility in schedule and location',
     ['flexibility', 'flexible', 'autonomy', 'schedule'], 'POSITIVE'),
    ('POS_BALANCE', 'Work-Life Balance', 'Improved work-life balance',
     ['work-life balance', 'family', 'personal activities', 'time management'], 'POSITIVE'),
    ('POS_PROD', 'Productivity', 'Increased productivity',
     ['productivity', 'productive', 'focus', 'efficient'], 'POSITIVE'),
    ('POS_COST', 'Cost Savings', 'Financial benefits',
     ['cost savings', 'commuting', 'save money', 'reduced stress'], 'POSITIVE'),
    ('NEG_COMM', 'Communication Issues', 'Communication and collaboration challenges',
     ['communication', 'challenges', 'video call', 'fatigue'], 'NEGATIVE'),
    ('NEG_SOCIAL', 'Social Isolation', 'Lack of social interaction',
     ['isolated', 'social', 'miss', 'lonely', 'relationships'], 'NEGATIVE'),
    ('NEG_TECH', 'Technical Issues', 'Technology and infrastructure problems',
     ['technology', 'internet', 'connectivity', 'technical'], 'NEGATIVE'),
    ('NEG_BOUND', 'Work-Life Boundaries', 'Difficulty maintaining boundaries',
     ['separating', 'boundaries', 'motivated', 'personal life'], 'NEGATIVE'),
]
for _sub_id, _sub_label, _sub_description, _sub_keywords, _sub_parent in _remote_sub_codes:
    remote_work_frame.add_code(
        _sub_id,
        _sub_label,
        _sub_description,
        keywords=_sub_keywords,
        parent=_sub_parent,
    )

_remote_code_total = len(remote_work_frame.codes)
print(f"\n✓ Code frame '{remote_work_frame.name}' created with {_remote_code_total} codes")
print(f"\nHierarchy: {remote_work_frame.get_hierarchy()}")

### 4.2 Apply Codes and Basic Analysis

In [None]:
# Fail fast with an actionable message: no earlier visible cell assigns `df`,
# so without this guard the first statement below dies with a bare NameError.
if 'df' not in globals():
    raise NameError(
        "name 'df' is not defined: load the remote-work responses into `df` "
        "(for example with data_loader.load_csv(<path>)) before running this cell"
    )

# Apply codes to each response (keyword matching is case-insensitive)
df['codes'] = df['response'].apply(
    lambda x: remote_work_frame.apply_codes(x, case_sensitive=False)
)

# Create binary (0/1) indicator columns, one per code in the frame.
# code_id is bound as a default so each lambda keeps its own value.
for code_id in remote_work_frame.codes:
    df[f'code_{code_id}'] = df['codes'].apply(
        lambda assigned, code_id=code_id: 1 if code_id in assigned else 0
    )

print("\nCoded Responses:")
display(df[['response', 'codes']].head(10))

# Show code summary
print("\nCode Summary:")
code_summary = remote_work_frame.summary()
display(code_summary)

### 4.3 Advanced Visualizations

In [None]:
# --- Code frequency bar chart -------------------------------------------
_code_bar_options = dict(
    x='Label',
    y='Count',
    color='Count',
    title='Code Distribution in Responses',
    labels={'Label': 'Code', 'Count': 'Frequency'},
    color_continuous_scale='Blues',
)
fig = px.bar(code_summary, **_code_bar_options)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()


# --- Hierarchical sunburst chart ----------------------------------------
def _parent_label_of(code_info):
    """Return the label of the code's parent, or '' for a code without parent."""
    parent_id = code_info.get('parent', '')
    if not parent_id:
        return ''
    return remote_work_frame.codes[parent_id]['label']


# One row per code; nodes are identified by label, values are the code counts.
sunburst_data = [
    {
        'labels': code_info['label'],
        'parents': _parent_label_of(code_info),
        'values': code_info['count'],
    }
    for code_id, code_info in remote_work_frame.codes.items()
]

sunburst_df = pd.DataFrame(sunburst_data)
fig = px.sunburst(
    sunburst_df,
    title='Hierarchical Code Structure',
    names='labels',
    parents='parents',
    values='values',
)
fig.show()

## 5. Themes

Identify and analyze recurring themes in the data.

### 5.1 Define Themes

In [None]:
# Theme analyzer that groups codes into higher-level themes.
theme_analyzer = ThemeAnalyzer()

# Theme table: (theme id, name, description, associated code ids).
_theme_definitions = [
    ('THEME_AUTONOMY',
     'Autonomy and Control',
     'Themes related to personal autonomy, control over schedule, and independence',
     ['POS_FLEX', 'POS_BALANCE']),
    ('THEME_PERFORMANCE',
     'Work Performance',
     'Themes related to productivity, efficiency, and work output',
     ['POS_PROD', 'NEG_TECH']),
    ('THEME_CONNECTION',
     'Social Connection',
     'Themes related to social interaction, relationships, and collaboration',
     ['NEG_SOCIAL', 'NEG_COMM']),
    ('THEME_WELLBEING',
     'Personal Wellbeing',
     'Themes related to mental health, stress, and life quality',
     ['POS_COST', 'POS_BALANCE', 'NEG_BOUND']),
]

# Register the themes in table order.
for _theme_id, _theme_name, _theme_description, _theme_codes in _theme_definitions:
    theme_analyzer.define_theme(
        _theme_id,
        _theme_name,
        _theme_description,
        associated_codes=_theme_codes,
    )

_theme_total = len(theme_analyzer.themes)
print(f"\n✓ Defined {_theme_total} themes")

### 5.2 Identify Themes in Data

In [None]:
# Run theme identification; the returned frame (which the preview below reads
# a 'themes' column from) replaces `df`.
df = theme_analyzer.identify_themes(df)

# Preview the first ten responses with their codes and themes.
_theme_preview_columns = ['response', 'codes', 'themes']
print("\nResponses with Identified Themes:")
display(df.loc[:, _theme_preview_columns].head(10))

# Tabular theme summary, reused by the visualization and export cells.
print("\nTheme Summary:")
theme_summary = theme_analyzer.summary()
display(theme_summary)

### 5.3 Visualize Themes

In [None]:
# --- Theme frequency bar chart ------------------------------------------
_theme_bar_options = dict(
    x='Name',
    y='Frequency',
    title='Theme Distribution',
    color='Frequency',
    color_continuous_scale='Viridis',
    hover_data=['Description'],
)
fig = px.bar(theme_summary, **_theme_bar_options)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()

# --- Theme co-occurrence counts (printed, not plotted) ------------------
# theme_counts: how many rows mention each theme.
# theme_pairs: how many rows mention both themes of an unordered pair.
theme_counts = Counter()
theme_pairs = Counter()

for row_themes in df['themes']:
    theme_counts.update(row_themes)

    # Sorting each pair makes (a, b) and (b, a) share one key.
    theme_pairs.update(
        tuple(sorted([first_theme, second_theme]))
        for position, first_theme in enumerate(row_themes)
        for second_theme in row_themes[position + 1:]
    )

print("\nTheme Co-occurrences:")
for (theme_a, theme_b), pair_total in theme_pairs.most_common():
    print(f"{theme_a} <-> {theme_b}: {pair_total}")

## 6. Categorization

Advanced categorization and classification of coded data.

### 6.1 Define Categories

In [None]:
# Category manager holding the two-level categorization scheme.
category_manager = CategoryManager()

# Category table: (category id, name, required code ids, hierarchy level).
# Level 1 = primary sentiment, level 2 = specific aspects.
_category_definitions = [
    ('CAT_POSITIVE', 'Overall Positive',
     ['POS_FLEX', 'POS_BALANCE', 'POS_PROD', 'POS_COST'], 1),
    ('CAT_NEGATIVE', 'Overall Negative',
     ['NEG_COMM', 'NEG_SOCIAL', 'NEG_TECH', 'NEG_BOUND'], 1),
    ('CAT_WORK_FOCUSED', 'Work-Focused',
     ['POS_PROD', 'NEG_TECH', 'NEG_COMM'], 2),
    ('CAT_LIFE_FOCUSED', 'Life-Focused',
     ['POS_BALANCE', 'POS_COST', 'NEG_BOUND'], 2),
    ('CAT_SOCIAL_FOCUSED', 'Social-Focused',
     ['NEG_SOCIAL', 'NEG_COMM'], 2),
]

# Register the categories in table order.
for _cat_id, _cat_name, _cat_codes, _cat_level in _category_definitions:
    category_manager.create_category(
        _cat_id,
        _cat_name,
        {'codes_required': _cat_codes},
        level=_cat_level,
    )

_category_total = len(category_manager.categories)
print(f"\n✓ Created {_category_total} categories")

### 6.2 Apply Categorization

In [None]:
# Run the categorization; the returned frame (which the preview below reads a
# 'categories' column from) replaces `df`.
df = category_manager.categorize(df)

# Preview the first ten rows with every annotation layer side by side.
_category_preview_columns = ['response', 'codes', 'themes', 'categories']
print("\nCategorized Responses:")
display(df.loc[:, _category_preview_columns].head(10))

# Tabular category summary, reused by the visualization and export cells.
print("\nCategory Summary:")
category_summary = category_manager.summary()
display(category_summary)

### 6.3 Visualize Categories

In [None]:
# --- Category counts, colored by hierarchy level -------------------------
_category_bar_options = dict(
    x='Name',
    y='Count',
    color='Level',
    title='Category Distribution by Hierarchical Level',
    barmode='group',
)
fig = px.bar(category_summary, **_category_bar_options)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()

# --- Share of each primary (level 1) category ----------------------------
_is_level_one = category_summary['Level'] == 1
level1_cats = category_summary[_is_level_one]
fig = px.pie(
    level1_cats,
    names='Name',
    values='Count',
    title='Primary Category Distribution',
)
fig.show()

## 7. Comprehensive Analysis & Reporting

## 8. Export Results

In [None]:
# All exports go to ./output, created on first use.
output_dir = Path('output')
output_dir.mkdir(exist_ok=True)

# 1. Coded responses (the fully annotated DataFrame).
output_file = output_dir.joinpath('coded_data.csv')
df.to_csv(output_file, index=False)
print(f"✓ Exported coded data to {output_file}")

# 2. Code summary, recomputed from the frame so it reflects its current state.
code_summary = remote_work_frame.summary()
_code_summary_path = output_dir.joinpath('code_summary.csv')
code_summary.to_csv(_code_summary_path, index=False)
print(f"✓ Exported code summary to {_code_summary_path}")

# 3. Theme summary.
theme_summary = theme_analyzer.summary()
_theme_summary_path = output_dir.joinpath('theme_summary.csv')
theme_summary.to_csv(_theme_summary_path, index=False)
print(f"✓ Exported theme summary to {_theme_summary_path}")

# 4. Category summary.
category_summary = category_manager.summary()
_category_summary_path = output_dir.joinpath('category_summary.csv')
category_summary.to_csv(_category_summary_path, index=False)
print(f"✓ Exported category summary to {_category_summary_path}")

print("\n✓ All results exported successfully!")

---

# PART 2: Additional Dataset Analyses

The following sections demonstrate the framework with four additional datasets covering different research domains.

## 10. Political Leadership Analysis (Trump Dataset)

Analyze political discourse, policy impacts, and public opinion.

In [None]:
# Read the political-leadership responses through the shared loader.
_trump_csv_path = 'data/trump_responses.csv'
df_trump = data_loader.load_csv(_trump_csv_path)

# Quick sanity report: row count, shape, column names and the first rows.
_trump_row_count = len(df_trump)
_trump_columns = df_trump.columns.tolist()
print(f"\nLoaded {_trump_row_count} responses")
print(f"Data shape: {df_trump.shape}")
print(f"Columns: {_trump_columns}")
display(df_trump.head())

### 10.1 Define Political Discourse Code Frame

In [None]:
# Code frame for political discourse analysis.
trump_frame = CodeFrame(
    name="Political Leadership Analysis",
    description="Code frame for analyzing political discourse and policy impacts",
)

# Main categories: (code id, label, description).
_trump_main_codes = [
    ('POLICY', 'Policy Analysis', 'Discussion of policies and their impacts'),
    ('COMMUNICATION', 'Communication Style', 'Communication and rhetoric'),
    ('GOVERNANCE', 'Governance', 'Leadership and governance approach'),
    ('IMPACT', 'Political Impact', 'Political and social impacts'),
]
for _main_spec in _trump_main_codes:
    trump_frame.add_code(*_main_spec)

# Sub-codes: (code id, label, description, keywords, parent code id),
# grouped by parent in the order policy, communication, governance, impact.
_trump_sub_codes = [
    # Policy
    ('POL_ECONOMY', 'Economic Policy', 'Trade, taxation, and economic impacts',
     ['trade', 'economy', 'economic', 'tax', 'taxation', 'business', 'corporate'], 'POLICY'),
    ('POL_FOREIGN', 'Foreign Policy', 'International relations and foreign policy',
     ['foreign', 'international', 'nato', 'russia', 'china', 'diplomatic', 'agreements'], 'POLICY'),
    ('POL_DOMESTIC', 'Domestic Policy', 'Immigration, healthcare, and domestic issues',
     ['immigration', 'healthcare', 'border', 'wall', 'domestic', 'reform'], 'POLICY'),
    ('POL_JUDICIARY', 'Judicial Appointments', 'Court appointments and judicial impact',
     ['court', 'supreme', 'judicial', 'judges', 'appointments', 'nominees'], 'POLICY'),
    # Communication
    ('COMM_MEDIA', 'Media Relations', 'Relationship with media and press coverage',
     ['media', 'press', 'coverage', 'fake news', 'adversarial'], 'COMMUNICATION'),
    ('COMM_SOCIAL', 'Social Media', 'Twitter and social media usage',
     ['twitter', 'social media', 'tweets', 'unprecedented'], 'COMMUNICATION'),
    ('COMM_RHETORIC', 'Political Rhetoric', 'Speaking style and political rhetoric',
     ['rhetoric', 'communication', 'style', 'unconventional', 'theatrical'], 'COMMUNICATION'),
    # Governance
    ('GOV_LEADERSHIP', 'Leadership Style', 'Management and leadership approach',
     ['leadership', 'governance', 'management', 'approach', 'business'], 'GOVERNANCE'),
    ('GOV_LOYALTY', 'Loyalty Dynamics', 'Loyalty expectations and demands',
     ['loyalty', 'demands', 'officials', 'unusual'], 'GOVERNANCE'),
    # Impact
    ('IMP_VOTERS', 'Voter Impact', 'Impact on voter demographics and base',
     ['voters', 'base', 'demographics', 'energized', 'working-class', 'consistent'], 'IMPACT'),
    ('IMP_POLITICAL', 'Political System Impact', 'Impact on political system and norms',
     ['impeachment', 'divisive', 'proceedings', 'political', 'pardons'], 'IMPACT'),
    ('IMP_LASTING', 'Long-term Impact', 'Lasting effects and legacy',
     ['lasting', 'impact', 'legacy', 'long-term', 'shift'], 'IMPACT'),
]
for _sub_id, _sub_label, _sub_description, _sub_keywords, _sub_parent in _trump_sub_codes:
    trump_frame.add_code(
        _sub_id,
        _sub_label,
        _sub_description,
        keywords=_sub_keywords,
        parent=_sub_parent,
    )

_trump_code_total = len(trump_frame.codes)
print(f"\n✓ Code frame created with {_trump_code_total} codes")
print(f"Hierarchy: {trump_frame.get_hierarchy()}")

### 10.2 Apply Codes and Analyze

In [None]:
def _code_trump_response(response_text):
    """Apply the political code frame to one response, ignoring case."""
    return trump_frame.apply_codes(response_text, case_sensitive=False)


# Code every response and store the result per row.
df_trump['codes'] = df_trump['response'].apply(_code_trump_response)

# Preview the first ten rows with their topic and assigned codes.
_trump_preview_columns = ['response', 'topic', 'codes']
print("\nCoded Responses:")
display(df_trump.loc[:, _trump_preview_columns].head(10))

# Per-code summary table for the frame.
trump_summary = trump_frame.summary()
print("\nCode Summary:")
display(trump_summary)

# Bar chart restricted to codes that matched at least once.
_trump_matched_codes = trump_summary[trump_summary['Count'] > 0]
fig = px.bar(
    _trump_matched_codes,
    title='Political Discourse Code Distribution',
    x='Label',
    y='Count',
    color='Count',
    color_continuous_scale='Reds',
)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()

## 11. Justice System Analysis (Epstein Dataset)

Analyze institutional accountability and justice system issues.

In [None]:
# Read the justice-system responses through the shared loader.
_epstein_csv_path = 'data/epstein_case_responses.csv'
df_epstein = data_loader.load_csv(_epstein_csv_path)

# Quick sanity report: row count, shape and the first rows.
_epstein_row_count = len(df_epstein)
print(f"\nLoaded {_epstein_row_count} responses")
print(f"Data shape: {df_epstein.shape}")
display(df_epstein.head())

### 11.1 Define Justice System Code Frame

In [None]:
# Code frame for justice system analysis.
epstein_frame = CodeFrame(
    name="Justice System Analysis",
    description="Code frame for institutional accountability and justice system issues",
)

# Main categories: (code id, label, description).
_epstein_main_codes = [
    ('INSTITUTIONAL', 'Institutional Issues', 'Institutional failures and oversight'),
    ('LEGAL', 'Legal Process', 'Legal proceedings and justice system'),
    ('SOCIAL', 'Social Impact', 'Social and cultural implications'),
    ('REFORM', 'Reform Needs', 'Calls for reform and change'),
]
for _main_spec in _epstein_main_codes:
    epstein_frame.add_code(*_main_spec)

# Sub-codes: (code id, label, description, keywords, parent code id),
# grouped by parent in the order institutional, legal, social, reform.
_epstein_sub_codes = [
    # Institutional
    ('INST_OVERSIGHT', 'Oversight Failures', 'Failures in institutional oversight',
     ['oversight', 'failures', 'institutional', 'monitoring', 'procedures'], 'INSTITUTIONAL'),
    ('INST_ACCOUNTABILITY', 'Accountability Issues', 'Questions of accountability and responsibility',
     ['accountability', 'responsible', 'connections', 'powerful', 'enablers'], 'INSTITUTIONAL'),
    ('INST_PRIVILEGE', 'Privilege and Power', 'Role of wealth and privilege',
     ['privilege', 'wealth', 'power', 'preferential', 'treatment', 'factor'], 'INSTITUTIONAL'),
    # Legal process
    ('LEG_JUSTICE', 'Justice System', 'Justice system handling and equal treatment',
     ['justice', 'equal', 'treatment', 'handling', 'system'], 'LEGAL'),
    ('LEG_PROCESS', 'Legal Process', 'Plea deals and prosecutorial decisions',
     ['plea', 'deal', 'prosecution', 'legal', 'decisions', 'prior'], 'LEGAL'),
    ('LEG_CUSTODY', 'Custody Issues', 'Death in custody and prison procedures',
     ['custody', 'death', 'prison', 'jail', 'procedures'], 'LEGAL'),
    ('LEG_INVESTIGATION', 'Investigations', 'Federal investigations and inquiries',
     ['investigation', 'federal', 'revealed', 'inquiries', 'pressure'], 'LEGAL'),
    # Social impact
    ('SOC_VICTIMS', 'Victim Advocacy', 'Focus on victims and survivors',
     ['victims', 'survivors', 'believing', 'advocates', 'impact', 'support', 'compensation'], 'SOCIAL'),
    ('SOC_TRUST', 'Public Trust', 'Impact on public trust in institutions',
     ['trust', 'public', 'damaged', 'confidence'], 'SOCIAL'),
    ('SOC_AWARENESS', 'Social Awareness', 'Raising awareness about systemic issues',
     ['trafficking', 'problem', 'societal', 'awareness', 'attention'], 'SOCIAL'),
    ('SOC_MEDIA', 'Media Role', 'Role of journalism and media coverage',
     ['media', 'journalism', 'coverage', 'reporting', 'sensitive'], 'SOCIAL'),
    # Reform
    ('REF_LEGAL', 'Legal Reforms', 'Proposed legal and justice reforms',
     ['reform', 'proposed', 'prevent', 'criminal justice', 'transparency'], 'REFORM'),
    ('REF_PROTECTION', 'Protection Systems', 'Need for better protection of vulnerable individuals',
     ['protection', 'vulnerable', 'inadequate', 'background', 'checks'], 'REFORM'),
]
for _sub_id, _sub_label, _sub_description, _sub_keywords, _sub_parent in _epstein_sub_codes:
    epstein_frame.add_code(
        _sub_id,
        _sub_label,
        _sub_description,
        keywords=_sub_keywords,
        parent=_sub_parent,
    )

_epstein_code_total = len(epstein_frame.codes)
print(f"\n✓ Code frame created with {_epstein_code_total} codes")

### 11.2 Apply Codes and Analyze

In [None]:
def _code_epstein_response(response_text):
    """Apply the justice-system code frame to one response, ignoring case."""
    return epstein_frame.apply_codes(response_text, case_sensitive=False)


# Code every response and store the result per row.
df_epstein['codes'] = df_epstein['response'].apply(_code_epstein_response)

# Preview the first ten rows with their topic and assigned codes.
_epstein_preview_columns = ['response', 'topic', 'codes']
print("\nCoded Responses:")
display(df_epstein.loc[:, _epstein_preview_columns].head(10))

# Per-code summary table for the frame.
epstein_summary = epstein_frame.summary()
print("\nCode Summary:")
display(epstein_summary)

# Bar chart restricted to codes that matched at least once.
_epstein_matched_codes = epstein_summary[epstein_summary['Count'] > 0]
fig = px.bar(
    _epstein_matched_codes,
    title='Justice System Code Distribution',
    x='Label',
    y='Count',
    color='Count',
    color_continuous_scale='Oranges',
)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()

## 12. Cricket Analysis (Sports Dataset)

Analyze cricket perspectives, formats, and cultural impact.

In [None]:
# Read the cricket responses through the shared loader.
_cricket_csv_path = 'data/cricket_responses.csv'
df_cricket = data_loader.load_csv(_cricket_csv_path)

# Quick sanity report: row count, shape and the first rows.
_cricket_row_count = len(df_cricket)
print(f"\nLoaded {_cricket_row_count} responses")
print(f"Data shape: {df_cricket.shape}")
display(df_cricket.head())

### 12.1 Define Cricket Analysis Code Frame

In [None]:
# Build the cricket code frame from declarative tables instead of one
# add_code call per code. Registration order matches the tables:
# main categories first, then each category's sub-codes.
cricket_frame = CodeFrame(
    name="Cricket Analysis",
    description="Code frame for analyzing cricket perspectives and culture"
)

# Main categories: (code id, label, description)
cricket_main_codes = [
    ('FORMATS', 'Cricket Formats', 'Different formats and competitions'),
    ('TECHNICAL', 'Technical Aspects', 'Skills, strategies, and techniques'),
    ('CULTURE', 'Cricket Culture', 'Cultural and social aspects'),
    ('DEVELOPMENT', 'Game Development', 'Evolution and modernization'),
]

# Sub-codes grouped by parent category: (code id, label, description, keywords)
cricket_sub_codes = {
    'FORMATS': [
        ('FMT_TEST', 'Test Cricket', 'Test matches and traditional format',
         ['test', 'purest', 'ashes', 'resilience', 'strategy']),
        ('FMT_T20', 'T20 Format', 'T20 cricket and innovations',
         ['t20', 'ipl', 'big bash', 'accessible', 'entertainment', 'innovations']),
        ('FMT_LEAGUES', 'Leagues and Competitions', 'IPL, county championship, and leagues',
         ['league', 'ipl', 'county', 'championship', 'big bash', 'hundred']),
        ('FMT_TOURNAMENTS', 'Major Tournaments', 'World Cup and major tournaments',
         ['world cup', 'tournament', 'pinnacle']),
    ],
    'TECHNICAL': [
        ('TECH_BOWLING', 'Bowling', 'Fast bowling and spin bowling',
         ['bowling', 'fast', 'spin', 'art form']),
        ('TECH_BATTING', 'Batting', 'Batting techniques and mental aspects',
         ['batting', 'mental', 'willow']),
        ('TECH_STATS', 'Statistics and Analysis', 'Cricket statistics and records',
         ['statistics', 'records', 'depth']),
        ('TECH_COACHING', 'Coaching and Training', 'Modern coaching and scientific approach',
         ['coaching', 'scientific', 'training']),
    ],
    'CULTURE': [
        ('CULT_COMMUNITY', 'Community', 'Community building and social connection',
         ['community', 'together', 'clubs', 'grassroots', 'childhood']),
        ('CULT_RIVALRY', 'Rivalries', 'International rivalries and traditions',
         ['rivalry', 'india', 'pakistan', 'unmatched', 'ashes']),
        ('CULT_EXPERIENCE', 'Fan Experience', 'Spectator experience and atmosphere',
         ['atmosphere', 'watching', 'ground', 'commentary', 'experience', 'sound']),
        ('CULT_SPIRIT', 'Spirit of Cricket', 'Sportsmanship and game spirit',
         ['spirit', 'sledging', 'gamesmanship']),
    ],
    'DEVELOPMENT': [
        ('DEV_WOMEN', "Women's Cricket", "Growth of women's cricket",
         ["women's", 'womens', 'recognition', 'deserved']),
        ('DEV_TECH', 'Technology', 'DRS and technological innovations',
         ['drs', 'technology', 'decision-making', 'accuracy']),
        ('DEV_COMMERCIAL', 'Commercialization', 'Commercial aspects and business',
         ['commercialization', 'commercial', 'revolutionized']),
        ('DEV_GLOBAL', 'Global Reach', 'Expanding cricket globally',
         ['olympics', 'global', 'reach', 'rise', 'bangladesh']),
        ('DEV_INTEGRITY', 'Integrity Issues', 'Match-fixing and integrity',
         ['fixing', 'match-fixing', 'seriously']),
    ],
}

# Parents are registered before any sub-code that references them
for main_id, main_label, main_description in cricket_main_codes:
    cricket_frame.add_code(main_id, main_label, main_description)

for parent_id, sub_code_rows in cricket_sub_codes.items():
    for sub_id, sub_label, sub_description, sub_keywords in sub_code_rows:
        cricket_frame.add_code(
            sub_id,
            sub_label,
            sub_description,
            keywords=sub_keywords,
            parent=parent_id
        )

print(f"\n✓ Code frame created with {len(cricket_frame.codes)} codes")

### 12.2 Apply Codes and Analyze

In [None]:
# Run every response through the cricket code frame
# (matching is requested as case-insensitive).
def _code_cricket_response(text):
    return cricket_frame.apply_codes(text, case_sensitive=False)


df_cricket['codes'] = df_cricket['response'].apply(_code_cricket_response)

# Preview the first ten coded rows
print("\nCoded Responses:")
cricket_preview_columns = ['response', 'topic', 'codes']
display(df_cricket[cricket_preview_columns].head(10))

# Per-code summary table produced by the frame
cricket_summary = cricket_frame.summary()
print("\nCode Summary:")
display(cricket_summary)

# Bar chart restricted to codes that were assigned at least once
cricket_used_codes = cricket_summary[cricket_summary['Count'] > 0]
fig = px.bar(
    cricket_used_codes,
    x='Label',
    y='Count',
    color='Count',
    title='Cricket Analysis Code Distribution',
    color_continuous_scale='Greens'
)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()

## 13. Fashion Industry Analysis (Fashion Dataset)

Analyze fashion trends, sustainability, and consumer attitudes.

In [None]:
# Read the fashion responses from CSV via the shared loader
fashion_csv_path = 'data/fashion_responses.csv'
df_fashion = data_loader.load_csv(fashion_csv_path)

# Quick sanity check: row count, shape, and the first few rows
fashion_row_count = len(df_fashion)
print(f"\nLoaded {fashion_row_count} responses")
print(f"Data shape: {df_fashion.shape}")
display(df_fashion.head())

### 13.1 Define Fashion Industry Code Frame

In [None]:
# Build the fashion code frame from declarative tables instead of one
# add_code call per code. Registration order matches the tables:
# main categories first, then each category's sub-codes.
fashion_frame = CodeFrame(
    name="Fashion Industry Analysis",
    description="Code frame for analyzing fashion trends and consumer attitudes"
)

# Main categories: (code id, label, description)
fashion_main_codes = [
    ('SUSTAINABILITY', 'Sustainability', 'Environmental and ethical concerns'),
    ('CONSUMPTION', 'Consumption Patterns', 'Buying behavior and alternatives'),
    ('IDENTITY', 'Personal Identity', 'Self-expression and style'),
    ('INDUSTRY', 'Fashion Industry', 'Industry practices and trends'),
    ('SOCIAL', 'Social Issues', 'Inclusivity and representation'),
]

# Sub-codes grouped by parent category: (code id, label, description, keywords)
# NOTE(review): very short keywords such as 'set', 'art' and 'form' may
# over-match if apply_codes does substring matching — confirm against CodeFrame.
fashion_sub_codes = {
    'SUSTAINABILITY': [
        ('SUS_ETHICAL', 'Ethical Fashion', 'Sustainable and ethical practices',
         ['sustainable', 'sustainability', 'ethical', 'ethics', 'essential', 'principles']),
        ('SUS_FAST', 'Fast Fashion Critique', 'Criticism of fast fashion',
         ['fast fashion', 'waste', 'environmental']),
        ('SUS_IMPACT', 'Environmental Impact', 'Carbon footprint and environmental concerns',
         ['carbon', 'footprint', 'alarming', 'environmental']),
    ],
    'CONSUMPTION': [
        ('CON_QUALITY', 'Quality Focus', 'Quality over quantity approach',
         ['quality', 'quantity', 'guide', 'purchasing']),
        ('CON_VINTAGE', 'Vintage and Secondhand', 'Vintage clothing and resale',
         ['vintage', 'resale', 'market', 'reshaping']),
        ('CON_RENTAL', 'Alternative Models', 'Rental and sharing services',
         ['rental', 'services', 'alternatives']),
        ('CON_COST', 'Cost Barriers', 'Cost and accessibility issues',
         ['cost', 'prohibitive', 'expensive']),
        ('CON_MINIMALIST', 'Minimalism', 'Minimalist approach to wardrobes',
         ['minimalist', 'wardrobes', 'reduce', 'fatigue']),
    ],
    'IDENTITY': [
        ('ID_PERSONAL', 'Personal Style', 'Individual expression and style',
         ['personal style', 'individuality', 'unique', 'expression', 'expresses']),
        ('ID_CONFIDENCE', 'Self-Care and Wellbeing', 'Fashion as self-care and confidence',
         ['self-care', 'confidence', 'wellbeing', 'boosts']),
        ('ID_COMFORT', 'Comfort Priority', 'Emphasis on comfort',
         ['comfort', 'aesthetics', 'prioritize']),
    ],
    'INDUSTRY': [
        ('IND_LUXURY', 'Luxury Fashion', 'Luxury brands and haute couture',
         ['luxury', 'haute couture', 'craftsmanship', 'extraordinary']),
        ('IND_STREETWEAR', 'Streetwear', 'Streetwear and casual fashion',
         ['streetwear', 'mainstream']),
        ('IND_TRENDS', 'Trends and Shows', 'Fashion weeks and industry trends',
         ['fashion week', 'trends', 'set', 'industry']),
        ('IND_MEDIA', 'Media and Influence', 'Social media and influencers',
         ['social media', 'influencers', 'democratized', 'photography', 'replaced']),
        ('IND_INNOVATION', 'Innovation', 'Digital fashion and new technologies',
         ['digital', 'nft', 'future', 'interesting']),
        ('IND_CRAFT', 'Traditional Crafts', 'Tailoring and traditional skills',
         ['tailoring', 'dying', 'art', 'form', 'craftsmanship']),
    ],
    'SOCIAL': [
        ('SOC_INCLUSIVITY', 'Size Inclusivity', 'Size diversity and inclusivity',
         ['inclusivity', 'size', 'inclusive']),
        ('SOC_BODY', 'Body Positivity', 'Body positivity movement',
         ['body positivity', 'changing', 'advertising']),
        ('SOC_GENDER', 'Gender Neutral', 'Gender-neutral fashion',
         ['gender-neutral', 'breaking', 'boundaries']),
        ('SOC_CULTURE', 'Cultural Issues', 'Cultural appropriation and representation',
         ['cultural', 'appropriation', 'addressing', 'local', 'designers']),
        ('SOC_EDUCATION', 'Education', 'Fashion education and awareness',
         ['education', 'should include', 'history']),
        ('SOC_WORKPLACE', 'Workplace Fashion', 'Dress codes and workplace norms',
         ['workplace', 'dress codes', 'relaxed']),
    ],
}

# Parents are registered before any sub-code that references them
for main_id, main_label, main_description in fashion_main_codes:
    fashion_frame.add_code(main_id, main_label, main_description)

for parent_id, sub_code_rows in fashion_sub_codes.items():
    for sub_id, sub_label, sub_description, sub_keywords in sub_code_rows:
        fashion_frame.add_code(
            sub_id,
            sub_label,
            sub_description,
            keywords=sub_keywords,
            parent=parent_id
        )

print(f"\n✓ Code frame created with {len(fashion_frame.codes)} codes")

### 13.2 Apply Codes and Analyze

In [None]:
# Run every response through the fashion code frame
# (matching is requested as case-insensitive).
def _code_fashion_response(text):
    return fashion_frame.apply_codes(text, case_sensitive=False)


df_fashion['codes'] = df_fashion['response'].apply(_code_fashion_response)

# Preview the first ten coded rows
print("\nCoded Responses:")
fashion_preview_columns = ['response', 'topic', 'codes']
display(df_fashion[fashion_preview_columns].head(10))

# Per-code summary table produced by the frame
fashion_summary = fashion_frame.summary()
print("\nCode Summary:")
display(fashion_summary)

# Bar chart restricted to codes that were assigned at least once
fashion_used_codes = fashion_summary[fashion_summary['Count'] > 0]
fig = px.bar(
    fashion_used_codes,
    x='Label',
    y='Count',
    color='Count',
    title='Fashion Industry Code Distribution',
    color_continuous_scale='Purples'
)
fig.update_layout(xaxis_tickangle=-45, height=500)
fig.show()

## 14. Enhanced Comparative Analysis

Advanced cross-dataset comparison with sophisticated visualizations and insights.

In [None]:
# Registry of every coded dataset: display name, coded DataFrame,
# the code frame applied to it, and a colour-scale name.
datasets_info = [
    {'name': dataset_name, 'df': coded_df, 'frame': code_frame, 'color': palette}
    for dataset_name, coded_df, code_frame, palette in (
        ('Remote Work', df, remote_work_frame, 'Blues'),
        ('Political Leadership', df_trump, trump_frame, 'Reds'),
        ('Justice System', df_epstein, epstein_frame, 'Oranges'),
        ('Cricket', df_cricket, cricket_frame, 'Greens'),
        ('Fashion', df_fashion, fashion_frame, 'Purples'),
    )
]


def _dataset_metrics(info):
    """Build one row of the comparison table for a single dataset entry."""
    code_frame = info['frame']
    # Number of codes attached to each response; every statistic below uses it
    codes_per_response = info['df']['codes'].apply(len)

    total_codes = len(code_frame.codes)
    active_codes = sum(1 for entry in code_frame.codes.values() if entry['count'] > 0)
    total_responses = len(info['df'])

    # Coverage: % of responses with at least one code
    coverage = (codes_per_response > 0).sum() / total_responses * 100

    # Code utilization: % of codes that are actually used
    utilization = (active_codes / total_codes * 100) if total_codes > 0 else 0

    return {
        'Dataset': info['name'],
        'Responses': total_responses,
        'Total Codes': total_codes,
        'Active Codes': active_codes,
        'Avg Codes/Response': codes_per_response.mean(),
        'Max Codes/Response': codes_per_response.max(),
        'Min Codes/Response': codes_per_response.min(),
        'Std Dev': codes_per_response.std(),
        'Coverage %': coverage,
        'Code Utilization %': utilization
    }


# One metrics row per dataset, in registry order
comparative_data = [_dataset_metrics(info) for info in datasets_info]
comp_df = pd.DataFrame(comparative_data)

section_rule = "=" * 80

print("\n" + section_rule)
print("COMPREHENSIVE COMPARATIVE ANALYSIS")
print(section_rule)
display(comp_df)

# Headline figures across all datasets
frame_sizes = comp_df['Total Codes']
coding_density = comp_df['Avg Codes/Response']
largest_frame_name = comp_df.loc[frame_sizes.idxmax(), 'Dataset']
densest_dataset_name = comp_df.loc[coding_density.idxmax(), 'Dataset']

print("\n" + section_rule)
print("STATISTICAL SUMMARY")
print(section_rule)
print(f"\nAverage responses per dataset: {comp_df['Responses'].mean():.1f}")
print(f"Average code frame size: {frame_sizes.mean():.1f} codes")
print(f"Average code utilization: {comp_df['Code Utilization %'].mean():.1f}%")
print(f"Average coverage: {comp_df['Coverage %'].mean():.1f}%")
print(f"\nMost complex code frame: {largest_frame_name} ({frame_sizes.max()} codes)")
print(f"Highest coding density: {densest_dataset_name} ({coding_density.max():.2f} codes/response)")

### 14.1 Code Frame Complexity Comparison

In [None]:
# One bar panel per metric, laid out row-major on a 2x2 grid:
# (panel title, comp_df column, trace name, decimals for bar labels or None for raw values)
metric_panels = [
    ('Code Frame Size', 'Total Codes', 'Total Codes', None),
    ('Coding Density', 'Avg Codes/Response', 'Avg Codes/Response', 2),
    ('Code Utilization Rate', 'Code Utilization %', 'Utilization %', 1),
    ('Coverage Rate', 'Coverage %', 'Coverage %', 1),
]

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=tuple(panel[0] for panel in metric_panels),
    specs=[[{'type': 'bar'} for _ in range(2)] for _ in range(2)]
)

# One colour per dataset; also reused by later plots in this section
colors = ['#3498db', '#e74c3c', '#f39c12', '#2ecc71', '#9b59b6']

for panel_index, (_, metric_column, trace_name, label_decimals) in enumerate(metric_panels):
    grid_row, grid_col = divmod(panel_index, 2)
    metric_values = comp_df[metric_column]
    # Bar labels show the raw value unless a rounding precision is given
    if label_decimals is None:
        bar_labels = metric_values
    else:
        bar_labels = metric_values.round(label_decimals)

    fig.add_trace(
        go.Bar(
            x=comp_df['Dataset'],
            y=metric_values,
            name=trace_name,
            marker_color=colors,
            text=bar_labels,
            textposition='outside'
        ),
        row=grid_row + 1, col=grid_col + 1
    )

fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Comprehensive Code Frame Metrics Comparison"
)
fig.update_xaxes(tickangle=-45)
fig.show()

### 14.2 Multi-Dimensional Radar Comparison

In [None]:
# Radar chart comparing datasets on five metrics, all on a 0-100 scale.
# The two percentage metrics are already 0-100; the three count-based
# metrics are rescaled so the largest dataset value maps to 100.
radar_data = comp_df.copy()

for col in ['Total Codes', 'Active Codes', 'Avg Codes/Response']:
    max_val = radar_data[col].max()
    if max_val > 0:
        # Missing values (e.g. the mean of an empty dataset) are drawn at 0
        radar_data[f'{col}_norm'] = (radar_data[col] / max_val * 100).fillna(0)
    else:
        # Always create the column: when the metric is all zero (or NaN)
        # there is nothing to scale, and skipping it would make the
        # row['..._norm'] lookups below raise KeyError.
        radar_data[f'{col}_norm'] = 0.0

# Create radar chart
fig = go.Figure()

# Plotly breaks lines on <br>; a plain '\n' is not rendered as a line break
categories = ['Code Frame<br>Size', 'Active Codes', 'Coding<br>Density',
              'Utilization<br>Rate', 'Coverage<br>Rate']

# One filled polygon per dataset; values are listed in the same order as `categories`
for _, row in radar_data.iterrows():
    fig.add_trace(go.Scatterpolar(
        r=[
            row['Total Codes_norm'],
            row['Active Codes_norm'],
            row['Avg Codes/Response_norm'],
            row['Code Utilization %'],
            row['Coverage %']
        ],
        theta=categories,
        fill='toself',
        name=row['Dataset']
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 100]
        )),
    showlegend=True,
    title="Multi-Dimensional Dataset Comparison (Normalized to 100)",
    height=600
)
fig.show()

### 14.3 Coding Distribution Analysis

In [None]:
# Codes-per-response for every dataset, computed once and shared by both plots
codes_per_dataset = [
    (info['name'], info['df']['codes'].apply(len))
    for info in datasets_info
]

# Layout settings common to the box plot and the violin plot
distribution_layout = dict(
    yaxis_title="Number of Codes per Response",
    xaxis_title="Dataset",
    height=500,
    showlegend=False
)

# Box plot: one box per dataset
fig = go.Figure()
for position, (dataset_name, code_counts) in enumerate(codes_per_dataset):
    fig.add_trace(go.Box(
        y=code_counts,
        name=dataset_name,
        marker_color=colors[position],
        boxmean='sd'  # Show mean and standard deviation
    ))

fig.update_layout(
    title="Distribution of Codes per Response Across Datasets",
    **distribution_layout
)
fig.show()

# Violin plot for more detailed distribution, with an inner box and mean line
fig = go.Figure()
for position, (dataset_name, code_counts) in enumerate(codes_per_dataset):
    fig.add_trace(go.Violin(
        y=code_counts,
        name=dataset_name,
        marker_color=colors[position],
        box_visible=True,
        meanline_visible=True
    ))

fig.update_layout(
    title="Detailed Distribution Analysis (Violin Plot)",
    **distribution_layout
)
fig.show()

### 14.4 Metric Correlation Analysis

In [None]:
# Pairwise correlation between the dataset-level coding metrics
correlation_cols = ['Total Codes', 'Active Codes', 'Avg Codes/Response',
                   'Coverage %', 'Code Utilization %']
corr_matrix = comp_df[correlation_cols].corr()

# Heatmap on a fixed -1..1 colour scale
fig = px.imshow(
    corr_matrix,
    labels=dict(color="Correlation"),
    x=correlation_cols,
    y=correlation_cols,
    color_continuous_scale='RdBu_r',
    zmin=-1, zmax=1,
    title="Correlation Between Coding Metrics",
    aspect='auto'
)

# Print each coefficient, rounded to two decimals, inside its cell
cell_labels = corr_matrix.round(2).values
fig.update_traces(
    text=cell_labels,
    texttemplate='%{text}'
)

fig.update_layout(height=600)
fig.show()

print("\nKey Correlations:")
print("="*60)
# Visit each unordered pair of metrics once (upper triangle of the matrix)
for i, first_metric in enumerate(correlation_cols):
    for j, second_metric in enumerate(correlation_cols[i + 1:], start=i + 1):
        corr_val = corr_matrix.iloc[i, j]
        # Only report pairs whose absolute correlation exceeds 0.5
        if abs(corr_val) > 0.5:
            direction = "positive" if corr_val > 0 else "negative"
            print(f"{first_metric} <-> {second_metric}: {corr_val:.2f} ({direction})")

### 14.5 Parallel Coordinates Visualization

In [None]:
# Parallel coordinates plot: one line per dataset across the five metrics,
# coloured by average codes per response.
parallel_dims = ['Total Codes', 'Active Codes', 'Avg Codes/Response',
                 'Coverage %', 'Code Utilization %']

fig = px.parallel_coordinates(
    comp_df,
    dimensions=parallel_dims,
    color='Avg Codes/Response',
    labels={"Total Codes": "Total Codes",
           "Active Codes": "Active Codes",
           "Avg Codes/Response": "Avg Codes/Response",
           "Coverage %": "Coverage %",
           "Code Utilization %": "Utilization %"},
    color_continuous_scale='Viridis',
    title="Parallel Coordinates: Multi-Metric Comparison"
)

# Label each line where it meets the first (left-most) axis. Evenly spaced
# labels would have no relation to the lines, so each label is placed at the
# dataset's value on that axis, rescaled to 0-1 paper coordinates.
# NOTE(review): assumes the first axis uses its default data min..max range
# and that the trace fills the plot area — confirm if either is customised.
first_axis = parallel_dims[0]
axis_low = comp_df[first_axis].min()
axis_span = comp_df[first_axis].max() - axis_low

# Datasets sharing the same first-axis value start at the same point, so
# their names are merged into one label instead of being drawn on top of
# each other.
for axis_value, dataset_names in comp_df.groupby(first_axis)['Dataset']:
    # With no spread on the axis there is nothing to scale against; use mid-height
    label_y = (axis_value - axis_low) / axis_span if axis_span > 0 else 0.5
    fig.add_annotation(
        x=-0.05,
        y=float(label_y),
        text=', '.join(dataset_names),
        showarrow=False,
        xref='paper',
        yref='paper',
        xanchor='right'
    )

# Extra left margin so the right-anchored labels are not clipped by the figure edge
fig.update_layout(height=600, margin=dict(l=200))
fig.show()

## 15. Next Steps

### Customization Options:
1. **Modify Code Frames**: Update the code definitions to match your research needs
2. **Refine Themes**: Adjust theme definitions and associated codes
3. **Add Categories**: Create additional hierarchical categories
4. **Load Your Data**: Replace sample data with your actual responses

### Advanced Analysis:
- Intercoder reliability testing
- Temporal analysis of themes
- Demographic comparisons
- Sentiment analysis integration
- Machine learning-assisted coding

### Quality Assurance:
- Run `make test` to execute unit tests
- Run `make lint` to check code quality
- Review coding consistency across responses