# CONFIGURATION

In [2]:
import os
import pandas as pd
from openai import OpenAI
import time
import json
import numpy as np

# --- Configuration ---
CSV_FOLDER = 'videos4/projfiles'
CSV_FILE_MUSIC = os.path.join(CSV_FOLDER, 'input', 'youtube_comments_audio.csv')
CSV_FILE_VIDEO = os.path.join(CSV_FOLDER, 'input', 'youtube_comments_video.csv')
# CSV_FILE_SPORT = os.path.join(CSV_FOLDER, 'input', 'youtube_comments_sport.csv')
CSV_FILE_BETTERVIDEO = os.path.join(CSV_FOLDER, 'input', 'youtube_comments_bettervideo.csv')
CSV_FILE_PATH = os.path.join(CSV_FOLDER, 'youtube_comments.csv')
EMBEDDED_CSV_PATH = os.path.join(CSV_FOLDER, 'youtube_comments_embeded.csv')
CLUSTERED_CSV_PATH = os.path.join(CSV_FOLDER, 'youtube_comments_clustered.csv')
LABELED_CSV_PATH = os.path.join(CSV_FOLDER, 'youtube_comments_labeled.csv')
BATCH_FILE = "batch_input.jsonl"
OPENAI_MODEL = "text-embedding-3-large"
RATE_LIMIT_DELAY = 0.00  # in seconds

# --- OpenAI Client ---
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
def string_to_array(embedding_str):
    """Converts a string representation of an array to a NumPy array."""
    try:
        # Remove brackets and split by comma
        cleaned_str = embedding_str.strip('[]')
        values = cleaned_str.split(',')
        # Convert each value to float
        return np.array([float(val.strip()) for val in values])
    except:
        return None  # Handle potential errors in parsing

In [4]:
# Categories used for the labeling
category_dict = {
    "Musical Performance": 1,
    "Vocal Performance": 2,
    "Music Quality/Production": 3,

    "Visual Performance": 4,
    "Stage Presence": 5,
    "Gestures": 6,
    "Movement/Choreography": 7,

    "Technical Production": 8,
    "Video Quality": 9,
    "Camera Work": 10,
    "Lighting/Light design": 11,

    "Overall Impression - Positive": 12,
    "Overall Impression - Negative": 13,
    "Overall Impression - Neutral": 14,

    "Visual elements described using musical terms": 15,
    "Musical elements described using visual terms": 16,

    "Engagement/Connection": 17,
    "Personal Connection": 18,
    "Artist Connection": 19,

    "Olympic games": 20,

    "Implicit Visual Influence": 21,

    "Miscellaneous/Off-Topic": 22
}

# Preprocess input

In [5]:
import csv

In [6]:
df_music = pd.read_csv(CSV_FILE_MUSIC, sep='\t', quoting=csv.QUOTE_ALL, quotechar='"', encoding='utf-8')
df_video = pd.read_csv(CSV_FILE_VIDEO, sep='\t', quoting=csv.QUOTE_ALL, quotechar='"', encoding='utf-8')
#df_sport = pd.read_csv(CSV_FILE_SPORT, sep='\t', quoting=csv.QUOTE_ALL, quotechar='"', encoding='utf-8')
df_bettervideo = pd.read_csv(CSV_FILE_BETTERVIDEO, sep='\t', quoting=csv.QUOTE_ALL, quotechar='"', encoding='utf-8')

In [7]:
df_music['source_id'] = 1
df_video['source_id'] = 2
# df_sport['source_id'] = 2
df_bettervideo['source_id'] = 3

In [8]:
df = pd.concat([df_music, df_video, df_bettervideo], ignore_index=True) #add df_sport if needed

In [9]:
pd.DataFrame.to_csv(df, CSV_FILE_PATH)
del df_music
del df_video
# del df_sport
del df_bettervideo

# Labeling

In [10]:
df = pd.read_csv(CSV_FILE_PATH)

  df = pd.read_csv(CSV_FILE_PATH)


In [11]:
system_message = """You are a comment classifier. Your task is to categorize comments based on a predefined set of categories and return their numerical IDs as a Python list. You must ONLY output a list of integers, and nothing else. Do not include any category names or other text in your response."""

user_template = """
Here are the categories and their IDs:
{category_dict}

Please read the comment below and identify all relevant categories.  Return ONLY the numerical IDs of the relevant categories as a Python list.

**Instructions:**

* **Identify Relevant Categories:** Determine which categories from the list above apply to the comment.
* **Output Format:** Return ONLY a Python list containing the numerical IDs of the relevant categories. The list should look like this: `[ID1, ID2, ID3]` (e.g., `[1, 8]` or `[3]` or `[]`).
* **Multiple Categories:** If multiple categories are relevant, include all their IDs in the list, separated by commas within the brackets.
* **No Relevant Categories:** If no categories are relevant, return an empty list `[]`.
* **Numerical IDs ONLY:**  Do NOT include category names or any other text in your response. Just the list of numerical IDs.

**Example Outputs:**
* For a comment belonging to categories 1 and 8: `[1, 8]`
* For a comment belonging to category 3: `[3]`
* For a comment belonging to no categories: `[]`

**Comment Text:**
{comment_text}

Category(ies) ID:
"""

In [12]:
def create_batch_input_files(df, category_dict, user_template, max_comments_per_batch=15000):
    """Creates multiple JSONL files for Batch API input with comment limit per batch."""
    total_rows = len(df)
    batch_files = []
    
    # Calculate number of batches needed based on max comments per batch
    num_batches = (total_rows + max_comments_per_batch - 1) // max_comments_per_batch
    
    for batch_num in range(num_batches):
        start_idx = batch_num * max_comments_per_batch
        end_idx = min((batch_num + 1) * max_comments_per_batch, total_rows)
        batch_filename = f"batch_input_{batch_num + 1}.jsonl"
        
        # Get batch subset of DataFrame
        batch_df = df.iloc[start_idx:end_idx]
        
        print(f"Creating batch {batch_num + 1}/{num_batches} with {len(batch_df)} comments")
        
        with open(batch_filename, 'w') as f:
            for index, row in batch_df.iterrows():
                user_message = user_template.format(
                    comment_text=row['comment_text'], 
                    category_dict=json.dumps(category_dict)
                )
                request_data = {
                    "custom_id": str(index),
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o-mini",
                        "messages": [
                            {"role": "system", "content": system_message},
                            {"role": "user", "content": user_message}
                        ],
                        "temperature": 0.3,
                        "seed": 42
                    }
                }
                f.write(json.dumps(request_data) + '\n')
        
        batch_files.append(batch_filename)
        print(f"Created batch file: {batch_filename}")
    
    return batch_files

def process_batch_results(output_file_path):
    """Process the batch results from the output file and return as a dictionary."""
    results = {}
    
    with open(output_file_path, 'r') as f:
        for line in f:
            response_data = json.loads(line)
            custom_id = response_data.get('custom_id')
            
            try:
                # Navigate through the response structure
                if ('response' in response_data and 
                    'body' in response_data['response']):
                    
                    # Parse the body as JSON
                    body_str = response_data['response']['body']
                    if isinstance(body_str, str):
                        body = json.loads(body_str)
                    else:
                        body = body_str
                    
                    # Extract the content from the assistant's message
                    if ('choices' in body and 
                        len(body['choices']) > 0 and 
                        'message' in body['choices'][0] and
                        'content' in body['choices'][0]['message']):
                        
                        content = body['choices'][0]['message']['content']
                        
                        # Parse the content as a JSON array
                        try:
                            labels = json.loads(content)
                            results[custom_id] = labels
                        except json.JSONDecodeError:
                            print(f"Warning: Could not parse content as JSON for ID {custom_id}: {content}")
                            # If parsing fails, store the raw content
                            results[custom_id] = content
            except Exception as e:
                print(f"Error processing result for ID {custom_id}: {str(e)}")
    
    print(f"Processed {len(results)} results from batch")
    return results

def process_batch_with_retries(client, batch_file, max_retries=3, retry_delay=60):
    """Process a single batch file with retries and better error handling."""
    print(f"Processing batch file: {batch_file}")
    
    for attempt in range(max_retries):
        try:
            # Upload batch file
            with open(batch_file, "rb") as f:
                response = client.files.create(file=f, purpose="batch")
            input_file_id = response.id
            
            # Create batch
            batch_response = client.batches.create(
                input_file_id=input_file_id,
                endpoint="/v1/chat/completions",
                completion_window="24h"
            )
            batch_id = batch_response.id
            print(f"Batch created with ID: {batch_id}")
            
            # Monitor batch status
            while True:
                batch_status = client.batches.retrieve(batch_id)
                print(f"Batch status: {batch_status.status}")
                
                if batch_status.status == "completed":
                    output_file_id = batch_status.output_file_id
                    output_file_response = client.files.content(output_file_id)
                    output_file_path = f"batch_output_{batch_id}.jsonl"
                    
                    with open(output_file_path, 'w') as f:
                        f.write(output_file_response.text)
                    
                    return process_batch_results(output_file_path)
                
                elif batch_status.status in ["failed", "expired", "cancelled"]:
                    print(f"Batch failed with status: {batch_status.status}")
                    if batch_status.error_file_id:
                        error_file_response = client.files.content(batch_status.error_file_id)
                        print("Error details:", error_file_response.text)
                    break
                
                time.sleep(60)  # Check every minute
                
        except Exception as e:
            print(f"Attempt {attempt + 1} failed with error: {str(e)}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Max retries reached. Moving to next batch.")
                return {}
    
    return {}

def process_all_batches(df, category_dict, user_template, client):
    """Process all batches with progress tracking and error handling."""
    print("Starting batch processing...")
    
    # Create batch files
    batch_files = create_batch_input_files(df, category_dict, user_template)
    all_results = {}
    processed_batches = 0
    total_batches = len(batch_files)
    
    # Process each batch
    for batch_file in batch_files:
        processed_batches += 1
        print(f"\nProcessing batch {processed_batches}/{total_batches}")
        
        batch_results = process_batch_with_retries(client, batch_file)
        all_results.update(batch_results)
        
        # Save intermediate results
        intermediate_df = df.copy()
        intermediate_df['comment_labels'] = intermediate_df.index.map(lambda x: all_results.get(str(x), []))
        intermediate_df.to_csv(f"intermediate_results_{processed_batches}.csv")
        print(f"Saved intermediate results to: intermediate_results_{processed_batches}.csv")
    
    return all_results

In [13]:
#batch_results = process_batch_results('/data/notebook_files/projfiles/input/batch_67b701252b4c8190b77ec9e319c0b008_output.jsonl')

all_results = process_all_batches(df, category_dict, user_template, client)

# Update DataFrame with final results
df['comment_labels'] = df.index.map(lambda x: all_results.get(str(x), []))

Starting batch processing...
Creating batch 1/10 with 15000 comments
Created batch file: batch_input_1.jsonl
Creating batch 2/10 with 15000 comments
Created batch file: batch_input_2.jsonl
Creating batch 3/10 with 15000 comments
Created batch file: batch_input_3.jsonl
Creating batch 4/10 with 15000 comments
Created batch file: batch_input_4.jsonl
Creating batch 5/10 with 15000 comments
Created batch file: batch_input_5.jsonl
Creating batch 6/10 with 15000 comments
Created batch file: batch_input_6.jsonl
Creating batch 7/10 with 15000 comments
Created batch file: batch_input_7.jsonl
Creating batch 8/10 with 15000 comments
Created batch file: batch_input_8.jsonl
Creating batch 9/10 with 15000 comments
Created batch file: batch_input_9.jsonl
Creating batch 10/10 with 9444 comments
Created batch file: batch_input_10.jsonl

Processing batch 1/10
Processing batch file: batch_input_1.jsonl
Batch created with ID: batch_67d6ad3b09c0819090f0c8ec55bb6d1b
Batch status: validating
Batch status: in_

In [14]:
pd.DataFrame.to_csv(df, LABELED_CSV_PATH)
print("Labeling completed and saved to:", LABELED_CSV_PATH)

Labeling completed and saved to: videos4/projfiles/youtube_comments_labeled.csv


# combine results

In [31]:
def combine_intermediate_results():
    """
    Combines all intermediate results files into a single DataFrame.
    Assumes files are named 'intermediate_results_X.csv' where X is the batch number.
    """
    import glob
    
    # Get all intermediate results files
    intermediate_files = glob.glob('intermediate_results_*.csv')
    
    if not intermediate_files:
        print("No intermediate results files found")
        return None
        
    print(f"Found {len(intermediate_files)} intermediate files")
    
    # Read and combine all files
    dfs = []
    for file in sorted(intermediate_files):
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            print(f"Loaded {file}")
        except Exception as e:
            print(f"Error loading {file}: {str(e)}")
    
    if not dfs:
        print("No valid DataFrames to combine")
        return None
        
    # Combine all DataFrames
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Remove duplicates if any
    combined_df = combined_df.drop_duplicates()
    
    print(f"Combined DataFrame shape: {combined_df.shape}")
    
    # Save combined results
    combined_df.to_csv(LABELED_CSV_PATH, index=False)
    print(f"Saved combined results to: {LABELED_CSV_PATH}")
    
    return combined_df

# After processing all batches, combine the results:
df = combine_intermediate_results()

Found 10 intermediate files


  df = pd.read_csv(file)


Loaded intermediate_results_1.csv


  df = pd.read_csv(file)


Loaded intermediate_results_10.csv


  df = pd.read_csv(file)


Loaded intermediate_results_2.csv


  df = pd.read_csv(file)


Loaded intermediate_results_3.csv


  df = pd.read_csv(file)


Loaded intermediate_results_4.csv


  df = pd.read_csv(file)


Loaded intermediate_results_5.csv


  df = pd.read_csv(file)


Loaded intermediate_results_6.csv


  df = pd.read_csv(file)


Loaded intermediate_results_7.csv


  df = pd.read_csv(file)


Loaded intermediate_results_8.csv


  df = pd.read_csv(file)


Loaded intermediate_results_9.csv
Combined DataFrame shape: (152677, 11)
Saved combined results to: videos4/projfiles/youtube_comments_labeled.csv


# tags_process

In [4]:
df = pd.read_csv(LABELED_CSV_PATH)
df = df.dropna(subset=['source_id'])  # Drop rows where source_id is NaN
df['source_id'] = df['source_id'].astype(int)  # Cast to integer
import ast  # import ast module for literal evaluation of strings

In [56]:
def add_category_indicators(df, category_dict):
    """
    Adds binary indicator columns (1/0) for each category to the input DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame with 'comment_labels' column
        category_dict (dict): Dictionary defining categories and their codes

    Returns:
        pd.DataFrame: Original DataFrame with additional binary category columns
    """
    # Create new columns for each category, initialized with 0
    for category_name in category_dict.keys():
        df[f'has_{category_name}'] = 0
    
    # Process each row
    for idx, row in df.iterrows():
        labels_str = row['comment_labels']
        labels_list = []
        
        # Parse the labels
        if isinstance(labels_str, str):
            try:
                labels_list = ast.literal_eval(labels_str)
                if not isinstance(labels_list, list):
                    labels_list = []
            except (ValueError, SyntaxError):
                print(f"Warning: Could not parse comment_labels string: '{labels_str}'. Treating as no labels.")
                labels_list = []
        elif isinstance(labels_str, list):
            labels_list = labels_str
            
        # Set indicators for present categories
        if isinstance(labels_list, list):
            for label in labels_list:
                for cat_name, cat_code in category_dict.items():
                    if label == cat_code:
                        df.at[idx, f'has_{cat_name}'] = 1
                        
    return df

In [57]:
# Add binary category indicators
df = add_category_indicators(df, category_dict)
df.to_csv(LABELED_CSV_PATH, index=False)

# CHI2

In [15]:
from scipy.stats import chi2_contingency
from ast import literal_eval  # converting strings to lists
from collections import defaultdict

In [5]:
df = pd.read_csv(LABELED_CSV_PATH)

In [17]:
def perform_chi_square_test(df, category_dict):
    """
    Performs Chi-square test of independence and calculates Cramér's V for each category across different video sources.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source_id' and binary category columns (has_*)
        category_dict (dict): Dictionary defining categories and their codes
    """
    chi_square_results = {}
    
    # Test each category
    for category_name in category_dict.keys():
        column_name = f'has_{category_name}'
        if column_name not in df.columns:
            continue
            
        # Create contingency table
        contingency_table = pd.crosstab(df['source_id'], df[column_name])
        
        # Perform Chi-square test
        try:
            chi2, p_value, dof, expected = chi2_contingency(contingency_table)
            
            # Calculate Cramér's V
            n = contingency_table.sum().sum()  # total observations
            min_dim = min(contingency_table.shape[0] - 1, contingency_table.shape[1] - 1)
            cramers_v = np.sqrt(chi2 / (n * min_dim))
            
            chi_square_results[category_name] = {
                'chi2_statistic': chi2,
                'p_value': p_value,
                'degrees_of_freedom': dof,
                'cramers_v': cramers_v,
                'contingency_table': contingency_table
            }
        except ValueError as e:
            chi_square_results[category_name] = {
                'chi2_statistic': np.nan,
                'p_value': np.nan,
                'cramers_v': np.nan,
                'error': str(e),
                'contingency_table': contingency_table
            }
    
    # Print results
    print("Chi-square Test Results for Category Distribution across Video Types:\n")
    for category, result in chi_square_results.items():
        print(f"Category: {category}")
        if 'error' in result:
            print(f"  Error during Chi-square test: {result['error']}")
        else:
            print(f"  Chi-square statistic: {result['chi2_statistic']:.4f}")
            print(f"  P-value: {result['p_value']:.4f}")
            print(f"  Degrees of freedom: {result['degrees_of_freedom']}")
            print(f"  Cramér's V: {result['cramers_v']:.4f}")
            
            # Interpret Cramér's V
            if result['cramers_v'] < 0.1:
                strength = "negligible"
            elif result['cramers_v'] < 0.2:
                strength = "weak"
            elif result['cramers_v'] < 0.4:
                strength = "moderate"
            else:
                strength = "strong"
            print(f"  Association strength: {strength}")
            
            if result['p_value'] < 0.05:
                print(f"  **Statistically Significant (p < 0.05)**: Reject Null Hypothesis. "
                      f"Distribution of '{category}' comments is dependent on video types.")
            else:
                print(f"  Not Statistically Significant (p >= 0.05): Fail to Reject Null Hypothesis. "
                      f"No strong evidence that distribution of '{category}' comments depends on video types.")
            
            print("\n  Contingency Table:")
            print("    0 = Absent, 1 = Present")
            print(result['contingency_table'])
            print("-" * 50)
    
    return chi_square_results

In [18]:
results = perform_chi_square_test(df, category_dict)

Chi-square Test Results for Category Distribution across Video Types:

Category: Musical Performance
  Chi-square statistic: 40.3837
  P-value: 0.0000
  Degrees of freedom: 2
  Cramér's V: 0.0293
  Association strength: negligible
  **Statistically Significant (p < 0.05)**: Reject Null Hypothesis. Distribution of 'Musical Performance' comments is dependent on video types.

  Contingency Table:
    0 = Absent, 1 = Present
has_Musical Performance      0    1
source_id                          
1                        13351   94
2                        16521  231
3                        16687  242
--------------------------------------------------
Category: Vocal Performance
  Chi-square statistic: 212.2379
  P-value: 0.0000
  Degrees of freedom: 2
  Cramér's V: 0.0671
  Association strength: negligible
  **Statistically Significant (p < 0.05)**: Reject Null Hypothesis. Distribution of 'Vocal Performance' comments is dependent on video types.

  Contingency Table:
    0 = Absent, 1 = P

# Kruskal–Wallis test

In [19]:
from scipy.stats import kruskal

In [20]:
df = pd.read_csv(LABELED_CSV_PATH)

In [23]:
from collections import defaultdict
import pandas as pd
from scipy.stats import kruskal
import numpy as np

def perform_kruskal_wallis_test(df, category_dict):
    """
    Performs Kruskal-Wallis H-test for each category across different video sources and calculates effect size (Epsilon-squared).

    Args:
        df (pd.DataFrame): DataFrame with 'source_id' and binary category columns (has_*)
        category_dict (dict): Dictionary defining categories and their codes
    """
    kruskal_results = {}

    # Test each category
    for category_name in category_dict.keys():
        column_name = f'has_{category_name}'
        if column_name not in df.columns:
            continue

        # Group data by source_id
        category_presence_by_source = defaultdict(list)
        for source in sorted(df['source_id'].unique()):
            source_data = df[df['source_id'] == source][column_name].tolist()
            category_presence_by_source[source] = source_data

        # Perform Kruskal-Wallis test
        try:
            h_statistic, p_value = kruskal(*[category_presence_by_source[s] for s in sorted(df['source_id'].unique())])

            # Calculate Effect Size (Epsilon-squared)
            k = len(category_presence_by_source) # Number of groups (sources)
            N = sum(len(data) for data in category_presence_by_source.values()) # Total sample size
            epsilon_squared = (h_statistic - (k - 1)) / (N - k) if (N - k) != 0 else np.nan # Handle potential division by zero


            kruskal_results[category_name] = {
                'H_statistic': h_statistic,
                'p_value': p_value,
                'epsilon_squared': epsilon_squared, # Added effect size
                'groups_data': category_presence_by_source
            }
        except ValueError as e:
            kruskal_results[category_name] = {
                'H_statistic': np.nan,
                'p_value': np.nan,
                'epsilon_squared': np.nan, # Effect size also NaN in case of error
                'error': str(e),
                'groups_data': category_presence_by_source
            }

    # Print results
    print("Kruskal-Wallis Test Results for Category Distribution across Video Types:\n")
    for category, result in kruskal_results.items():
        print(f"Category: {category}")
        if 'error' in result:
            print(f"  Error during Kruskal-Wallis test: {result['error']}")
        else:
            print(f"  Kruskal-Wallis H-statistic: {result['H_statistic']:.4f}")
            print(f"  P-value: {result['p_value']:.4f}")
            print(f"  Epsilon-squared Effect Size: {result['epsilon_squared']:.4f}") # Print effect size

            # Interpret effect size (rough guidelines)
            effect_size_interpretation = "No effect size calculated"
            if not np.isnan(result['epsilon_squared']):
                epsilon_sq = result['epsilon_squared']
                if epsilon_sq < 0.01:
                    effect_size_interpretation = "Very small effect"
                elif epsilon_sq < 0.06:
                    effect_size_interpretation = "Small effect"
                elif epsilon_sq < 0.14:
                    effect_size_interpretation = "Medium effect"
                elif epsilon_sq >= 0.14:
                    effect_size_interpretation = "Large effect"
                else:
                    effect_size_interpretation = "No effect size calculated" # Should not reach here, but for safety

            print(f"  Effect Size Interpretation: {effect_size_interpretation}")


            if result['p_value'] < 0.05:
                print(f"  **Statistically Significant (p < 0.05)**: Reject Null Hypothesis. "
                      f"Distribution of '{category}' comments differs across video types.")
            else:
                print(f"  Not Statistically Significant (p >= 0.05): Fail to Reject Null Hypothesis. "
                      f"No strong evidence that distribution of '{category}' comments differs across video types.")

            print("\n  Category Presence Indicators by Source:")
            for source_id, data in result['groups_data'].items():
                print(f"    Source {source_id}:  Present Count = {sum(data)}, "
                      f"Absent Count = {len(data) - sum(data)}, Total = {len(data)}")
            print("-" * 50)

    return kruskal_results

In [24]:
results = perform_kruskal_wallis_test(df, category_dict)

Kruskal-Wallis Test Results for Category Distribution across Video Types:

Category: Musical Performance
  Kruskal-Wallis H-statistic: 40.3828
  P-value: 0.0000
  Epsilon-squared Effect Size: 0.0008
  Effect Size Interpretation: Very small effect
  **Statistically Significant (p < 0.05)**: Reject Null Hypothesis. Distribution of 'Musical Performance' comments differs across video types.

  Category Presence Indicators by Source:
    Source 1:  Present Count = 94, Absent Count = 13351, Total = 13445
    Source 2:  Present Count = 231, Absent Count = 16521, Total = 16752
    Source 3:  Present Count = 242, Absent Count = 16687, Total = 16929
--------------------------------------------------
Category: Vocal Performance
  Kruskal-Wallis H-statistic: 212.2334
  P-value: 0.0000
  Epsilon-squared Effect Size: 0.0045
  Effect Size Interpretation: Very small effect
  **Statistically Significant (p < 0.05)**: Reject Null Hypothesis. Distribution of 'Vocal Performance' comments differs across vi

In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import linalg

def perform_correspondence_analysis(df, category_dict):
    """
    Performs Correspondence Analysis on the contingency tables of video sources and categories.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source_id' and binary category columns
        category_dict (dict): Dictionary defining categories and their codes
    """
    # Create contingency table for all categories
    categories = []
    source_ids = sorted(df['source_id'].unique())
    
    # Initialize the full contingency table
    full_contingency = []
    
    for category_name in category_dict.keys():
        column_name = f'has_{category_name}'
        if column_name not in df.columns:
            continue
            
        # Create contingency table for this category
        contingency = pd.crosstab(df['source_id'], df[column_name])
        # We only want the "present" column (1)
        if 1 in contingency.columns:
            full_contingency.append(contingency[1].values)
            categories.append(category_name)
    
    # Convert to numpy array and transpose so categories are columns
    N = np.array(full_contingency).T
    
    # Calculate row and column sums
    row_sums = N.sum(axis=1)
    col_sums = N.sum(axis=0)
    total = N.sum()
    
    # Calculate expected frequencies
    E = np.outer(row_sums, col_sums) / total
    
    # Calculate the correspondence matrix
    P = N / total
    row_probs = row_sums / total
    col_probs = col_sums / total
    
    # Calculate the standardized residuals
    S = (P - np.outer(row_probs, col_probs)) / np.sqrt(np.outer(row_probs, col_probs))
    
    # Singular Value Decomposition
    row_weights = np.diag(1 / np.sqrt(row_probs))
    col_weights = np.diag(1 / np.sqrt(col_probs))
    weighted_S = row_weights @ S @ col_weights
    U, sigma, Vt = linalg.svd(weighted_S, full_matrices=False)
    
    # Calculate principal coordinates
    row_coords = row_weights @ U @ np.diag(sigma)
    col_coords = col_weights @ Vt.T @ np.diag(sigma)
    
    # Calculate explained variance
    total_inertia = np.sum(sigma ** 2)
    explained_variance = (sigma ** 2) / total_inertia
    
    # Create Plotly figure
    fig = go.Figure()
    
    # Add video sources (rows)
    fig.add_trace(go.Scatter(
        x=row_coords[:, 0],
        y=row_coords[:, 1],
        mode='markers+text',
        name='Video Sources',
        text=[f'Source {source}' for source in source_ids],
        textposition="top center",
        marker=dict(size=10, color='blue'),
        hovertemplate='<b>Source %{text}</b><br>' +
                      'Dimension 1: %{x:.3f}<br>' +
                      'Dimension 2: %{y:.3f}<extra></extra>'
    ))
    
    # Add categories (columns)
    fig.add_trace(go.Scatter(
        x=col_coords[:, 0],
        y=col_coords[:, 1],
        mode='markers+text',
        name='Categories',
        text=categories,
        textposition="top center",
        marker=dict(size=10, color='red'),
        hovertemplate='<b>%{text}</b><br>' +
                      'Dimension 1: %{x:.3f}<br>' +
                      'Dimension 2: %{y:.3f}<extra></extra>'
    ))
    
    # Update layout
    fig.update_layout(
        title='Correspondence Analysis: Video Sources and Categories',
        xaxis_title=f'Dimension 1 ({explained_variance[0]:.1%} explained variance)',
        yaxis_title=f'Dimension 2 ({explained_variance[1]:.1%} explained variance)',
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        template='presentation',
        shapes=[
            # Add zero lines
            dict(type="line", x0=-max(abs(row_coords[:, 0].max()), abs(col_coords[:, 0].max())), 
                 y0=0, x1=max(abs(row_coords[:, 0].max()), abs(col_coords[:, 0].max())), 
                 y1=0, line=dict(color="black", width=1, dash="dash")),
            dict(type="line", x0=0, 
                 y0=-max(abs(row_coords[:, 1].max()), abs(col_coords[:, 1].max())), 
                 x1=0, y1=max(abs(row_coords[:, 1].max()), abs(col_coords[:, 1].max())), 
                 line=dict(color="black", width=1, dash="dash"))
        ]
    )
    
    # Show the plot
    fig.show()
    
    # Print additional information
    print("\nCorrespondence Analysis Results:")
    print(f"Total Inertia: {total_inertia:.4f}")
    print("\nExplained Variance by Dimension:")
    for i, var in enumerate(explained_variance):
        print(f"Dimension {i+1}: {var:.1%}")
    
    return {
        'row_coords': row_coords,
        'col_coords': col_coords,
        'explained_variance': explained_variance,
        'total_inertia': total_inertia,
        'categories': categories,
        'source_ids': source_ids,
        'figure': fig  # Return the figure object for further customization if needed
    }

# Run the correspondence analysis
ca_results = perform_correspondence_analysis(df, category_dict)


Correspondence Analysis Results:
Total Inertia: 23.0655

Explained Variance by Dimension:
Dimension 1: 57.8%
Dimension 2: 42.2%
Dimension 3: 0.0%


In [11]:
def create_standardized_residuals_heatmap(df, category_dict):
    """
    Creates a heatmap of standardized residuals for the contingency tables.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source_id' and binary category columns
        category_dict (dict): Dictionary defining categories and their codes
    """
    # Create contingency tables and calculate standardized residuals
    categories = []
    source_ids = sorted(df['source_id'].unique())
    residuals_matrix = []
    
    for category_name in category_dict.keys():
        column_name = f'has_{category_name}'
        if column_name not in df.columns:
            continue
            
        # Create contingency table for this category
        contingency = pd.crosstab(df['source_id'], df[column_name])
        categories.append(category_name)
        
        # Calculate expected frequencies and standardized residuals
        row_sums = contingency.sum(axis=1)
        col_sums = contingency.sum(axis=0)
        total = contingency.sum().sum()
        
        # We only want the "present" column (1)
        if 1 in contingency.columns:
            observed = contingency[1].values
            expected = (row_sums * col_sums[1]) / total
            
            # Calculate standardized residuals
            std_residuals = (observed - expected) / np.sqrt(expected * (1 - col_sums[1]/total) * (1 - row_sums/total))
            residuals_matrix.append(std_residuals)
    
    # Convert to numpy array and transpose
    residuals_matrix = np.array(residuals_matrix).T
    
    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=residuals_matrix,
        x=categories,
        y=[f'Source {id}' for id in source_ids],
        colorscale='RdBu',
        zmid=0,  # Center the colorscale at 0
        text=np.round(residuals_matrix, 2),  # Show values in cells
        texttemplate='%{text}',
        textfont={"size": 10},
        hoverongaps=False,
        hovertemplate='Source: %{y}<br>Category: %{x}<br>Std. Residual: %{z:.2f}<extra></extra>'
    ))
    
    # Update layout
    fig.update_layout(
        title='Standardized Residuals Heatmap: Video Sources vs Categories',
        xaxis_title='Categories',
        yaxis_title='Video Sources',
        width=1200,
        height=400,
        xaxis={'tickangle': 45},  # Rotate category labels for better readability
    )
    
    # Show the plot
    fig.show()
    
    # Print interpretation guide
    print("\nInterpretation Guide for Standardized Residuals:")
    print("* Values > 1.96 or < -1.96 are significant at p < 0.05 level")
    print("* Values > 2.58 or < -2.58 are significant at p < 0.01 level")
    print("* Positive values (red): More observations than expected")
    print("* Negative values (blue): Fewer observations than expected")
    print("* Values near 0 (white): Observed frequencies close to expected")

# Run the analysis
create_standardized_residuals_heatmap(df, category_dict)


Interpretation Guide for Standardized Residuals:
* Values > 1.96 or < -1.96 are significant at p < 0.05 level
* Values > 2.58 or < -2.58 are significant at p < 0.01 level
* Positive values (red): More observations than expected
* Negative values (blue): Fewer observations than expected
* Values near 0 (white): Observed frequencies close to expected
