In [None]:
# prompt: mount google drive  folder named 'mmCultural'

from google.colab import drive
# Mount Google Drive at /content/drive (google.colab is only importable inside Colab).
drive.mount('/content/drive')

# Navigate to the specific folder within your Google Drive
# NOTE: `%cd` is an IPython magic, so this cell only runs inside a notebook.
# The relative paths used later in this notebook ('./inference_results',
# './cultural_values') are resolved against this directory.
%cd /content/drive/My Drive/mmCultural

In [None]:
!pip install evaluate bert_score

In [None]:
import json
import pandas as pd
import numpy as np
from scipy.stats import kendalltau
import sys
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# Add the cultural values folder to path
sys.path.append('./cultural_values')
import utils

# File paths (relative to the current working directory)
INFERENCE_RESULTS_DIR = './inference_results'  # directory the per-model JSONL response files are read from
CULTURAL_VALUES_DIR = './cultural_values'  # directory the cultural-distance CSV files are looked up in

# Model files
# One JSONL response file per model, joined onto INFERENCE_RESULTS_DIR when
# loaded. The model name shown in the results is this file name with the
# 'responses_' and '.jsonl' substrings removed.
MODEL_FILES = [
    'responses_gemma_3_4b_it.jsonl',
    'responses_gemma_3_12b_it_qat.jsonl',
    'responses_internvl3_8b.jsonl',
    'responses_qwen_qwen2.5_vl_7b.jsonl',
    'responses_smolvlm2_2.2b_instruct.jsonl'
]

def load_jsonl_responses(file_path):
    """Load response records from a JSONL file.

    Blank (or whitespace-only) lines are skipped silently. Lines that are
    not valid JSON are skipped as well, but they are counted and reported
    so that a truncated or corrupted results file does not go unnoticed.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        List of the decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    responses = []
    malformed_line_numbers = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Empty separators / trailing newline are not errors.
                continue
            try:
                responses.append(json.loads(line))
            except json.JSONDecodeError:
                malformed_line_numbers.append(line_number)

    if malformed_line_numbers:
        # Report instead of silently dropping data; cap the list so a badly
        # corrupted file does not flood the output.
        shown = ', '.join(str(n) for n in malformed_line_numbers[:10])
        more = ', ...' if len(malformed_line_numbers) > 10 else ''
        print(f"Warning: skipped {len(malformed_line_numbers)} malformed line(s) "
              f"in {file_path} (line numbers: {shown}{more})")

    return responses

def convert_to_utils_format(responses, single_response_mode=True):
    """
    Regroup flat JSONL records into the nested layout passed to utils.compute_similarity

    Resulting layout: data[topic][concept][identity][template] = [responses...]
    The concept name doubles as the topic name, and every response text is
    lower-cased before being stored.

    Args:
        responses: List of dicts, each with 'concept', 'identity' and 'response' keys
        single_response_mode: If True, all identities of a concept share the
                             template key "story_about_<concept>".
                             If False, the template key is
                             "Write about <concept> for <identity>".

    Returns:
        The nested dict described above.
    """
    nested = {}

    for record in responses:
        concept, identity = record['concept'], record['identity']
        text = record['response']

        # NOTE(review): in multi-response mode the template key contains the
        # identity, so two identities never share a template key -- confirm
        # this is what utils.compute_similarity expects.
        template = (
            f"story_about_{concept}"
            if single_response_mode
            else f"Write about {concept} for {identity}"
        )

        # The concept is used for both the topic level and the concept level.
        by_identity = nested.setdefault(concept, {}).setdefault(concept, {})
        by_template = by_identity.setdefault(identity, {})
        by_template.setdefault(template, []).append(text.lower())

    return nested

def _similarity_for_pair(responses1, responses2, metric, single_response_mode, pair_label):
    """Score one identity pair, falling back when the metric gives no usable value."""
    similarity = utils.compute_similarity(responses1, responses2, metric)

    if similarity is not None and similarity != -1:
        print(f"    Similarity {pair_label}: {similarity:.4f}")
        return similarity

    # No common templates or comparison failed
    print(f"    No common templates for {pair_label}")
    if not single_response_mode:
        return 0.0

    # In single response mode, use simple text similarity instead
    similarity = compute_fallback_similarity(responses1, responses2)
    print(f"    Fallback similarity {pair_label}: {similarity:.4f}")
    return similarity


def compute_pairwise_similarities(data, metric='bertscore', single_response_mode=True):
    """
    Compute pairwise similarities between identities for each concept
    Returns similarity matrix in the format expected by Kendall's tau analysis

    Each unordered identity pair is scored exactly once and the value is
    stored in both directions. (Previously each identity's row was reset
    when its turn came, which discarded the mirrored entries and caused
    every pair to be scored a second time.)

    Args:
        data: Response data in utils format
              (data[topic][concept][identity][template] = [responses...])
        metric: Similarity metric to use ('bertscore', 'bleu', 'wer')
        single_response_mode: If True, handles single response per identity-concept pair
                              by using compute_fallback_similarity when the metric
                              returns None or -1; if False such pairs get 0.0

    Returns:
        results[topic][concept][identity1][identity2] = similarity.
        The diagonal is 1.0. Concepts named 'all_concepts' are omitted, and
        concepts with fewer than two identities map to an empty dict.
        Pairs whose computation raises are recorded as 0.0.
    """
    results = {}

    for topic, concepts in data.items():
        results[topic] = {}

        for concept, by_identity in concepts.items():
            if concept == 'all_concepts':
                continue

            identities = list(by_identity.keys())
            concept_results = results[topic][concept] = {}

            print(f"  Processing concept '{concept}' with identities: {identities}")

            # Check if we have enough data for meaningful comparison
            if len(identities) < 2:
                print(f"    Skipping {concept}: need at least 2 identities")
                continue

            # Create every row up front so mirrored writes never get wiped.
            for identity in identities:
                concept_results[identity] = {identity: 1.0}

            # Visit each unordered pair once (identity2 always comes later).
            for index, identity1 in enumerate(identities):
                for identity2 in identities[index + 1:]:
                    try:
                        similarity = _similarity_for_pair(
                            by_identity[identity1],
                            by_identity[identity2],
                            metric,
                            single_response_mode,
                            f"{identity1}-{identity2}",
                        )
                    except Exception as e:
                        # Best-effort: one failing pair must not abort the
                        # whole run, so it is logged and recorded as 0.0.
                        print(f"    Error computing similarity for {identity1}-{identity2}: {e}")
                        similarity = 0.0

                    # Ensure symmetry
                    concept_results[identity1][identity2] = similarity
                    concept_results[identity2][identity1] = similarity

    return results

def compute_fallback_similarity(responses1, responses2):
    """
    Text similarity used when two identities have no common templates.

    Only the first response under the first template of each identity is
    compared. The score is the mean of sentence-level BLEU computed in both
    directions. Returns 0.0 when either side has no template, no response,
    an empty response, no tokens, or when anything raises along the way.
    """
    try:
        templates_a = list(responses1.keys())
        templates_b = list(responses2.keys())
        if not (templates_a and templates_b):
            return 0.0

        # First response of the first template; empty string if the list is empty.
        candidates_a = responses1[templates_a[0]]
        text_a = candidates_a[0] if candidates_a else ""
        candidates_b = responses2[templates_b[0]]
        text_b = candidates_b[0] if candidates_b else ""
        if not (text_a and text_b):
            return 0.0

        # Imported lazily so the early exits above do not need nltk.
        from nltk.translate.bleu_score import sentence_bleu
        from nltk.tokenize import word_tokenize

        words_a = word_tokenize(text_a)
        words_b = word_tokenize(text_b)
        if not (words_a and words_b):
            return 0.0

        # BLEU depends on which text is the reference, so score both
        # directions and average them.
        a_against_b = sentence_bleu([words_b], words_a)
        b_against_a = sentence_bleu([words_a], words_b)
        return (a_against_b + b_against_a) / 2

    except Exception as e:
        print(f"      Fallback similarity computation failed: {e}")
        return 0.0

def load_cultural_distances(cultural_vectors_path, cultural_vectors_type):
    """Pick the utils loader that matches the type string and call it.

    A type containing 'hofstede' uses utils.get_hofstede_distances; otherwise
    a type containing 'wvs' uses utils.get_wvs_250_dims_distance. Both are
    called with (cultural_vectors_path, cultural_vectors_type).

    Raises:
        ValueError: If the type string contains neither marker.
    """
    is_hofstede = 'hofstede' in cultural_vectors_type
    if not is_hofstede and 'wvs' not in cultural_vectors_type:
        raise ValueError(f"Unknown cultural vectors type: {cultural_vectors_type}")

    # 'hofstede' takes precedence when both markers are present.
    loader = utils.get_hofstede_distances if is_hofstede else utils.get_wvs_250_dims_distance
    return loader(cultural_vectors_path, cultural_vectors_type)

def extract_similarity_matrix(similarity_results, concept):
    """Build a dense similarity matrix for one concept.

    Args:
        similarity_results: results[topic][concept][identity1][identity2] = similarity
        concept: Concept to look up; the first topic containing it is used.

    Returns:
        (matrix, identities): an n x n float array and the identity labels in
        row/column order. Pairs with no stored value are 0.0.
        (None, None) if no topic contains the concept.
    """
    per_identity = next(
        (by_concept[concept]
         for by_concept in similarity_results.values()
         if concept in by_concept),
        None,
    )
    if per_identity is None:
        return None, None

    labels = list(per_identity.keys())
    size = len(labels)
    grid = np.zeros((size, size))  # cells without a stored score stay 0.0

    for row, row_label in enumerate(labels):
        row_scores = per_identity[row_label]
        for col, col_label in enumerate(labels):
            if col_label in row_scores:
                grid[row, col] = row_scores[col_label]

    return grid, labels

def extract_cultural_matrix(cultural_distances, identities):
    """Extract cultural distance matrix for given identities

    Pairs that have no entry in ``cultural_distances`` are marked as NaN
    rather than 0.0: a zero distance would claim the two cultures are
    identical and bias the rank correlation, whereas NaN entries are
    dropped by the finite-value filter in compute_kendalls_tau.

    Args:
        cultural_distances: Nested lookup indexed as cultural_distances[id1][id2]
        identities: Identity labels defining the row/column order

    Returns:
        n x n float array. Known pairs hold the looked-up distance, the
        diagonal defaults to 0.0 when not present in the lookup, and all
        other unknown pairs are NaN.
    """
    n = len(identities)
    matrix = np.full((n, n), np.nan)

    for i, id1 in enumerate(identities):
        has_row = id1 in cultural_distances
        for j, id2 in enumerate(identities):
            if has_row and id2 in cultural_distances[id1]:
                matrix[i, j] = cultural_distances[id1][id2]
            elif i == j:
                # Distance of an identity to itself.
                matrix[i, j] = 0.0

    return matrix

def compute_kendalls_tau(similarity_matrix, cultural_matrix):
    """Compute Kendall's tau between similarity and cultural distance matrices

    Only the strict upper triangle is used, so each unordered pair counts
    once and the diagonal is ignored. Pairs where either value is NaN or
    infinite are dropped.

    Args:
        similarity_matrix: n x n array of text similarities
        cultural_matrix: n x n array of cultural distances, same ordering

    Returns:
        (tau, p_value), or (None, None) when fewer than two usable pairs
        remain or when tau is undefined because one of the two series is
        constant.
    """
    # Get upper triangular indices (excluding diagonal)
    n = similarity_matrix.shape[0]
    triu_indices = np.triu_indices(n, k=1)

    sim_values = similarity_matrix[triu_indices]
    cultural_values = cultural_matrix[triu_indices]

    # Remove any NaN or infinite values
    valid_mask = np.isfinite(sim_values) & np.isfinite(cultural_values)
    sim_values = sim_values[valid_mask]
    cultural_values = cultural_values[valid_mask]

    if len(sim_values) < 2:
        return None, None

    # A constant series has no ranking information; tau would be NaN and
    # would silently poison any mean taken over the results.
    if np.all(sim_values == sim_values[0]) or np.all(cultural_values == cultural_values[0]):
        return None, None

    tau, p_value = kendalltau(sim_values, cultural_values)
    if np.isnan(tau):
        return None, None
    return tau, p_value

def analyze_model_correlations(model_file, cultural_vectors_path, cultural_vectors_type,
                             text_similarity_metric='bertscore', single_response_mode=True):
    """Analyze correlations for a single model

    Loads the model's responses, computes pairwise text similarities between
    identities for every concept, and correlates them (Kendall's tau) with
    the cultural distances between the same identities.

    Args:
        model_file: File name of the JSONL responses inside INFERENCE_RESULTS_DIR
        cultural_vectors_path: Path passed to load_cultural_distances
        cultural_vectors_type: Type string passed to load_cultural_distances
        text_similarity_metric: Metric name passed to compute_pairwise_similarities
        single_response_mode: Forwarded to the conversion and similarity steps

    Returns:
        Dict mapping concept -> {'tau', 'p_value', 'identities', 'n_pairs'}.
        'n_pairs' is the total number of identity pairs for the concept (it
        does not subtract pairs dropped for non-finite values). Concepts
        that are skipped are absent. Returns {} if the cultural distances
        cannot be loaded.
    """
    print(f"\nAnalyzing {model_file}...")

    # Load and process responses
    file_path = os.path.join(INFERENCE_RESULTS_DIR, model_file)
    responses = load_jsonl_responses(file_path)
    print(f"Loaded {len(responses)} responses")

    # Check data structure
    concepts = {r['concept'] for r in responses}
    identities = {r['identity'] for r in responses}
    print(f"Found {len(concepts)} concepts: {list(concepts)}")
    print(f"Found {len(identities)} identities: {list(identities)}")

    # Load cultural distances before the similarity pass: loading is the
    # cheap step, so a failure here should not cost a full similarity run.
    print("Loading cultural distances...")
    try:
        cultural_distances = load_cultural_distances(cultural_vectors_path, cultural_vectors_type)
    except Exception as e:
        print(f"Error loading cultural distances: {e}")
        return {}

    # Convert to utils format
    data = convert_to_utils_format(responses, single_response_mode=single_response_mode)

    # Compute similarities
    print("Computing pairwise similarities...")
    similarity_results = compute_pairwise_similarities(data, text_similarity_metric,
                                                     single_response_mode=single_response_mode)

    # Analyze each concept
    results = {}
    concepts_list = [concept for topic in data for concept in data[topic] if concept != 'all_concepts']

    for concept in concepts_list:
        print(f"Analyzing concept: {concept}")

        # Extract matrices
        sim_matrix, identities_list = extract_similarity_matrix(similarity_results, concept)
        if sim_matrix is None:
            print(f"  No similarity matrix for {concept}")
            continue

        # Check if we have enough identities for correlation
        if len(identities_list) < 3:
            print(f"  Not enough identities for {concept} (need at least 3, got {len(identities_list)})")
            continue

        cultural_matrix = extract_cultural_matrix(cultural_distances, identities_list)

        # Check if cultural matrix has valid values
        cultural_flat = cultural_matrix[np.triu_indices(len(identities_list), k=1)]
        if np.all(cultural_flat == 0) or not np.any(np.isfinite(cultural_flat)):
            print(f"  No valid cultural distances for identities: {identities_list}")
            continue

        # Compute Kendall's tau
        tau, p_value = compute_kendalls_tau(sim_matrix, cultural_matrix)

        # A NaN tau (undefined correlation) is treated like a missing one so
        # it cannot contaminate aggregate statistics computed later.
        if tau is None or np.isnan(tau):
            print(f"  Could not compute tau for {concept}")
            continue

        n_identities = len(identities_list)
        results[concept] = {
            'tau': tau,
            'p_value': p_value,
            'identities': identities_list,
            'n_pairs': n_identities * (n_identities - 1) // 2
        }
        print(f"  Tau: {tau:.4f}, p-value: {p_value:.4f} (n_pairs: {results[concept]['n_pairs']})")

    return results

def run_full_analysis(single_response_mode=True):
    """
    Analyze every model in MODEL_FILES against every cultural distance source.

    Args:
        single_response_mode: True when there is only 1 response per concept-identity pair,
                             False when there are multiple responses per concept-identity pair

    Returns:
        Dict of the form results[source name][model name] = per-concept results
        as returned by analyze_model_correlations. Sources whose CSV file is
        missing are left out, as are models whose analysis raised.
    """

    # Cultural distance sources; paths are built from CULTURAL_VALUES_DIR.
    distance_sources = [
        {
            'name': 'Hofstede',
            'path': os.path.join(CULTURAL_VALUES_DIR, 'distances_hofstede_raw_with_demonymns.csv'),
            'type': 'hofstede_vector_distance'
        },
        {
            'name': 'World Values Survey',
            'path': os.path.join(CULTURAL_VALUES_DIR, 'wvs_250_dims_with_demonyms.csv'),
            'type': 'wvs_250_dims_vector_distance'
        }
    ]

    collected = {}

    for source in distance_sources:
        source_name, source_path = source['name'], source['path']

        print(f"\n{'='*50}")
        print(f"ANALYZING WITH {source_name.upper()}")
        print(f"{'='*50}")

        if not os.path.exists(source_path):
            print(f"Cultural values file not found: {source_path}")
            continue

        per_model = collected[source_name] = {}

        for model_file in MODEL_FILES:
            # Drop the 'responses_' and '.jsonl' substrings to get a display name.
            model_name = model_file.replace('responses_', '').replace('.jsonl', '')

            try:
                # The text similarity metric is fixed to BERTScore here.
                per_model[model_name] = analyze_model_correlations(
                    model_file,
                    source_path,
                    source['type'],
                    'bertscore',
                    single_response_mode=single_response_mode
                )
            except Exception as e:
                # A failing model is reported and skipped; the others still run.
                print(f"Error analyzing {model_file}: {str(e)}")

    return collected

def visualize_results(all_results):
    """Print summary statistics and draw a tau heatmap per cultural dimension.

    Args:
        all_results: Nested dict all_results[cultural_dim][model][concept],
                     where each leaf has 'tau' and 'p_value' entries.

    For each cultural dimension this prints the mean and standard deviation
    of tau and the number of p-values below 0.05. A concept-by-model heatmap
    is shown only when there is more than one concept and more than one model.
    """

    for cultural_dim, per_model in all_results.items():
        print(f"\n{cultural_dim} Results Summary:")
        print("-" * 40)

        # Flatten the model -> concept nesting into one record per result.
        records = [
            (model, concept, per_model[model][concept]['tau'], per_model[model][concept]['p_value'])
            for model in per_model
            for concept in per_model[model]
        ]

        if not records:
            print("No valid results to visualize")
            continue

        models, concepts, taus, p_values = (list(column) for column in zip(*records))

        # Tabular form used for the pivot below.
        df = pd.DataFrame({
            'Model': models,
            'Concept': concepts,
            'Tau': taus,
            'P_Value': p_values
        })

        # Summary statistics across all model/concept combinations.
        print(f"Mean Tau: {np.mean(taus):.4f}")
        print(f"Std Tau: {np.std(taus):.4f}")
        print(f"Significant correlations (p<0.05): {sum(p < 0.05 for p in p_values)}/{len(p_values)}")

        # A heatmap needs at least a 2 x 2 grid to be worth drawing.
        if len(set(concepts)) <= 1 or len(set(models)) <= 1:
            continue

        pivot_df = df.pivot(index='Concept', columns='Model', values='Tau')

        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot_df, annot=True, cmap='RdBu_r', center=0,
                   fmt='.3f', cbar_kws={'label': "Kendall's Tau"})
        plt.title(f'Kendall\'s Tau Correlations - {cultural_dim}')
        plt.tight_layout()
        plt.show()

# Run the full analysis
# This part of the cell executes at top level: it runs the analysis for every
# cultural distance source and model, draws the plots, and prints a per-model
# summary. `results` and `SINGLE_RESPONSE_MODE` stay defined as module-level names.
print("Starting VLM Cultural Competence Analysis...")
print("This will compute Kendall's tau correlations between text similarities and cultural distances")

# Set single_response_mode=True for current setup with 1 response per concept-identity pair
# Change to False once you have 5+ responses per prompt
SINGLE_RESPONSE_MODE = True

print(f"Running in {'single response' if SINGLE_RESPONSE_MODE else 'multiple response'} mode")

# Note: Make sure you have the cultural values CSV files in the right location
# (run_full_analysis skips any source whose CSV file does not exist).
results = run_full_analysis(single_response_mode=SINGLE_RESPONSE_MODE)

# Visualize results
visualize_results(results)

# Print summary
print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)

# results[cultural_dim][model][concept] holds the 'tau' / 'p_value' entries.
for cultural_dim in results:
    print(f"\n{cultural_dim} Summary:")
    # Running totals across all models for this cultural dimension.
    total_concepts = 0
    total_significant = 0

    for model in results[cultural_dim]:
        # Number of concepts for which a tau was obtained for this model.
        concepts = len(results[cultural_dim][model])
        # Concepts whose correlation has p < 0.05 (no multiple-comparison correction is applied here).
        significant = sum(1 for c in results[cultural_dim][model]
                        if results[cultural_dim][model][c]['p_value'] < 0.05)
        total_concepts += concepts
        total_significant += significant

        print(f"  {model}: {concepts} concepts, {significant} significant correlations")

    # Guard against division by zero when no concept produced a result.
    if total_concepts > 0:
        print(f"  Overall: {total_significant}/{total_concepts} significant correlations "
                f"({100*total_significant/total_concepts:.1f}%)")

print(f"\nNote: Running in {'single response' if SINGLE_RESPONSE_MODE else 'multiple response'} mode")
if SINGLE_RESPONSE_MODE:
    print("Switch to SINGLE_RESPONSE_MODE = False once you have 5+ responses per prompt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Starting VLM Cultural Competence Analysis...
This will compute Kendall's tau correlations between text similarities and cultural distances
Running in single response mode

ANALYZING WITH HOFSTEDE

Analyzing responses_gemma_3_4b_it.jsonl...
Loaded 2937 responses
Found 35 concepts: ['zoo', 'near a river', 'farm animals', 'kindness', 'robots', 'school', 'pets', 'space', 'compassion', 'humility', 'patience', 'fishes', 'sharing', 'ocean', 'library', 'respect', 'mythological figures', 'magicians', 'honesty', 'never hurting anyone', 'not being selfish', 'insects', 'friends', 'wild animals', 'forest', 'gratitude', 'not lying', 'birds', 'cooperation', 'playground', 'hard work', 'place of worship', 'empathy', 'farm', 'not being greedy']
Found 42 identities: ['Filipino', 'Tanzanian', 'Czech', 'Dutch', 'Indonesian', 'Indian', 'Egyptian', 'South Sudanese', 'German', 'Romanian', 'Mexican', 'Greek', 'Pakistani', 'Australian', 'Iraqi', 'Turkish', 'Zimbabwean', 'Polish', 'Israeli', 'Canadian', 'America

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


    Similarity American-Indian: 0.8602
    Similarity American-Nigerian: 0.8607
    Similarity American-Pakistani: 0.8573
    Similarity American-Indonesian: 0.8413
