In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datasets import Dataset, DatasetDict
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: wide default figure size and seaborn's
# 'whitegrid' axes style.
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style('whitegrid')

## Load Data from All Models

In [2]:
# Define base directory and model names
base_dir = Path("../data/generated_captions")
models = ['ft', 'base', 'zero_shot']

# predictions: model name -> DataFrame read from "<base_dir>/<model>.csv".
# metrics: model name -> dict; nothing in this cell fills it, it is only
# initialised so later code can rely on the keys being present.
predictions = {}
metrics = {}
missing_models = []

for model in models:
    metrics[model] = {}

    # Load predictions
    pred_path = base_dir / f"{model}.csv"
    if pred_path.exists():
        predictions[model] = pd.read_csv(pred_path)
    else:
        # Loading stays best-effort: keep the empty placeholder so the key
        # exists, but remember the model so the gap is reported below instead
        # of surfacing later as a confusing error on a plain dict.
        predictions[model] = {}
        missing_models.append(model)

# Report what was actually read, not how many models were requested.
loaded_models = [m for m in models if m not in missing_models]
print(f"\nLoaded data for {len(loaded_models)} models")
if missing_models:
    # print() rather than warnings.warn(): warnings are globally silenced
    # by the filterwarnings('ignore') call in the import cell.
    print(f"No predictions file found in {base_dir} for: {', '.join(missing_models)}")


Loaded data for 3 models


In [None]:
# Preview the first rows of each model's predictions, in the order
# zero_shot, base, ft.
display(*(predictions[name].head() for name in ('zero_shot', 'base', 'ft')))

In [None]:
# Print the first row's aspect list and generated caption for every model so
# the outputs can be read side by side.
for model in ['zero_shot', 'base', 'ft']:
    model_df = predictions[model]
    print(f"\nModel: {model}")
    for label, column in (("Aspects:", 'aspect_list'), ("Caption:", 'prediction')):
        print(label, model_df[column].iloc[0])

## Analyze overall statistics and compare to MusicCaps as a baseline

In [None]:
from datasets import load_dataset

# Fetch the "train" split of google/MusicCaps, convert it to a DataFrame and
# register it under the 'mc' key next to the model predictions.
mc_dataset = load_dataset("google/MusicCaps", split="train")
predictions['mc'] = mc_df = mc_dataset.to_pandas()
mc_df

In [None]:
# Compare caption lengths (characters and words) across the model outputs and
# the MusicCaps captions, then plot both distributions side by side.
models_to_compare = ['zero_shot', 'base', 'ft', 'mc']
model_labels = {
    'zero_shot': 'Zero Shot',
    'base': 'Base LLM',
    'ft': 'Fine-tuned LLM',
    'mc': 'MusicCaps Dataset'
}

comparison_data = []
for model in models_to_compare:
    df = predictions[model]
    # Use 'prediction' when the frame has it; frames without that column
    # (presumably the MusicCaps frame) fall back to 'caption'.
    col_name = 'prediction' if 'prediction' in df.columns else 'caption'

    # Work on a derived Series instead of writing the string-cast column back
    # into the shared frame: the frames in `predictions` are reused by later
    # cells and must not be altered by a plotting step.
    # Missing captions are counted as empty text; a plain astype(str) would
    # turn them into the literal "nan"/"None" and report a bogus length.
    captions = df[col_name].fillna('').astype(str)

    # One long-format frame per model, labelled for the plot legend.
    comparison_data.append(pd.DataFrame({
        'Model': model_labels.get(model, model),
        'Word Count': captions.str.split().str.len(),
        'Character Count': captions.str.len()
    }))

viz_df = pd.concat(comparison_data, ignore_index=True)

# Activate the style before the figure is created so the axes are built with
# it. NOTE(review): 'petroff10' only ships with recent Matplotlib releases;
# confirm the environment provides it.
plt.style.use('petroff10')
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Same histogram + KDE for both metrics; density is normalised per model
# (common_norm=False) so groups of different size stay comparable.
for ax, metric in zip(axes, ('Character Count', 'Word Count')):
    sns.histplot(
        data=viz_df,
        x=metric,
        hue='Model',
        kde=True,
        element='step',
        stat='density',
        common_norm=False,
        alpha=0.3,
        ax=ax
    )
    ax.set_title(f'Distribution of {metric}s')
    ax.set_xlabel(metric)

plt.tight_layout()
plt.show()

### Create Hugging Face datasets for further analysis

In [None]:
# Publish each model's predictions to the Hub as a dataset with a single
# 'test' split.
for model in models:
    # predictions[model] already is the model's full DataFrame. Indexing it
    # with ['test'] would look up a *column* named 'test', not a split.
    df = predictions[model]
    if not isinstance(df, pd.DataFrame):
        # The loading cell leaves a non-DataFrame placeholder when the CSV
        # was not found; there is nothing to upload for that model.
        print(f"Skipping {model}: no predictions were loaded")
        continue
    hf_dataset = Dataset.from_pandas(df)
    hf_dataset_dict = DatasetDict({ 'test': hf_dataset })
    hf_dataset_dict.push_to_hub(f"bsienkiewicz/{model}-caption-inference-dataset")

In [3]:
# Read the quick-test predictions CSV and upload it as a dataset whose only
# split is 'test'. The push call is the cell's last expression, so its return
# value is shown as the cell output.
quick_test_path = base_dir / "quick_test.csv"
quick_test = pd.read_csv(quick_test_path)
hf_dataset = Dataset.from_pandas(quick_test)
hf_dataset_dict = DatasetDict({'test': hf_dataset})
hf_dataset_dict.push_to_hub("bsienkiewicz/quick-test-caption-inference-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/bsienkiewicz/quick-test-caption-inference-dataset/commit/2a94a7780103202bfa00399d901ef3f57eeb5e41', commit_message='Upload dataset', commit_description='', oid='2a94a7780103202bfa00399d901ef3f57eeb5e41', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bsienkiewicz/quick-test-caption-inference-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bsienkiewicz/quick-test-caption-inference-dataset'), pr_revision=None, pr_num=None)

## Push ConceptCaps dataset captions

In [None]:
# Read the final per-split prediction CSVs.
# From here on `predictions` maps split name -> DataFrame and `datasets`
# maps split name -> Dataset built from that DataFrame.
final_dir = Path("../outputs/caption_inference/final")
splits = ['train', 'validation', 'test']
predictions, datasets = {}, {}

for split in splits:
    # Each split lives in "<final_dir>/<split>_predictions.csv"
    pred_path = final_dir / f"{split}_predictions.csv"
    df = pd.read_csv(pred_path)
    display(df.head())

    # Convert to a Dataset right after loading, then store both views.
    dataset = Dataset.from_pandas(df)
    predictions[split], datasets[split] = df, dataset
    print(f"Loaded {split}: {len(df)} samples")

In [None]:
# Analyze final inference predictions: mean caption length in characters,
# reported per split.
for split, df in predictions.items():
    char_lengths = df['prediction'].astype(str).map(len)
    print(f"{split} - Avg Prediction Length: {char_lengths.mean()}")

In [None]:
# Count whitespace-separated words per caption, store the result in a
# 'word_count' column on each split's frame, and print the per-split mean.
for split, df in predictions.items():
    caption_text = df['prediction'].astype(str)
    df['word_count'] = caption_text.str.split().str.len()
    print(f"{split} - Avg Word Count: {df['word_count'].mean()}")

In [None]:
# One KDE curve per split over the caption length in characters, all drawn
# on the same axes.
plt.figure(figsize=(10, 6))
for split, df in predictions.items():
    char_lengths = df['prediction'].astype(str).map(len)
    sns.kdeplot(char_lengths, label=split)

ax = plt.gca()
ax.set_title('Distribution of Prediction Lengths for Final Inference Predictions')
ax.set_xlabel('Prediction Length')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Correlate caption length (in characters) with the number of aspects.
import ast

for split, df in predictions.items():
    # 'aspect_list' is parsed with ast.literal_eval and its length is stored
    # in a 'num_aspects' column on the split's frame.
    df['num_aspects'] = df['aspect_list'].map(lambda raw: len(ast.literal_eval(raw)))
    caption_lengths = df['prediction'].astype(str).map(len)
    correlation = caption_lengths.corr(df['num_aspects'])
    print(f"{split} - Correlation between Prediction Length and Number of Aspects: {correlation}")

In [None]:
# Bundle the per-split datasets, in the order given by `splits`, into one
# DatasetDict and upload it to the ConceptCaps repository.
split_datasets = {name: datasets[name] for name in splits}
hf_dataset_dict = DatasetDict(split_datasets)
hf_dataset_dict.push_to_hub("bsienkiewicz/ConceptCaps")