# Similarity System Visualizations

Three focused visualizations for the hybrid movie recommendation system:

1. **Component Contribution Breakdown** - Shows how each similarity component contributes to the final hybrid score for recommendations (the key visualization)
2. **Similarity Distribution Comparison** - Box plots of each matrix's value distribution
3. **Weight Sensitivity Analysis** - How different weight configurations affect recommendations

## Setup

Run your existing data loading code first, then add these visualization cells.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Notebook-wide plotting defaults: 10x6 inch figures rendered at 100 DPI,
# drawn on seaborn's "whitegrid" theme.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})
sns.set_style("whitegrid")

---
## Visualization 1: Component Contribution Breakdown

This is the key visualization — a stacked bar chart showing how much each similarity component (info embedding, genre, year, etc.) contributes to the final hybrid score of each recommendation.

**Why this matters:**
- Answers "What's actually driving these recommendations?"
- Exposes if one component dominates the scoring
- Demonstrates transparency in the hybrid approach

In [None]:
def plot_component_contributions(query_title, vectorized_data,
                                  info_sim, content_rating_sim, genre_sim,
                                  year_sim, style_sim, type_sim,
                                  weights=(0.2, 0.2, 0.2, 0.2, 0.1, 0.1),
                                  top_k=5):
    """
    Stacked bar chart showing the contribution of each similarity component
    to the final hybrid score for top-k recommendations.

    The hybrid score of a candidate is the weighted sum of its six component
    similarities to the query movie. Candidates are ranked by that score, the
    query itself is skipped, and the top_k best are plotted as stacked bars
    (one segment per component) followed by a printed numeric table.

    Parameters:
    -----------
    query_title : str - The movie to find recommendations for (matched
        case-insensitively against each movie's 'movie_title')
    vectorized_data : list - List of movie dictionaries; each must have a
        'movie_title' key
    *_sim : np.ndarray - Individual similarity matrices, indexed as
        [query_index, candidate_index] (assumed to share the ordering of
        vectorized_data - confirm against the code that builds them)
    weights : tuple - (alpha, beta, gamma, delta, epsilon, zeta), applied to
        info, content rating, genre, year, style and type respectively
    top_k : int - Number of recommendations to display

    Returns:
    --------
    None. Prints a message and returns early when the query title is not
    found or when there is nothing to recommend.

    Raises:
    -------
    ValueError - if weights does not contain exactly six values
    """
    # Unpack weights (fails with ValueError unless exactly six are supplied)
    alpha, beta, gamma, delta, epsilon, zeta = weights

    # Find query movie index: first case-insensitive exact title match
    wanted = query_title.lower()
    query_idx = next(
        (i for i, movie in enumerate(vectorized_data)
         if movie['movie_title'].lower() == wanted),
        None,
    )
    if query_idx is None:
        print(f"Movie '{query_title}' not found.")
        return

    # (legend label, weight, similarity matrix, bar color) in stacking order
    components = [
        (f'Info Embedding (α={alpha})', alpha, info_sim, '#2ecc71'),
        (f'Content Rating (β={beta})', beta, content_rating_sim, '#3498db'),
        (f'Genre (γ={gamma})', gamma, genre_sim, '#9b59b6'),
        (f'Year (δ={delta})', delta, year_sim, '#f39c12'),
        (f'Review Style (ε={epsilon})', epsilon, style_sim, '#e74c3c'),
        (f'Review Type (ζ={zeta})', zeta, type_sim, '#1abc9c'),
    ]

    # Only the query's row of each matrix is ever read, so weight just that
    # row instead of materialising the full N x N hybrid matrix.
    weighted_rows = [weight * sim[query_idx] for _, weight, sim, _ in components]
    sim_scores = sum(weighted_rows)

    # Rank candidates by hybrid score (descending) and drop the query itself
    sorted_indices = np.argsort(sim_scores)[::-1]
    top_indices = [idx for idx in sorted_indices if idx != query_idx][:top_k]

    # Nothing to plot (single-movie dataset or top_k == 0): bail out before
    # max() / axis-limit code is handed an empty sequence.
    if not top_indices:
        print(f"No recommendations available for '{query_title}'.")
        return

    # Titles are truncated to 25 characters to keep tick labels readable
    movie_titles = [vectorized_data[idx]['movie_title'][:25] for idx in top_indices]
    # One array per component holding its weighted contribution per recommendation
    contributions = [row[top_indices] for row in weighted_rows]

    # Create stacked bar chart
    fig, ax = plt.subplots(figsize=(12, 6))

    x = np.arange(len(movie_titles))
    width = 0.6

    # Stack the bars: each segment sits on the running sum of those below it
    totals = np.zeros(len(movie_titles))
    for (label, _, _, color), values in zip(components, contributions):
        ax.bar(x, values, width, bottom=totals, label=label, color=color)
        totals = totals + values  # rebind (not in-place) so earlier bottoms stay intact

    # Add total score labels on top
    for i, total in enumerate(totals):
        ax.text(i, total + 0.01, f'{total:.3f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

    ax.set_xlabel('Recommended Movies', fontsize=11)
    ax.set_ylabel('Weighted Similarity Contribution', fontsize=11)
    ax.set_title(f'Hybrid Score Component Breakdown\nQuery: "{query_title}"', fontsize=13, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(movie_titles, rotation=45, ha='right', fontsize=9)
    ax.legend(loc='upper right', fontsize=8)
    # Leave 15% headroom for the labels; fall back to 1.0 when every total is
    # non-positive so the axis is never degenerate or inverted.
    peak = max(totals)
    ax.set_ylim(0, peak * 1.15 if peak > 0 else 1.0)

    plt.tight_layout()
    plt.show()

    # Print numerical breakdown
    print(f"\n{'='*80}")
    print(f"Component Contribution Analysis for Query: {query_title}")
    print(f"{'='*80}")
    print(f"{'Movie':<28} {'Info':>8} {'Rating':>8} {'Genre':>8} {'Year':>8} {'Style':>8} {'Type':>8} {'TOTAL':>8}")
    print(f"{'-'*80}")
    for i, title in enumerate(movie_titles):
        cells = ' '.join(f"{values[i]:>8.4f}" for values in contributions)
        print(f"{title:<28} {cells} {totals[i]:>8.4f}")

In [None]:
# Example usage - uncomment and modify after loading your data
# plot_component_contributions(
#     query_title="Alien",
#     vectorized_data=vectorized_data,
#     info_sim=info_sim,
#     content_rating_sim=content_rating_sim,
#     genre_sim=genre_sim,
#     year_sim=year_sim,
#     style_sim=style_sim,
#     type_sim=type_sim,
#     weights=(0.2, 0.2, 0.2, 0.2, 0.1, 0.1),
#     top_k=5
# )

---
## Visualization 2: Similarity Distribution Comparison

Box plots showing the value distribution of each similarity matrix across all movie pairs.

**Why this matters:**
- Different matrices have different natural ranges and variances
- Justifies why components might be weighted differently
- Identifies degenerate matrices (e.g., clustering near 0.5)

In [None]:
def plot_similarity_distributions(info_sim, content_rating_sim, genre_sim,
                                   year_sim, style_sim, type_sim):
    """
    Box plots comparing the distribution of similarity values across all
    movie pairs for each similarity component.

    Every matrix is read at the strict upper-triangle positions derived from
    info_sim's first dimension, so each unordered pair is counted once and
    self-similarity (the diagonal) is excluded. The mean of each component is
    overlaid as a black diamond and summary statistics are printed afterwards.

    Parameters:
    -----------
    *_sim : np.ndarray - Individual similarity matrices (assumed square and
        all the same shape as info_sim - confirm against the code that
        builds them)

    Returns:
    --------
    None. Prints a message and returns early when there are fewer than two
    movies, since no pairs exist to summarise.
    """
    n = info_sim.shape[0]
    # With fewer than two movies the upper triangle is empty and the summary
    # statistics (np.min / np.max) would raise on a zero-size array.
    if n < 2:
        print("Need at least two movies to compare similarity distributions.")
        return

    # Take every strict upper-triangle value (k=1 skips the diagonal) so that
    # self-similarity is excluded and each pair appears once. No sub-sampling.
    upper_indices = np.triu_indices(n, k=1)

    # Extract values
    data = {
        'Info\nEmbedding': info_sim[upper_indices],
        'Content\nRating': content_rating_sim[upper_indices],
        'Genre': genre_sim[upper_indices],
        'Year': year_sim[upper_indices],
        'Review\nStyle': style_sim[upper_indices],
        'Review\nType': type_sim[upper_indices]
    }
    names = list(data.keys())
    samples = list(data.values())
    # Box i is drawn at x = i + 1 (matplotlib's default box positions)
    positions = list(range(1, len(names) + 1))

    fig, ax = plt.subplots(figsize=(10, 6))

    # Create box plots. Tick labels are set through the axes rather than the
    # boxplot `labels=` keyword, which newer Matplotlib releases deprecate.
    bp = ax.boxplot(samples, positions=positions, patch_artist=True)
    ax.set_xticks(positions)
    ax.set_xticklabels(names)

    # Color the boxes
    colors = ['#2ecc71', '#3498db', '#9b59b6', '#f39c12', '#e74c3c', '#1abc9c']
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

    ax.set_ylabel('Similarity Value', fontsize=11)
    ax.set_xlabel('Similarity Component', fontsize=11)
    ax.set_title('Distribution of Similarity Values by Component\n(All Movie Pairs)',
                 fontsize=13, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

    # Add mean markers at the same x positions as the boxes
    means = [np.mean(v) for v in samples]
    ax.scatter(positions, means, marker='D', color='black', s=50, zorder=3, label='Mean')
    ax.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

    # Print statistics
    print(f"\n{'='*80}")
    print("Similarity Matrix Statistics")
    print(f"{'='*80}")
    print(f"{'Component':<20} {'Mean':>12} {'Std':>12} {'Min':>12} {'Max':>12} {'Median':>12}")
    print(f"{'-'*80}")
    for name, values in data.items():
        # Plot labels contain line breaks; flatten them for the text table
        name_clean = name.replace('\n', ' ')
        print(f"{name_clean:<20} {np.mean(values):>12.4f} {np.std(values):>12.4f} "
              f"{np.min(values):>12.4f} {np.max(values):>12.4f} {np.median(values):>12.4f}")

In [None]:
# Example usage - uncomment after loading your data
# plot_similarity_distributions(
#     info_sim=info_sim,
#     content_rating_sim=content_rating_sim,
#     genre_sim=genre_sim,
#     year_sim=year_sim,
#     style_sim=style_sim,
#     type_sim=type_sim
# )

---
## Visualization 3: Weight Sensitivity Analysis

Line plot showing how varying each component's weight (with the remaining weights rescaled proportionally so the total stays at 1) affects the average hybrid score of the top-k recommendations.

**Why this matters:**
- Justifies weight choices empirically
- Shows which components have high discriminative power
- Identifies components with low marginal impact

In [None]:
def plot_weight_sensitivity(query_title, vectorized_data,
                            info_sim, content_rating_sim, genre_sim,
                            year_sim, style_sim, type_sim,
                            base_weights=(0.2, 0.2, 0.2, 0.2, 0.1, 0.1),
                            top_k=10):
    """
    Line plot showing how varying each weight (while keeping others proportional)
    affects the average top-k recommendation score.

    For each component in turn, its weight is swept over 20 evenly spaced
    values from 0 to 0.6. The other five weights are rescaled in proportion to
    their base values so that all six sum to 1. For every setting the hybrid
    score of each candidate (the query itself excluded) is computed and the
    mean of the top_k highest scores is plotted. Dashed vertical lines mark
    each component's base weight.

    Parameters:
    -----------
    query_title : str - The movie to analyse (matched case-insensitively
        against each movie's 'movie_title')
    vectorized_data : list - List of movie dictionaries; each must have a
        'movie_title' key
    *_sim : np.ndarray - Individual similarity matrices, indexed by the
        query's position (assumed to share the ordering of vectorized_data -
        confirm against the code that builds them)
    base_weights : tuple - Six baseline weights, in the order info, content
        rating, genre, year, style, type
    top_k : int - Number of top recommendations averaged per weight setting

    Returns:
    --------
    None. Prints a message and returns early when the query title is not
    found, when top_k is below 1, or when there are no other movies to score.

    Raises:
    -------
    ValueError - if base_weights does not contain exactly six values
    """
    component_names = ['Info Embedding', 'Content Rating', 'Genre', 'Year', 'Review Style', 'Review Type']
    sim_matrices = [info_sim, content_rating_sim, genre_sim, year_sim, style_sim, type_sim]
    colors = ['#2ecc71', '#3498db', '#9b59b6', '#f39c12', '#e74c3c', '#1abc9c']

    # Same failure mode as tuple-unpacking six weights: wrong length is a ValueError
    if len(base_weights) != len(sim_matrices):
        raise ValueError(
            f"base_weights must contain {len(sim_matrices)} values, got {len(base_weights)}"
        )

    # Find query movie index: first case-insensitive exact title match
    wanted = query_title.lower()
    query_idx = next(
        (i for i, movie in enumerate(vectorized_data)
         if movie['movie_title'].lower() == wanted),
        None,
    )
    if query_idx is None:
        print(f"Movie '{query_title}' not found.")
        return

    if top_k < 1:
        print("top_k must be at least 1.")
        return

    # Only the query's row is ever scored, so slice it once per component and
    # remove the self-similarity entry by index. Dropping "the highest score"
    # instead would be wrong whenever the query is not its own best match.
    candidate_rows = [np.delete(matrix[query_idx], query_idx) for matrix in sim_matrices]
    if candidate_rows[0].size == 0:
        print(f"No other movies available to compare with '{query_title}'.")
        return

    weight_range = np.linspace(0, 0.6, 20)

    fig, ax = plt.subplots(figsize=(10, 6))

    for comp_idx, (name, color) in enumerate(zip(component_names, colors)):
        # Combined base weight of the other components (loop-invariant)
        original_other_sum = sum(w for i, w in enumerate(base_weights) if i != comp_idx)
        avg_scores = []

        for target_weight in weight_range:
            # Give this component the target weight and spread the remainder
            # over the others in proportion to their base weights.
            remaining = 1.0 - target_weight
            if original_other_sum > 0:
                weights = [
                    target_weight if i == comp_idx
                    else w / original_other_sum * remaining
                    for i, w in enumerate(base_weights)
                ]
            else:
                # All other base weights are zero: nothing to redistribute to
                weights = list(base_weights)
                weights[comp_idx] = target_weight

            # Hybrid score of every candidate for this weight setting
            hybrid_row = sum(w * row for w, row in zip(weights, candidate_rows))

            # Average of the top-k highest candidate scores
            top_scores = np.sort(hybrid_row)[::-1][:top_k]
            avg_scores.append(np.mean(top_scores))

        ax.plot(weight_range, avg_scores, label=name, color=color, linewidth=2, marker='o', markersize=3)

    # Mark base weights with vertical lines
    for weight, color in zip(base_weights, colors):
        ax.axvline(x=weight, color=color, linestyle='--', alpha=0.3)

    ax.set_xlabel('Component Weight', fontsize=11)
    ax.set_ylabel(f'Average Top-{top_k} Similarity Score', fontsize=11)
    ax.set_title(f'Weight Sensitivity Analysis\nQuery: "{query_title}"', fontsize=13, fontweight='bold')
    ax.legend(loc='best', fontsize=9)
    ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Example usage - uncomment after loading your data
# plot_weight_sensitivity(
#     query_title="Alien",
#     vectorized_data=vectorized_data,
#     info_sim=info_sim,
#     content_rating_sim=content_rating_sim,
#     genre_sim=genre_sim,
#     year_sim=year_sim,
#     style_sim=style_sim,
#     type_sim=type_sim,
#     base_weights=(0.2, 0.2, 0.2, 0.2, 0.1, 0.1),
#     top_k=10
# )

---
## Full Example: Load Data and Generate All Visualizations

Copy the cells below into your main notebook after your data loading code.

In [None]:
# Assuming your similarity.py functions are available:
#
# from similarity import load_or_create_vectorized_data, load_or_create_similarity_matrices
#
# vectorized_data = load_or_create_vectorized_data()
# info_sim, content_rating_sim, genre_sim, year_sim, style_sim, type_sim = load_or_create_similarity_matrices(vectorized_data)
#
# # Then run all three visualizations:
#
# # 1. Component Contributions (KEY VISUALIZATION)
# plot_component_contributions(
#     query_title="Alien",
#     vectorized_data=vectorized_data,
#     info_sim=info_sim,
#     content_rating_sim=content_rating_sim,
#     genre_sim=genre_sim,
#     year_sim=year_sim,
#     style_sim=style_sim,
#     type_sim=type_sim,
#     weights=(0.2, 0.2, 0.2, 0.2, 0.1, 0.1),
#     top_k=5
# )
#
# # 2. Similarity Distributions
# plot_similarity_distributions(
#     info_sim, content_rating_sim, genre_sim, year_sim, style_sim, type_sim
# )
#
# # 3. Weight Sensitivity
# plot_weight_sensitivity(
#     query_title="Alien",
#     vectorized_data=vectorized_data,
#     info_sim=info_sim,
#     content_rating_sim=content_rating_sim,
#     genre_sim=genre_sim,
#     year_sim=year_sim,
#     style_sim=style_sim,
#     type_sim=type_sim,
#     base_weights=(0.2, 0.2, 0.2, 0.2, 0.1, 0.1),
#     top_k=10
# )