In [2]:
%pip install plotly pandas

# 📚 Imports
import pandas as pd
import plotly.express as px
import os

# 📂 Ensure output folder exists
os.makedirs("visuals", exist_ok=True)

# 📥 Load one results CSV per precision, tag it with its precision label and
# coerce the metric columns to numbers. Coercion is done per frame (not only on
# the concatenated table) so the per-precision frames fp16/int8/int4 carry
# numeric metrics as well; values that cannot be parsed become NaN.
numeric_columns = ["BLEU Score", "Latency (ms)"]
frames_by_precision = {}
for precision_label in ("FP16", "INT8", "INT4"):
    frame = pd.read_csv(f"data/evaluation_results_{precision_label}.csv")
    frame["Precision"] = precision_label
    for column in numeric_columns:
        frame[column] = pd.to_numeric(frame[column], errors="coerce")
    frames_by_precision[precision_label] = frame

fp16 = frames_by_precision["FP16"]
int8 = frames_by_precision["INT8"]
int4 = frames_by_precision["INT4"]

# 🔗 Combine into one DataFrame (rows stay in FP16, INT8, INT4 order)
combined = pd.concat([fp16, int8, int4], ignore_index=True)

# 📌 Create a unique variant ID: "<Model>_<Precision>"
combined["variant_id"] = combined["Model"] + "_" + combined["Precision"]

# ========== 1️⃣ 2️⃣ 3️⃣ BLEU Score Comparison Within Each Precision ==========
def _save_bleu_bar(frame, title, html_path):
    """Draw one BLEU bar per model from ``frame``, write it to ``html_path`` and return the figure."""
    figure = px.bar(
        frame,
        x="Model",
        y="BLEU Score",
        color="Model",
        title=title,
        labels={"BLEU Score": "BLEU Score", "Model": "Model"},
    )
    # Tilt the model names so they do not run into each other.
    figure.update_layout(xaxis_tickangle=-45)
    figure.write_html(html_path)
    return figure


fig1 = _save_bleu_bar(
    fp16,
    "1️⃣ BLEU Score Comparison Within FP16",
    "visuals/interactive_FP16_bleu.html",
)
fig2 = _save_bleu_bar(
    int8,
    "2️⃣ BLEU Score Comparison Within INT8",
    "visuals/interactive_INT8_bleu.html",
)
fig3 = _save_bleu_bar(
    int4,
    "3️⃣ BLEU Score Comparison Within INT4",
    "visuals/interactive_INT4_bleu.html",
)

# ========== 4️⃣ Average BLEU & Latency Across Precisions ==========
# Mean of both metrics per precision. groupby sorts its keys, so the rows of
# avg_metrics follow the sorted precision labels, not the load order.
averaged_columns = ["BLEU Score", "Latency (ms)"]
avg_metrics = combined.groupby("Precision")[averaged_columns].mean().reset_index()

# Long form: one row per (precision, metric) pair, ready for grouped bars.
avg_melted = pd.melt(avg_metrics, id_vars="Precision", var_name="Metric", value_name="Value")

fig4 = px.bar(
    avg_melted,
    x="Metric",
    y="Value",
    color="Precision",
    barmode="group",
    title="4️⃣ Average BLEU and Latency Across Precisions",
    labels={"Value": "Average Value", "Metric": "Metric"},
)
fig4.write_html("visuals/interactive_avg_metrics.html")

# ========== 5️⃣ BLEU Comparison Across All Model Variants ==========
# One bar per model/precision variant, coloured by precision.
all_variants_bar_options = dict(
    x="variant_id",
    y="BLEU Score",
    color="Precision",
    title="5️⃣ BLEU Score Comparison Across All Model Variants",
    labels={"BLEU Score": "BLEU Score", "variant_id": "Model + Precision"},
)
fig5 = px.bar(combined, **all_variants_bar_options)

# Variant ids are long, so the tick labels are drawn vertically.
fig5.update_layout(xaxis_tickangle=-90)
fig5.write_html("visuals/interactive_all_variants_bleu.html")


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Add these new visualizations to your existing notebook

# ========== 6️⃣ Performance vs. Efficiency Scatter Plot ==========
# Trade-off view: latency on x, BLEU on y, one marker per row of `combined`.
tradeoff_axis_labels = {
    "BLEU Score": "BLEU Score (higher is better)",
    "Latency (ms)": "Latency in ms (lower is better)",
}
fig6 = px.scatter(
    combined,
    x="Latency (ms)",
    y="BLEU Score",
    color="Precision",
    symbol="Model",
    size="BLEU Score",
    hover_data=["Model", "Precision", "BLEU Score", "Latency (ms)"],
    title="6️⃣ Performance vs. Efficiency Trade-off",
    labels=tradeoff_axis_labels,
)

# Give every marker a thin dark outline (this only styles the points; no
# trend line is added to this figure).
marker_outline = dict(width=1, color='DarkSlateGrey')
fig6.update_traces(marker=dict(line=marker_outline))
fig6.update_layout(legend_title_text='Precision')
fig6.write_html("visuals/interactive_performance_efficiency.html")

# ========== 7️⃣ Precision Degradation Analysis ==========
# Wide table: one row per model, one BLEU column per precision (first value
# found for each model/precision pair).
pivot_df = combined.pivot_table(
    index="Model",
    columns="Precision",
    values="BLEU Score",
    aggfunc="first",
).reset_index()

# Percentage change of each quantized precision against the FP16 baseline,
# rounded to two decimals.
fp16_bleu = pivot_df["FP16"]
for precision in ("INT8", "INT4"):
    percent_change = (pivot_df[precision] - fp16_bleu) / fp16_bleu * 100
    pivot_df[f"{precision} vs FP16 (%)"] = percent_change.round(2)

# Long form for plotting: one row per (model, comparison).
comparison_columns = ["INT8 vs FP16 (%)", "INT4 vs FP16 (%)"]
degradation_df = pivot_df.melt(
    id_vars="Model",
    value_vars=comparison_columns,
    var_name="Comparison",
    value_name="Degradation (%)",
)

fig7 = px.bar(
    degradation_df,
    x="Model",
    y="Degradation (%)",
    color="Comparison",
    barmode="group",
    title="7️⃣ BLEU Score Degradation Relative to FP16",
    labels={"Degradation (%)": "% Change from FP16 (negative = worse)"},
)
fig7.update_layout(xaxis_tickangle=-45)
fig7.write_html("visuals/interactive_precision_degradation.html")

# ========== 8️⃣ Model Size vs. Performance ==========
# Placeholder mapping of model name -> size in MB. The keys below are sample
# names; any model in `combined` that is not a key here gets NaN as its size,
# so replace them with the real model names and sizes.
model_sizes = dict(
    MODEL_A=350,
    MODEL_B=420,
    MODEL_C=500,
    MODEL_D=650,
    # Add all your models here
)

# Look up each row's model in the mapping to get its size column.
combined["Model Size (MB)"] = combined["Model"].map(model_sizes)

# Bubble chart: size on x, BLEU on y, bubble area driven by latency.
size_chart_options = dict(
    x="Model Size (MB)",
    y="BLEU Score",
    size="Latency (ms)",
    color="Precision",
    symbol="Model",
    hover_data=["Model", "Precision", "BLEU Score"],
    title="8️⃣ Model Size vs. Performance Trade-off",
    labels={"BLEU Score": "BLEU Score", "Model Size (MB)": "Model Size (MB)"},
)
fig8 = px.scatter(combined, **size_chart_options)

fig8.update_layout(xaxis_title="Model Size (MB)")
fig8.write_html("visuals/interactive_size_performance.html")

# ========== 9️⃣ Performance Distribution Boxplots ==========
# One box per precision, with every individual observation drawn next to it.
fig9 = px.box(
    combined,
    x="Precision",
    y="BLEU Score",
    color="Precision",
    points="all",
    title="9️⃣ BLEU Score Distribution by Precision",
    labels={"BLEU Score": "BLEU Score", "Precision": "Precision"},
)
fig9.write_html("visuals/interactive_performance_distribution.html")

# ========== 🔟 Performance Radar Charts ==========
# Radar chart comparing several metrics per precision. BLEU and speed come
# from the measured averages; "Memory Efficiency" and "Inference Throughput"
# are hand-entered factors relative to FP16, not measurements.

# Order in which the hand-entered factors below are listed.
radar_precisions = ["FP16", "INT8", "INT4"]

# avg_metrics is produced by groupby("Precision"), which returns its rows in
# sorted key order (FP16, INT4, INT8). Re-align it to radar_precisions by label
# so each average is paired with the right precision instead of by position.
avg_by_precision = avg_metrics.set_index("Precision").reindex(radar_precisions)

metrics_df = pd.DataFrame({
    "Precision": radar_precisions,
    "BLEU Score": avg_by_precision["BLEU Score"].tolist(),
    # 1000 / latency in ms; the constant cancels in the normalization below.
    "Speed (1/Latency)": (1000 / avg_by_precision["Latency (ms)"]).tolist(),
    "Memory Efficiency": [1.0, 2.0, 4.0],  # Relative to FP16
    "Inference Throughput": [1.0, 1.8, 3.5]  # Relative to FP16
})

# Normalize every metric by its maximum so the best precision sits at 1.0
# and all axes share the [0, 1] range.
for col in metrics_df.columns:
    if col != "Precision":
        max_val = metrics_df[col].max()
        metrics_df[col] = metrics_df[col] / max_val

# Long form: one row per (precision, metric) for the polar plot.
metrics_melted = metrics_df.melt(id_vars="Precision", var_name="Metric", value_name="Value")

fig10 = px.line_polar(metrics_melted, r="Value", theta="Metric", color="Precision", line_close=True,
                     title="🔟 Multi-metric Performance Comparison",
                     range_r=[0, 1])

fig10.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])))
fig10.write_html("visuals/interactive_radar_chart.html")

# ========== 1️⃣1️⃣ Interactive Model Selector Dashboard ==========
# Create a combined dashboard with model selector
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Function to create a model comparison dashboard
def create_model_comparison(model_name):
    """Build a 2x2 dashboard comparing one model across precisions.

    Panels: BLEU by precision, latency by precision, BLEU vs latency, and BLEU
    relative to the model's best precision.

    Reads the module-level ``combined`` DataFrame (columns "Model",
    "Precision", "BLEU Score", "Latency (ms)"). The relative-score panel is
    computed from this model's own rows, so the result does not depend on
    which cell last assigned the global ``pivot_df`` or ``model_sizes``.

    Args:
        model_name: value of the "Model" column to select.

    Returns:
        The plotly figure holding the four subplots.
    """
    # Colour per precision, looked up by label rather than by row position.
    precision_colors = {"FP16": '#1f77b4', "INT8": '#ff7f0e', "INT4": '#2ca02c'}
    fallback_color = '#7f7f7f'  # used for any precision label not listed above

    model_data = combined[combined["Model"] == model_name]
    row_colors = [precision_colors.get(label, fallback_color) for label in model_data["Precision"]]

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=("BLEU Score by Precision", "Latency by Precision",
                       "BLEU vs Latency", "Precision Comparison"),
        specs=[[{"type": "bar"}, {"type": "bar"}],
              [{"type": "scatter"}, {"type": "bar"}]]
    )

    # BLEU Score by Precision
    fig.add_trace(
        go.Bar(x=model_data["Precision"], y=model_data["BLEU Score"], name="BLEU Score",
              marker_color=row_colors),
        row=1, col=1
    )

    # Latency by Precision
    fig.add_trace(
        go.Bar(x=model_data["Precision"], y=model_data["Latency (ms)"], name="Latency",
              marker_color=row_colors),
        row=1, col=2
    )

    # BLEU vs Latency Scatter
    fig.add_trace(
        go.Scatter(x=model_data["Latency (ms)"], y=model_data["BLEU Score"], mode="markers+text",
                  marker=dict(size=12, color=row_colors),
                  text=model_data["Precision"], textposition="top center"),
        row=2, col=1
    )

    # Relative BLEU per precision: first non-missing BLEU value for each
    # precision, in FP16/INT8/INT4 order; a precision the model lacks is NaN.
    precision_order = list(precision_colors)
    bleu_by_precision = (
        model_data.groupby("Precision")["BLEU Score"].first().reindex(precision_order)
    )

    # Skip the panel when the model has no usable BLEU value at all.
    if bleu_by_precision.notna().any():
        relative_scores = bleu_by_precision / bleu_by_precision.max()

        fig.add_trace(
            go.Bar(x=precision_order, y=relative_scores.tolist(),
                  name="Relative Score",
                  marker_color=[precision_colors[label] for label in precision_order],
                  text=[f"{x:.2f}" for x in bleu_by_precision], textposition="auto"),
            row=2, col=2
        )

    # Update layout
    fig.update_layout(height=800, width=1000,
                     title_text=f"1️⃣1️⃣ Detailed Analysis for {model_name}",
                     showlegend=False)

    return fig

# Distinct model names, in order of first appearance in `combined`.
models_list = combined["Model"].unique().tolist()

# Write one stand-alone dashboard HTML file per model.
for model in models_list:
    fig = create_model_comparison(model)
    fig.write_html(f"visuals/interactive_dashboard_{model}.html")

In [4]:
# Radar chart of averaged and hand-entered metrics per precision.
# Order in which the hand-entered factors below are listed.
radar_precisions = ["FP16", "INT8", "INT4"]

# avg_metrics is produced by groupby("Precision"), which returns its rows in
# sorted key order (FP16, INT4, INT8). Re-align it to radar_precisions by label
# so each average is paired with the right precision instead of by position.
avg_by_precision = avg_metrics.set_index("Precision").reindex(radar_precisions)

metrics_df = pd.DataFrame({
    "Precision": radar_precisions,
    "BLEU Score": avg_by_precision["BLEU Score"].tolist(),
    # 1000 / latency in ms; the constant cancels in the normalization below.
    "Speed (1/Latency)": (1000 / avg_by_precision["Latency (ms)"]).tolist(),
    "Memory Efficiency": [1.0, 2.0, 4.0],  # Customize if needed
    "Inference Throughput": [1.0, 1.8, 3.5]
})

# Normalize metrics: divide each metric column by its maximum so values lie in [0, 1].
for col in metrics_df.columns[1:]:
    metrics_df[col] = metrics_df[col] / metrics_df[col].max()

# Long form: one row per (precision, metric) for the polar plot.
metrics_melted = metrics_df.melt(id_vars="Precision", var_name="Metric", value_name="Value")

fig10 = px.line_polar(metrics_melted, r="Value", theta="Metric", color="Precision", line_close=True,
                     title="🔟 Multi-metric Performance Comparison",
                     range_r=[0, 1])

fig10.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])))
fig10.write_html("visuals/interactive_radar_chart.html")

In [5]:
# ========== 8️⃣ Model Size vs. Performance (Actual Implementation) ==========
# Model name -> size in MB, keyed by the names used in the "Model" column.
# NOTE(review): the same size is applied to every precision of a model —
# confirm that is intended.
model_sizes = {
    "qwen2.5-0.5b-instruct": 500,
    "tiny-llama-1b-chat": 1000,
    "DeepSeek-R1-Distill-Qwen-1.5B": 1500,
    "DeepSeek-R1-Distill-Qwen-7B": 7000,
    "qwen2.5-1.5b-instruct": 1500,
    "gemma-2b-it": 2000,
    "gemma-2-2b-it": 2000,
    "qwen2.5-3b-instruct": 3000,
    "minicpm3-4b": 4000,
    "DeepSeek-R1-Distill-Llama-8B": 8000,
    "qwen2.5-7b-instruct": 7000,
    "llama-3.2-1b-instruct": 1000,
    "llama-3.2-3b-instruct": 3000,
    "zephyr-7b-beta": 7000,
    "notus-7b-v1": 7000,
    "gemma-2-9b-it": 9000,
}

# Replace the size column with the values looked up from the mapping above.
combined["Model Size (MB)"] = combined["Model"].map(model_sizes)

# Bubble chart: size on x, BLEU on y, bubble area driven by throughput.
size_hover_columns = ["Model", "Precision", "BLEU Score", "Latency (ms)", "Throughput (tokens/sec)"]
size_axis_labels = {
    "BLEU Score": "BLEU Score",
    "Model Size (MB)": "Model Size (MB)",
    "Throughput (tokens/sec)": "Throughput (tokens/sec)",
}
fig8 = px.scatter(
    combined,
    x="Model Size (MB)",
    y="BLEU Score",
    size="Throughput (tokens/sec)",
    color="Precision",
    symbol="Model",
    hover_data=size_hover_columns,
    title="8️⃣ Model Size vs. Performance Trade-off (Actual Sizes)",
    labels=size_axis_labels,
)

fig8.update_layout(
    xaxis_title="Model Size (MB) - Lower is Better",
    yaxis_title="BLEU Score - Higher is Better",
    height=600,
)

fig8.write_html("visuals/interactive_size_performance_actual.html")

In [6]:
# ========== 2️⃣2️⃣ Throughput vs. Latency Efficiency Frontier ==========
frontier_axis_labels = {
    "Latency (ms)": "Latency (ms) - Lower is Better",
    "Throughput (tokens/sec)": "Throughput (tokens/sec) - Higher is Better",
    "BLEU Score": "BLEU Score",
}
fig22 = px.scatter(
    combined,
    x="Latency (ms)",
    y="Throughput (tokens/sec)",
    color="Model",
    symbol="Precision",
    size="BLEU Score",
    hover_data=["BLEU Score", "ROUGE-L", "CHRF Score"],
    title="2️⃣2️⃣ Throughput vs. Latency Efficiency Frontier",
    labels=frontier_axis_labels,
)

# Join each model's points with a thin dotted line (rows are joined in the
# order they appear in `combined`). Models with a single row get no line.
for model in combined["Model"].unique():
    model_data = combined.loc[combined["Model"] == model]
    if len(model_data) <= 1:
        continue
    connector = go.Scatter(
        x=model_data["Latency (ms)"],
        y=model_data["Throughput (tokens/sec)"],
        mode="lines",
        line=dict(width=1, dash="dot"),
        name=f"{model} Trend",
        showlegend=False,
    )
    fig22.add_trace(connector)

# Both axes are logarithmic so widely different models fit in one view.
fig22.update_layout(height=700, xaxis_type="log", yaxis_type="log")

fig22.write_html("visuals/interactive_throughput_latency_frontier.html")

In [7]:
# ========== 2️⃣3️⃣ Precision Impact on Quality Metrics ==========
# Long form: one row per (model, precision, metric) so each metric can be
# drawn in its own facet.
quality_metrics = ["BLEU Score", "ROUGE-1", "ROUGE-2", "ROUGE-L", "CHRF Score"]
melted_quality = pd.melt(
    combined,
    id_vars=["Model", "Precision"],
    value_vars=quality_metrics,
    var_name="Metric",
    value_name="Score",
)

fig23 = px.box(
    melted_quality,
    x="Precision",
    y="Score",
    color="Precision",
    facet_col="Metric",
    facet_col_wrap=3,
    title="2️⃣3️⃣ Impact of Precision on Different Quality Metrics",
    labels={"Score": "Metric Value", "Precision": "Precision Level"},
)
fig23.update_layout(height=800, showlegend=False)

# Let every facet pick its own y range.
fig23.update_yaxes(matches=None)


def _strip_facet_prefix(annotation):
    """Keep only the text after the last "=" of a facet title ("Metric=X" -> "X")."""
    return annotation.update(text=annotation.text.rsplit("=", 1)[-1])


fig23.for_each_annotation(_strip_facet_prefix)

fig23.write_html("visuals/interactive_precision_impact_quality.html")

In [8]:
# ========== 2️⃣4️⃣ Model Ranking by Multiple Metrics ==========
# Metric -> True when larger values are better, False when smaller are better.
metrics_to_rank = {
    "BLEU Score": True,
    "ROUGE-L": True,
    "CHRF Score": True,
    "Latency (ms)": False,
    "Throughput (tokens/sec)": True,
}


def _min_max(series):
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    highest = series.max()
    return (series - lowest) / (highest - lowest)


# Add a "<metric>_norm" column per metric; lower-is-better metrics are flipped
# so that 1 is always the best value.
ranking_df = combined.copy()
for metric, higher_better in metrics_to_rank.items():
    scaled = _min_max(ranking_df[metric])
    ranking_df[f"{metric}_norm"] = scaled if higher_better else 1 - scaled

# Weight of each normalized metric in the composite score (weights sum to 1).
weights = {
    "BLEU Score_norm": 0.3,
    "ROUGE-L_norm": 0.2,
    "CHRF Score_norm": 0.2,
    "Latency (ms)_norm": 0.15,
    "Throughput (tokens/sec)_norm": 0.15,
}

composite = 0
for col, weight in weights.items():
    composite = composite + ranking_df[col] * weight
ranking_df["Composite Score"] = composite

# Best row per (model, precision): sort best-first, keep the first occurrence.
ranked_desc = ranking_df.sort_values("Composite Score", ascending=False)
top_models = ranked_desc.drop_duplicates(["Model", "Precision"])

# Horizontal bar chart of the composite score per variant.
fig24 = px.bar(
    top_models.sort_values("Composite Score", ascending=True),
    x="Composite Score",
    y="variant_id",
    color="Precision",
    orientation="h",
    hover_data=list(metrics_to_rank.keys()),
    title="2️⃣4️⃣ Model Ranking by Composite Score (Higher is Better)",
    labels={"Composite Score": "Composite Score (0-1)", "variant_id": "Model + Precision"},
)

fig24.update_layout(height=900, yaxis={"categoryorder": "total ascending"})

fig24.write_html("visuals/interactive_model_ranking_composite.html")

In [9]:
# ========== 2️⃣5️⃣ Performance per Parameter Count ==========
# Model name -> parameter count in billions.
param_counts = {
    "qwen2.5-0.5b-instruct": 0.5,
    "tiny-llama-1b-chat": 1.0,
    "DeepSeek-R1-Distill-Qwen-1.5B": 1.5,
    "DeepSeek-R1-Distill-Qwen-7B": 7.0,
    "qwen2.5-1.5b-instruct": 1.5,
    "gemma-2b-it": 2.0,
    "gemma-2-2b-it": 2.0,
    "qwen2.5-3b-instruct": 3.0,
    "minicpm3-4b": 4.0,
    "DeepSeek-R1-Distill-Llama-8B": 8.0,
    "qwen2.5-7b-instruct": 7.0,
    "llama-3.2-1b-instruct": 1.0,
    "llama-3.2-3b-instruct": 3.0,
    "zephyr-7b-beta": 7.0,
    "notus-7b-v1": 7.0,
    "gemma-2-9b-it": 9.0,
}

# Per-row parameter count, then BLEU and throughput divided by it.
billions = combined["Model"].map(param_counts)
combined["Parameter Count (B)"] = billions
combined["BLEU per Billion Params"] = combined["BLEU Score"] / combined["Parameter Count (B)"]
combined["Throughput per Billion Params"] = (
    combined["Throughput (tokens/sec)"] / combined["Parameter Count (B)"]
)

per_param_labels = {
    "Parameter Count (B)": "Model Size (Billion Parameters)",
    "BLEU per Billion Params": "BLEU Score per Billion Parameters",
    "Throughput per Billion Params": "Throughput per Billion Params",
}
fig25 = px.scatter(
    combined,
    x="Parameter Count (B)",
    y="BLEU per Billion Params",
    color="Precision",
    symbol="Model",
    size="Throughput per Billion Params",
    hover_data=["BLEU Score", "Throughput (tokens/sec)", "Latency (ms)"],
    title="2️⃣5️⃣ Performance Efficiency per Billion Parameters",
    labels=per_param_labels,
)

fig25.update_layout(
    height=700,
    xaxis_title="Model Size (Billion Parameters)",
    yaxis_title="BLEU Score per Billion Parameters (Higher is Better)",
)

fig25.write_html("visuals/interactive_performance_per_parameter.html")

In [10]:
# ========== 2️⃣6️⃣ Model Family Performance Comparison ==========
# Extract the model family from the start of the model name: the leading run
# of letters, plus any following "-word" segments made only of letters. A
# segment is accepted only when it is followed by another "-" or the end of
# the name, so version fragments and trailing hyphens are left out, e.g.
#   "gemma-2b-it"                   -> "gemma"       (not "gemma-")
#   "DeepSeek-R1-Distill-Qwen-1.5B" -> "DeepSeek"    (not "DeepSeek-R")
#   "tiny-llama-1b-chat"            -> "tiny-llama"  (not "tiny-llama-")
family_pattern = r'^([A-Za-z]+(?:-[A-Za-z]+(?=-|$))*)'
combined['Model Family'] = combined['Model'].str.extract(family_pattern)[0]

fig26 = px.box(
    combined,
    x="Model Family",
    y="BLEU Score",
    color="Precision",
    title="2️⃣6️⃣ Performance Comparison by Model Family",
    labels={"BLEU Score": "BLEU Score", "Model Family": "Model Family"}
)
fig26.update_layout(xaxis_tickangle=-45)
fig26.write_html("visuals/interactive_model_family_comparison.html")

In [11]:
# ========== 2️⃣7️⃣ Speed-Quality Trade-off Matrix ==========
# One facet per precision; marker colour encodes throughput, marker size
# encodes model size.
matrix_axis_labels = {
    "Latency (ms)": "Latency (ms) - Lower is Better",
    "BLEU Score": "BLEU Score - Higher is Better",
    "Throughput (tokens/sec)": "Throughput",
}
matrix_options = dict(
    x="Latency (ms)",
    y="BLEU Score",
    color="Throughput (tokens/sec)",
    size="Model Size (MB)",
    facet_col="Precision",
    hover_data=["Model", "ROUGE-L", "CHRF Score"],
    title="2️⃣7️⃣ Speed-Quality Trade-off Matrix by Precision",
    labels=matrix_axis_labels,
)
fig27 = px.scatter(combined, **matrix_options)
fig27.update_layout(height=500)
fig27.write_html("visuals/interactive_speed_quality_matrix.html")

In [12]:
# ========== 2️⃣8️⃣ Precision Degradation by Model Size ==========
# Wide table: one row per (model, size), one mean-BLEU column per precision.
pivot_df = combined.pivot_table(index=["Model", "Model Size (MB)"],
                              columns="Precision", values="BLEU Score").reset_index()

# Percentage change of each quantized precision against FP16 (negative = worse).
for precision in ["INT8", "INT4"]:
    pivot_df[f"{precision}_degradation"] = ((pivot_df[precision] - pivot_df["FP16"]) / pivot_df["FP16"] * 100)

# Each model is its own colour trace and contributes a single point, so a
# per-trace trendline has nothing to fit. trendline_scope="overall" fits one
# LOWESS curve through all models, which is the size trend the chart is about.
# NOTE(review): plotly's trendline support relies on the statsmodels package —
# confirm it is installed in this environment.
fig28 = px.scatter(
    pivot_df,
    x="Model Size (MB)",
    y="INT8_degradation",
    size="FP16",
    color="Model",
    trendline="lowess",
    trendline_scope="overall",
    title="2️⃣8️⃣ INT8 Precision Degradation by Model Size",
    labels={
        "Model Size (MB)": "Model Size (MB)",
        "INT8_degradation": "INT8 BLEU Score Degradation (%)",
        "FP16": "FP16 BLEU Score"
    }
)
fig28.write_html("visuals/interactive_degradation_by_size.html")

In [13]:
# ========== 3️⃣0️⃣ Model Efficiency Score ==========
# Composite score from min-max scaled metrics: 40% latency (inverted, lower is
# better), 40% throughput, 20% BLEU.
latency = combined["Latency (ms)"]
throughput = combined["Throughput (tokens/sec)"]
bleu = combined["BLEU Score"]

latency_term = 0.4 * (1 - (latency - latency.min()) / (latency.max() - latency.min()))
throughput_term = 0.4 * ((throughput - throughput.min()) / (throughput.max() - throughput.min()))
# Evaluated as (0.2 * offset) / range, the same order as before the rewrite.
bleu_term = 0.2 * (bleu - bleu.min()) / (bleu.max() - bleu.min())

combined["Efficiency Score"] = latency_term + throughput_term + bleu_term

# Bars ordered from the highest to the lowest score.
variants_by_efficiency = combined.sort_values("Efficiency Score", ascending=False)
fig30 = px.bar(
    variants_by_efficiency,
    x="variant_id",
    y="Efficiency Score",
    color="Precision",
    title="3️⃣0️⃣ Model Efficiency Score (Higher is Better)",
    labels={"variant_id": "Model + Precision", "Efficiency Score": "Efficiency Score (0-1)"},
)
fig30.update_layout(xaxis_tickangle=-90, height=600)
fig30.write_html("visuals/interactive_efficiency_score.html")

In [14]:
# ========== 3️⃣1️⃣ Performance vs. Model Size Growth ==========
# Animated scatter: one frame per precision, with each model tracked across
# frames through animation_group.
growth_options = dict(
    x="Model Size (MB)",
    y="BLEU Score",
    animation_frame="Precision",
    animation_group="Model",
    size="Throughput (tokens/sec)",
    color="Model Family",
    hover_name="Model",
    title="3️⃣1️⃣ Performance vs. Model Size Growth by Precision",
    labels={
        "Model Size (MB)": "Model Size (MB)",
        "BLEU Score": "BLEU Score",
        "Throughput (tokens/sec)": "Throughput",
    },
)
fig31 = px.scatter(combined, **growth_options)
fig31.update_layout(height=700)
fig31.write_html("visuals/interactive_size_growth.html")

In [15]:
# Pairwise correlation between the five quality metrics.
correlated_columns = ["BLEU Score", "ROUGE-1", "ROUGE-2", "ROUGE-L", "CHRF Score"]
corr_matrix = combined[correlated_columns].corr()

# Every unordered pair (i, j) with i < j, in row-major order:
# (0,1) (0,2) (0,3) (0,4) (1,2) (1,3) (1,4) (2,3) (2,4) (3,4).
metric_count = len(correlated_columns)
metric_pairs = [
    (first, second)
    for first in range(metric_count)
    for second in range(first + 1, metric_count)
]

# Sankey diagram: one node per metric, one link per pair; link width is the
# absolute correlation scaled by 10.
sankey_nodes = dict(
    label=["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "CHRF"],
    color="blue",
)
sankey_links = dict(
    source=[first for first, _ in metric_pairs],
    target=[second for _, second in metric_pairs],
    value=[abs(corr_matrix.iloc[first, second]*10) for first, second in metric_pairs],
)
fig32 = go.Figure(go.Sankey(node=sankey_nodes, link=sankey_links))

fig32.update_layout(
    title_text="3️⃣2️⃣ Quality Metrics Correlation Network",
    font_size=12,
    height=600,
)

# Save the visualization
fig32.write_html("visuals/interactive_metrics_network.html")

In [16]:
# ========== 3️⃣3️⃣ Precision Impact on Latency/Throughput ==========
# Two side-by-side box plots sharing the precision axis: latency on the left,
# throughput on the right.
fig33 = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Latency Impact", "Throughput Impact"),
)

precision_axis = combined["Precision"]
for column_number, metric in enumerate(("Latency (ms)", "Throughput (tokens/sec)"), start=1):
    box_trace = go.Box(x=precision_axis, y=combined[metric], name=metric)
    fig33.add_trace(box_trace, row=1, col=column_number)

fig33.update_layout(
    title_text="3️⃣3️⃣ Precision Impact on Latency and Throughput",
    showlegend=False,
    height=500,
)
fig33.write_html("visuals/interactive_precision_impact_latency_throughput.html")

In [17]:
# Metrics for which the best model per precision is looked up.
metrics = ["BLEU Score", "ROUGE-L", "CHRF Score", "Latency (ms)", "Throughput (tokens/sec)"]

# For each metric, pick the winning row of every precision: the minimum for
# latency (lower is better), the maximum for everything else. The per-metric
# frames are collected in a list and concatenated once at the end, instead of
# re-copying a growing DataFrame (seeded with an empty frame) on every pass.
best_frames = []
for metric in metrics:
    per_precision = combined.groupby("Precision")[metric]
    if metric == "Latency (ms)":
        best_index = per_precision.idxmin()
    else:
        best_index = per_precision.idxmax()

    best = combined.loc[best_index].copy()
    best["Metric"] = metric        # which metric this row won
    best["value"] = best[metric]   # the winning value, in one shared column
    best_frames.append(best)

best_models = pd.concat(best_frames, ignore_index=True)

# One facet per metric, bars coloured by the winning model.
fig35 = px.bar(
    best_models,
    x="Precision",
    y="value",
    facet_col="Metric",
    facet_col_wrap=3,
    color="Model",
    title="3️⃣5️⃣ Best Model by Metric and Precision",
    labels={"value": "Metric Value"}
)

# Customize layout
fig35.update_layout(height=800)

# Save the interactive chart
fig35.write_html("visuals/interactive_best_by_metric.html")

In [18]:
# ========== 1️⃣ CHRF Score Comparison ==========
# Box plot of CHRF scores, one box per precision.
chrf_box_options = dict(
    x="Precision",
    y="CHRF Score",
    color="Precision",
    title="CHRF Score Comparison Across Precisions",
    labels={"CHRF Score": "CHRF Score", "Precision": "Precision"},
)
fig_chrf = px.box(combined, **chrf_box_options)
fig_chrf.update_layout(height=500)
fig_chrf.write_html("visuals/interactive_chrf_comparison.html")

In [19]:
# ========== 2️⃣ ROUGE-L Comparison ==========
# Violin per precision with an embedded box and every observation drawn.
rouge_l_violin_options = dict(
    x="Precision",
    y="ROUGE-L",
    color="Precision",
    box=True,
    points="all",
    title="ROUGE-L Comparison Across Precisions",
    labels={"ROUGE-L": "ROUGE-L Score", "Precision": "Precision"},
)
fig_rouge_l = px.violin(combined, **rouge_l_violin_options)
fig_rouge_l.update_layout(height=500)
fig_rouge_l.write_html("visuals/interactive_rouge_l_comparison.html")

In [20]:
# ========== 3️⃣ ROUGE-1 Comparison ==========
# Strip plot of ROUGE-1 per precision, one facet per model family (four
# facets per row); the per-model legend is hidden.
rouge_1_strip_options = dict(
    x="Precision",
    y="ROUGE-1",
    color="Model",
    facet_col="Model Family",
    facet_col_wrap=4,
    title="ROUGE-1 Comparison Across Precisions by Model Family",
    labels={"ROUGE-1": "ROUGE-1 Score", "Precision": "Precision"},
)
fig_rouge_1 = px.strip(combined, **rouge_1_strip_options)
fig_rouge_1.update_layout(height=700, showlegend=False)
fig_rouge_1.write_html("visuals/interactive_rouge_1_comparison.html")

In [21]:
# ========== 4️⃣ ROUGE-2 Comparison ==========
# Grouped bars: one group per model, one bar per precision.
rouge_2_bar_options = dict(
    x="Model",
    y="ROUGE-2",
    color="Precision",
    barmode="group",
    title="ROUGE-2 Comparison Across Precisions by Model",
    labels={"ROUGE-2": "ROUGE-2 Score", "Model": "Model"},
)
fig_rouge_2 = px.bar(combined, **rouge_2_bar_options)
fig_rouge_2.update_layout(xaxis_tickangle=-45, height=600)
fig_rouge_2.write_html("visuals/interactive_rouge_2_comparison.html")

In [22]:
# ========== 4️⃣ ROUGE-2 Comparison ==========
# NOTE(review): this cell repeats the ROUGE-2 chart built in the cell just
# before it and writes to the same HTML file — confirm whether both are needed.
fig_rouge_2 = px.bar(
    combined,
    x="Model",
    y="ROUGE-2",
    color="Precision",
    barmode="group",
    title="ROUGE-2 Comparison Across Precisions by Model",
    labels=dict([("ROUGE-2", "ROUGE-2 Score"), ("Model", "Model")]),
)
fig_rouge_2.update_layout(dict(height=600), xaxis_tickangle=-45)
fig_rouge_2.write_html("visuals/interactive_rouge_2_comparison.html")

In [23]:
# ========== 5️⃣ All Variants CHRF ==========
# Line chart of CHRF per variant, one line per precision.
all_chrf_options = dict(
    x="variant_id",
    y="CHRF Score",
    color="Precision",
    title="CHRF Scores Across All Model Variants",
    labels={"variant_id": "Model + Precision", "CHRF Score": "CHRF Score"},
)
fig_all_chrf = px.line(combined, **all_chrf_options)
fig_all_chrf.update_layout(xaxis_tickangle=-90, height=600)
fig_all_chrf.write_html("visuals/interactive_all_variants_chrf.html")

In [24]:
# ========== 6️⃣ All Variants ROUGE-L ==========
# ROUGE-L per variant; marker size follows the variant's BLEU score.
all_rouge_l_options = dict(
    x="variant_id",
    y="ROUGE-L",
    color="Precision",
    size="BLEU Score",
    title="ROUGE-L Scores Across All Model Variants",
    labels={"variant_id": "Model + Precision", "ROUGE-L": "ROUGE-L Score"},
)
fig_all_rouge_l = px.scatter(combined, **all_rouge_l_options)
fig_all_rouge_l.update_layout(xaxis_tickangle=-90, height=600)
fig_all_rouge_l.write_html("visuals/interactive_all_variants_rouge_l.html")

In [25]:
# ========== 7️⃣ All Variants ROUGE-1 ==========
# Bars are fed to plotly sorted from the highest to the lowest ROUGE-1 score.
variants_by_rouge_1 = combined.sort_values("ROUGE-1", ascending=False)
fig_all_rouge_1 = px.bar(
    variants_by_rouge_1,
    x="variant_id",
    y="ROUGE-1",
    color="Precision",
    title="ROUGE-1 Scores Across All Model Variants (Sorted)",
    labels={"variant_id": "Model + Precision", "ROUGE-1": "ROUGE-1 Score"},
)
fig_all_rouge_1.update_layout(xaxis_tickangle=-90, height=600)
fig_all_rouge_1.write_html("visuals/interactive_all_variants_rouge_1.html")

In [26]:
# ========== 8️⃣ All Variants ROUGE-2 ==========
# ROUGE-2 against model size: colour = precision, symbol = model family,
# marker size = throughput.
rouge_2_size_labels = {
    "Model Size (MB)": "Model Size (MB)",
    "ROUGE-2": "ROUGE-2 Score",
    "Model Family": "Model Family",
}
fig_all_rouge_2 = px.scatter(
    combined,
    x="Model Size (MB)",
    y="ROUGE-2",
    color="Precision",
    symbol="Model Family",
    size="Throughput (tokens/sec)",
    title="ROUGE-2 Scores vs Model Size",
    labels=rouge_2_size_labels,
)
fig_all_rouge_2.update_layout(height=600)
fig_all_rouge_2.write_html("visuals/interactive_all_variants_rouge_2.html")

In [27]:
# ========== 🔟 CHRF Degradation ==========
# Wide table: one row per model, one mean-CHRF column per precision.
pivot_df = combined.pivot_table(
    index="Model",
    columns="Precision",
    values="CHRF Score",
).reset_index()

# Percentage change of INT8 / INT4 against the FP16 column.
fp16_chrf = pivot_df["FP16"]
for precision in ("INT8", "INT4"):
    pivot_df[f"{precision}_degradation"] = (pivot_df[precision] - fp16_chrf) / fp16_chrf * 100

# Long form: one row per (model, comparison).
chrf_degradation_long = pivot_df.melt(
    id_vars="Model",
    value_vars=["INT8_degradation", "INT4_degradation"],
    var_name="Comparison",
    value_name="Degradation (%)",
)

fig_chrf_degradation = px.bar(
    chrf_degradation_long,
    x="Model",
    y="Degradation (%)",
    color="Comparison",
    barmode="group",
    title="CHRF Score Degradation Relative to FP16",
    labels={"Degradation (%)": "% Change from FP16 (negative = worse)"},
)
fig_chrf_degradation.update_layout(xaxis_tickangle=-45, height=600)
fig_chrf_degradation.write_html("visuals/interactive_chrf_degradation.html")

In [28]:
# ========== 1️⃣1️⃣ ROUGE-L Degradation ==========
# Wide table: one row per model, one mean-ROUGE-L column per precision.
pivot_df = combined.pivot_table(
    index="Model",
    columns="Precision",
    values="ROUGE-L",
).reset_index()

# Percentage change of INT8 / INT4 against the FP16 column.
fp16_rouge_l = pivot_df["FP16"]
for precision in ("INT8", "INT4"):
    pivot_df[f"{precision}_degradation"] = (pivot_df[precision] - fp16_rouge_l) / fp16_rouge_l * 100

# Long form: one row per (model, comparison).
rouge_l_degradation_long = pivot_df.melt(
    id_vars="Model",
    value_vars=["INT8_degradation", "INT4_degradation"],
    var_name="Comparison",
    value_name="Degradation (%)",
)

fig_rouge_l_degradation = px.bar(
    rouge_l_degradation_long,
    x="Model",
    y="Degradation (%)",
    color="Comparison",
    barmode="group",
    title="ROUGE-L Score Degradation Relative to FP16",
    labels={"Degradation (%)": "% Change from FP16 (negative = worse)"},
)
fig_rouge_l_degradation.update_layout(xaxis_tickangle=-45, height=600)
fig_rouge_l_degradation.write_html("visuals/interactive_rouge_l_degradation.html")

In [29]:
# ========== 1️⃣2️⃣ ROUGE-1 Degradation ==========
# Wide table: one row per model, one mean-ROUGE-1 column per precision.
pivot_df = combined.pivot_table(
    index="Model",
    columns="Precision",
    values="ROUGE-1",
).reset_index()

# Percentage change of INT8 / INT4 against the FP16 column.
fp16_rouge_1 = pivot_df["FP16"]
for precision in ("INT8", "INT4"):
    pivot_df[f"{precision}_degradation"] = (pivot_df[precision] - fp16_rouge_1) / fp16_rouge_1 * 100

# Long form: one row per (model, comparison).
rouge_1_degradation_long = pivot_df.melt(
    id_vars="Model",
    value_vars=["INT8_degradation", "INT4_degradation"],
    var_name="Comparison",
    value_name="Degradation (%)",
)

fig_rouge_1_degradation = px.bar(
    rouge_1_degradation_long,
    x="Model",
    y="Degradation (%)",
    color="Comparison",
    barmode="group",
    title="ROUGE-1 Score Degradation Relative to FP16",
    labels={"Degradation (%)": "% Change from FP16 (negative = worse)"},
)
fig_rouge_1_degradation.update_layout(xaxis_tickangle=-45, height=600)
fig_rouge_1_degradation.write_html("visuals/interactive_rouge_1_degradation.html")

In [30]:
# ========== 1️⃣3️⃣ ROUGE-2 Degradation ==========
# Wide table: one row per model, one mean-ROUGE-2 column per precision.
pivot_df = combined.pivot_table(
    index="Model",
    columns="Precision",
    values="ROUGE-2",
).reset_index()

# Percentage change of INT8 / INT4 against the FP16 column.
fp16_rouge_2 = pivot_df["FP16"]
for precision in ("INT8", "INT4"):
    pivot_df[f"{precision}_degradation"] = (pivot_df[precision] - fp16_rouge_2) / fp16_rouge_2 * 100

# Long form: one row per (model, comparison).
rouge_2_degradation_long = pivot_df.melt(
    id_vars="Model",
    value_vars=["INT8_degradation", "INT4_degradation"],
    var_name="Comparison",
    value_name="Degradation (%)",
)

fig_rouge_2_degradation = px.bar(
    rouge_2_degradation_long,
    x="Model",
    y="Degradation (%)",
    color="Comparison",
    barmode="group",
    title="ROUGE-2 Score Degradation Relative to FP16",
    labels={"Degradation (%)": "% Change from FP16 (negative = worse)"},
)
fig_rouge_2_degradation.update_layout(xaxis_tickangle=-45, height=600)
fig_rouge_2_degradation.write_html("visuals/interactive_rouge_2_degradation.html")

In [31]:
# ========== 1️⃣4️⃣ Improved Performance vs Efficiency ==========
# Facet columns, left to right, in order of first appearance in `combined`.
# The same list is passed to plotly and used below to place the connector
# lines, so the two cannot drift apart.
precision_facets = combined["Precision"].unique().tolist()

fig_perf_eff = px.scatter(
    combined,
    x="Latency (ms)",
    y="BLEU Score",
    color="Model Family",
    symbol="Precision",
    size="Throughput (tokens/sec)",
    facet_col="Precision",
    category_orders={"Precision": precision_facets},
    hover_data=["Model", "ROUGE-L", "CHRF Score"],
    title="Performance vs Efficiency by Precision",
    labels={
        "Latency (ms)": "Latency (ms) - Lower is Better",
        "BLEU Score": "BLEU Score - Higher is Better",
        "Throughput (tokens/sec)": "Throughput"
    }
)

# Connect the models of each family inside every precision facet with a thin
# dotted line. Points are sorted by latency first so the line runs left to
# right instead of zigzagging in row order.
for family in combined["Model Family"].unique():
    for facet_number, precision in enumerate(precision_facets, start=1):
        in_facet = (combined["Model Family"] == family) & (combined["Precision"] == precision)
        subset = combined[in_facet].sort_values("Latency (ms)")
        if len(subset) > 1:  # a single point has nothing to connect
            fig_perf_eff.add_trace(
                go.Scatter(
                    x=subset["Latency (ms)"],
                    y=subset["BLEU Score"],
                    mode="lines",
                    line=dict(width=1, dash="dot"),
                    name=f"{family} Trend",
                    showlegend=False,
                    legendgroup=family
                ),
                row=1, col=facet_number
            )

# Horizontal legend centred below the plot area.
fig_perf_eff.update_layout(
    height=500,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.3,
        xanchor="center",
        x=0.5
    )
)
fig_perf_eff.write_html("visuals/interactive_performance_efficiency_improved.html")

In [32]:
# ========== 1️⃣5️⃣ Entropy Comparison ==========
# Box plot of the "Entropy" column per model family, split by precision, with
# every observation drawn.
entropy_axis_title = "Output Entropy (Higher = More Diverse)"
fig_entropy = px.box(
    combined,
    x="Model Family",
    y="Entropy",
    color="Precision",
    points="all",
    title="Output Entropy Comparison by Model Family and Precision",
    labels={"Entropy": entropy_axis_title, "Model Family": "Model Family"},
)

fig_entropy.update_layout(
    xaxis_tickangle=-45,
    height=600,
    yaxis_title=entropy_axis_title,
)
fig_entropy.write_html("visuals/interactive_entropy_comparison.html")