# Standard vs COD vs COT

In [5]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the data
df = pd.read_csv('test_results/model_comparison_summary_20250314_231521.csv')

# Filter out rows with error_rate = 100.0 (failed runs)
df = df[df['error_rate'] == 0.0]

# Create a box plot for accuracy by instruction type
fig1 = px.box(df, x="instruction_type", y="accuracy", 
             color="instruction_type",
             title="Model Accuracy by Instruction Type",
             labels={"instruction_type": "Instruction Type", 
                    "accuracy": "Accuracy (%)"},
             category_orders={"instruction_type": ["standard_instruction", 
                                                  "cod_instruction", 
                                                  "cot_instruction"]},
             color_discrete_map={"standard_instruction": "#1f77b4", 
                                "cod_instruction": "#ff7f0e", 
                                "cot_instruction": "#2ca02c"})

fig1.update_layout(boxmode='group', 
                  xaxis_title="Instruction Type",
                  yaxis_title="Accuracy (%)",
                  legend_title="Instruction Type",
                  font=dict(size=12))

fig1.show()

# Create a grouped bar chart for each model's performance across instruction types
fig2 = px.bar(df, x="model_name", y="accuracy", 
             color="instruction_type",
             barmode="group",
             title="Model Performance by Instruction Type",
             labels={"model_name": "Model", 
                    "accuracy": "Accuracy (%)", 
                    "instruction_type": "Instruction Type"},
             category_orders={"instruction_type": ["standard_instruction", 
                                                  "cod_instruction", 
                                                  "cot_instruction"]},
             color_discrete_map={"standard_instruction": "#1f77b4", 
                                "cod_instruction": "#ff7f0e", 
                                "cot_instruction": "#2ca02c"})

fig2.update_layout(xaxis_title="Model",
                  yaxis_title="Accuracy (%)",
                  legend_title="Instruction Type",
                  xaxis_tickangle=-45,
                  font=dict(size=12))

fig2.show()

# Create a scatter plot to visualize the relationship between reasoning length and accuracy
fig3 = px.scatter(df, x="reasoning_length", y="accuracy", 
                 color="instruction_type", 
                 size="inference_time",
                 hover_data=["model_name"],
                 title="Reasoning Length vs. Accuracy by Instruction Type",
                 labels={"reasoning_length": "Reasoning Length", 
                        "accuracy": "Accuracy (%)", 
                        "inference_time": "Inference Time (s)"},
                 category_orders={"instruction_type": ["standard_instruction", 
                                                      "cod_instruction", 
                                                      "cot_instruction"]},
                 color_discrete_map={"standard_instruction": "#1f77b4", 
                                    "cod_instruction": "#ff7f0e", 
                                    "cot_instruction": "#2ca02c"})

fig3.update_layout(xaxis_title="Reasoning Length",
                  yaxis_title="Accuracy (%)",
                  legend_title="Instruction Type",
                  font=dict(size=12))

fig3.show()

# Create a combined visualization with subplots
fig4 = make_subplots(rows=2, cols=2, 
                    subplot_titles=("Accuracy by Instruction Type", 
                                   "Reasoning Length by Instruction Type",
                                   "Inference Time by Instruction Type",
                                   "Accuracy vs. Reasoning Length"),
                    specs=[[{"type": "box"}, {"type": "box"}],
                          [{"type": "box"}, {"type": "scatter"}]])

# Add box plot for accuracy
for i, inst_type in enumerate(["standard_instruction", "cod_instruction", "cot_instruction"]):
    subset = df[df['instruction_type'] == inst_type]
    fig4.add_trace(
        go.Box(y=subset['accuracy'], name=inst_type.replace('_instruction', ''),
              marker_color=['#1f77b4', '#ff7f0e', '#2ca02c'][i]),
        row=1, col=1
    )

# Add box plot for reasoning length
for i, inst_type in enumerate(["standard_instruction", "cod_instruction", "cot_instruction"]):
    subset = df[df['instruction_type'] == inst_type]
    fig4.add_trace(
        go.Box(y=subset['reasoning_length'], name=inst_type.replace('_instruction', ''),
              marker_color=['#1f77b4', '#ff7f0e', '#2ca02c'][i]),
        row=1, col=2
    )

# Add box plot for inference time
for i, inst_type in enumerate(["standard_instruction", "cod_instruction", "cot_instruction"]):
    subset = df[df['instruction_type'] == inst_type]
    fig4.add_trace(
        go.Box(y=subset['inference_time'], name=inst_type.replace('_instruction', ''),
              marker_color=['#1f77b4', '#ff7f0e', '#2ca02c'][i]),
        row=2, col=1
    )

# Add scatter plot for accuracy vs reasoning length
for i, inst_type in enumerate(["standard_instruction", "cod_instruction", "cot_instruction"]):
    subset = df[df['instruction_type'] == inst_type]
    fig4.add_trace(
        go.Scatter(x=subset['reasoning_length'], y=subset['accuracy'], 
                  mode='markers',
                  name=inst_type.replace('_instruction', ''),
                  marker=dict(color=['#1f77b4', '#ff7f0e', '#2ca02c'][i], 
                             size=subset['inference_time']*3),
                  hovertemplate='<b>%{text}</b><br>Accuracy: %{y}<br>Reasoning Length: %{x}<extra></extra>',
                  text=subset['model_name']),
        row=2, col=2
    )

fig4.update_layout(height=800, width=1000, 
                  title_text="Comparison of Instruction Types Across Models",
                  boxmode='group',
                  showlegend=False)

# Update axis labels
fig4.update_yaxes(title_text="Accuracy (%)", row=1, col=1)
fig4.update_yaxes(title_text="Reasoning Length", row=1, col=2)
fig4.update_yaxes(title_text="Inference Time (s)", row=2, col=1)
fig4.update_yaxes(title_text="Accuracy (%)", row=2, col=2)
fig4.update_xaxes(title_text="Instruction Type", row=1, col=1)
fig4.update_xaxes(title_text="Instruction Type", row=1, col=2)
fig4.update_xaxes(title_text="Instruction Type", row=2, col=1)
fig4.update_xaxes(title_text="Reasoning Length", row=2, col=2)

fig4.show()

# Create a heatmap to visualize model performance across instruction types
pivot_df = df.pivot_table(values='accuracy', 
                         index='model_name', 
                         columns='instruction_type')

# Reorder columns
pivot_df = pivot_df[['standard_instruction', 'cod_instruction', 'cot_instruction']]

# Rename columns for better readability
pivot_df.columns = ['Standard', 'CoD', 'CoT']

fig5 = px.imshow(pivot_df,
                text_auto=True,
                aspect="auto",
                color_continuous_scale='RdYlGn',
                title="Heatmap of Model Accuracy by Instruction Type",
                labels=dict(x="Instruction Type", y="Model", color="Accuracy (%)"))

fig5.update_layout(xaxis_title="Instruction Type",
                  yaxis_title="Model",
                  font=dict(size=12))

fig5.show()

# Create a figure with two subplots for accuracy and completion rate
fig6 = make_subplots(rows=1, cols=2, 
                    subplot_titles=("Model Accuracy by Instruction Type", 
                                   "Model Completion Rate by Instruction Type"),
                    specs=[[{"type": "bar"}, {"type": "bar"}]])

# Load the original data without filtering out error_rate rows
df_full = pd.read_csv('test_results/model_comparison_summary_20250314_231521.csv')

# Create pivot tables for both metrics
accuracy_pivot = df_full.pivot_table(
    values='accuracy',
    index='model_name',
    columns='instruction_type'
)

completion_pivot = df_full.pivot_table(
    values='completion_rate',
    index='model_name',
    columns='instruction_type'
)

# Reorder columns for consistency
column_order = ['standard_instruction', 'cod_instruction', 'cot_instruction']
accuracy_pivot = accuracy_pivot[column_order]
completion_pivot = completion_pivot[column_order]

# Define colors for instruction types
colors = {'standard_instruction': '#1f77b4', 
          'cod_instruction': '#ff7f0e', 
          'cot_instruction': '#2ca02c'}

# Add accuracy bars
for i, col in enumerate(column_order):
    fig6.add_trace(
        go.Bar(
            x=accuracy_pivot.index,
            y=accuracy_pivot[col],
            name=col.replace('_instruction', ''),
            marker_color=colors[col],
            legendgroup=col,
            showlegend=True
        ),
        row=1, col=1
    )

# Add completion rate bars
for i, col in enumerate(column_order):
    fig6.add_trace(
        go.Bar(
            x=completion_pivot.index,
            y=completion_pivot[col],
            name=col.replace('_instruction', ''),
            marker_color=colors[col],
            legendgroup=col,
            showlegend=False
        ),
        row=1, col=2
    )

# Add annotation for qwq:latest with CoT instruction
fig6.add_annotation(
    x="qwq:latest", 
    y=97.5,
    text="97.5% completion rate<br>with 2.5% error rate",
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-40,
    bgcolor="rgba(255, 255, 200, 0.8)",
    bordercolor="black",
    borderwidth=1,
    row=1, col=2
)

# Update layout
fig6.update_layout(
    title_text="Model Performance: Accuracy vs. Completion Rate",
    barmode='group',
    height=600,
    width=1200,
    legend_title="Instruction Type",
    font=dict(size=12)
)

# Set y-axis ranges
fig6.update_yaxes(title_text="Accuracy (%)", range=[0, 100], row=1, col=1)
fig6.update_yaxes(title_text="Completion Rate (%)", range=[95, 100.5], row=1, col=2)

# Update x-axis labels
fig6.update_xaxes(title_text="Model", tickangle=-45, row=1, col=1)
fig6.update_xaxes(title_text="Model", tickangle=-45, row=1, col=2)

fig6.show()

# Optional: Create a combined heatmap that shows both metrics
# Create a figure with two subplots for heatmaps
fig7 = make_subplots(rows=1, cols=2, 
                    subplot_titles=("Model Accuracy (%)", 
                                   "Model Completion Rate (%)"),
                    specs=[[{"type": "heatmap"}, {"type": "heatmap"}]])

# Rename columns for better readability
accuracy_pivot.columns = ['Standard', 'CoD', 'CoT']
completion_pivot.columns = ['Standard', 'CoD', 'CoT']

# Add accuracy heatmap
fig7.add_trace(
    go.Heatmap(
        z=accuracy_pivot.values,
        x=accuracy_pivot.columns,
        y=accuracy_pivot.index,
        colorscale='RdYlGn',
        text=accuracy_pivot.values.round(1),
        texttemplate="%{text}",
        colorbar=dict(title="Accuracy (%)", x=-0.05),
        zmin=40, zmax=100
    ),
    row=1, col=1
)

# Add completion rate heatmap
fig7.add_trace(
    go.Heatmap(
        z=completion_pivot.values,
        x=completion_pivot.columns,
        y=completion_pivot.index,
        colorscale='Blues',
        text=completion_pivot.values.round(1),
        texttemplate="%{text}",
        colorbar=dict(title="Completion Rate (%)"),
        zmin=95, zmax=100
    ),
    row=1, col=2
)

# Update layout
fig7.update_layout(
    title_text="Model Performance Comparison: Accuracy vs. Completion Rate",
    height=500,
    width=1200,
    font=dict(size=12)
)

fig7.show()

In [4]:
pivot_df

Unnamed: 0_level_0,Standard,CoD,CoT
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-r1:14b,55.0,62.5,62.5
llama3.1:8b,55.0,52.5,62.5
llama3.2:latest,45.0,45.0,52.5
phi4:latest,77.5,77.5,77.5
qwen2.5:14b,72.5,75.0,75.0
qwq:latest,80.0,80.0,
