To test: bigger range of edge values; 1 central node; different params: density, ...; different graph families; how algorithms behave for different terminals (maybe one of them gives more consistant results than other?)

###USE IT FOR GENERATED GRAPHS###

In [146]:
cimport os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Folder paths
folder_path = './results'
# folder_path = './results/GraphInstances/B'
output_folder = os.path.join(folder_path, 'plots')

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Function to create and save plots for costs and times
def plot_data(df, file_name):
    # Plot Costs by Number of Terminals
    plt.figure(figsize=(12, 6))
    for y in ["TakahashiMatsuyamaCost", "KouMarkowskyBermanCost"]:
        sns.lineplot(data=df, x="NumberOfTerminals", y=y, label=y)
    plt.title(f"Costs vs. Number of Terminals ({file_name})")
    plt.xlabel("Number of Terminals")
    plt.ylabel("Cost")
    plt.legend()
    output_file = os.path.join(output_folder, f"{file_name}_terminals_cost.png")
    plt.savefig(output_file)
    plt.close()

    # Plot Times by Number of Terminals
    plt.figure(figsize=(12, 6))
    for y in ["TakahashiMatsuyamaTime", "KouMarkowskyBermanTime"]:
        sns.lineplot(data=df, x="NumberOfTerminals", y=y, label=y)
    plt.title(f"Times vs. Number of Terminals ({file_name})")
    plt.xlabel("Number of Terminals")
    plt.ylabel("Time")
    plt.legend()
    output_file = os.path.join(output_folder, f"{file_name}_terminals_time.png")
    plt.savefig(output_file)
    plt.close()

    # Plot Costs by Number of Nodes
    plt.figure(figsize=(12, 6))
    for y in ["TakahashiMatsuyamaCost", "KouMarkowskyBermanCost"]:
        sns.lineplot(data=df, x="NumberOfNodes", y=y, label=y)
    plt.title(f"Costs vs. Number of Nodes ({file_name})")
    plt.xlabel("Number of Nodes")
    plt.ylabel("Cost")
    plt.legend()
    output_file = os.path.join(output_folder, f"{file_name}_nodes_cost.png")
    plt.savefig(output_file)
    plt.close()

    # Plot Times by Number of Nodes
    plt.figure(figsize=(12, 6))
    for y in ["TakahashiMatsuyamaTime", "KouMarkowskyBermanTime"]:
        sns.lineplot(data=df, x="NumberOfNodes", y=y, label=y)
    plt.title(f"Times vs. Number of Nodes ({file_name})")
    plt.xlabel("Number of Nodes")
    plt.ylabel("Time")
    plt.legend()
    output_file = os.path.join(output_folder, f"{file_name}_nodes_time.png")
    plt.savefig(output_file)
    plt.close()

# Iterate over all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.results'c):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        print(f"Processing file: {filename}")
        plot_data(df, filename.split('.')[0])  # Pass filename without extension


SyntaxError: invalid syntax (3926239588.py, line 1)

###USE IT FOR DATA BASE GRAPHS###

In [188]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import re

In [189]:
def preprocess_label(label):  
    # Add spaces before capital letters (if not already present)
    label = re.sub(r'(?<!\s)(?=[A-Z])', ' ', label)
    
    # Replace long names with abbreviations
    label = label.replace("Takahashi Matsuyama", "TM ")
    label = label.replace("T M", "TM ")
    label = label.replace("Dreyfus Wagner", "DW  ")
    label = label.replace("D W", "DW  ")
    label = label.replace("Kou Markowsky Berman", "KMB  ")
    label = label.replace("K M B", "KMB  ")

    # Replace ...
    label = label.replace("Cost_ratio", "Approx-ratio ")
    label = label.replace("worst case", "Worst Case")
    label = label.replace("_", " ")

    # Remove "Time" from the label
    label = label.replace("Time", "")
    label = label.replace("duration", "")
    label = label.replace("TimeToPerformance", "")

    # Remove extra spaces (if any)
    label = re.sub(r'\s+', ' ', label).strip()
    
    return label

In [190]:
def get_subdirectory_from_path(path):
    # Extract the last part of the path (i.e., the folder name)
    return os.path.basename(os.path.normpath(path))   

In [191]:
def split_after_second_space(input_string):
    # Step 1: Insert a space before each capital letter
    modified_string = re.sub(r'([a-z])([A-Z])', r'\1 \2', input_string)
    
    # Step 2: Split the string by spaces
    split_parts = modified_string.split()
    
    # Step 3: Return the string after the second space (i.e., index 2 onward)
    result = " ".join(split_parts[2:])
    return result[0].lower() + result[1:]

In [192]:
def create_approximation_ratio_table(df, x_col, y_cols, opt_col, output_folder="./tables"):
    """
    Generate a table for approximation ratios relative to `opt_col`.

    Parameters:
        df (pd.DataFrame): The input data.
        x_col (str): The independent variable.
        y_cols (list[str]): List of dependent variables for which to calculate approximation ratios.
        opt_col (str): The column representing the "Opt" value.
        output_folder (str): Path to save the LaTeX table.

    Returns:
        pd.DataFrame: Aggregated per-x_col approximation ratios.
        pd.DataFrame: Overall approximation ratio summary.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Calculate approximation ratios
    approx_ratios = df.copy()
    for col in y_cols:
        if col != "worst_case":
            approx_ratios[f"{col}_ratio"] = approx_ratios[col] / approx_ratios[opt_col]
    
    # Per-x_col statistics
    agg_funcs = {f"{col}_ratio": ["mean", "min", "max"] for col in y_cols if col != "worst_case"}
    agg_funcs["worst_case"] = ["mean", "min", "max"]
    per_x_stats = approx_ratios.groupby(x_col).agg(agg_funcs)
    per_x_stats.columns = ["_".join(col).strip() for col in per_x_stats.columns]
    per_x_stats = per_x_stats.reset_index()
    
    # Apply preprocessing to column names in per_x_stats
    per_x_stats.columns = [preprocess_label(col) for col in per_x_stats.columns]
                           
    # Overall statistics
    overall_stats = {}
    for col in y_cols:
        if col != "worst_case":
            ratio_col = f"{col}_ratio"
            overall_stats[ratio_col] = {
                "mean": approx_ratios[ratio_col].mean(),
                "min": approx_ratios[ratio_col].min(),
                "max": approx_ratios[ratio_col].max(),
            }
    # Include overall statistics for worst-case ratio
    overall_stats["worst_case"] = {
        "mean": approx_ratios["worst_case"].mean(),
        "min": approx_ratios["worst_case"].min(),
        "max": approx_ratios["worst_case"].max(),
    }
    overall_stats_df = pd.DataFrame(overall_stats).T.reset_index()
    overall_stats_df.columns = ["Metric", "Mean", "Min", "Max"]

    # Apply preprocessing to the "Metric" column
    overall_stats_df["Metric"] = overall_stats_df["Metric"].apply(preprocess_label)


    print(overall_stats_df)
    # Apply preprocessing to column names in overall_stats_df
    overall_stats_df.columns = [preprocess_label(col) for col in overall_stats_df.columns]

    # Save per-x_col approximation ratios to a LaTeX table
    per_x_table_path = os.path.join(output_folder, f"approx_ratios_per_x_{x_col}.tex")
    with open(per_x_table_path, "w") as f:
        f.write(per_x_stats.to_latex(index=False, float_format="%.3f"))

    # Save overall approximation ratios to a LaTeX table
    overall_table_path = os.path.join(output_folder, f"approx_ratios_overall_{x_col}.tex")
    with open(overall_table_path, "w") as f:
        f.write(overall_stats_df.to_latex(index=False, float_format="%.3f"))

    print(f"Approximation ratio tables saved: {per_x_table_path}, {overall_table_path}")
    return per_x_stats, overall_stats_df

In [193]:
def create_summary_tables(df, x_col, y_cols, opt_col=None, output_folder="./tables"):
    """
    Generate summary statistics tables for y_cols and opt_col, including standard deviation of differences.

    Parameters:
        df (pd.DataFrame): The input data.
        x_col (str): The independent variable.
        y_cols (list[str]): List of dependent variables to process.
        opt_col (str, optional): The column for "Opt" values, if available.
        output_folder (str): Path to save the generated LaTeX tables.
    """
    import os
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Compute per-x_col statistics
    agg_funcs = {col: ["mean", "min", "max"] for col in y_cols}

    if opt_col:
        agg_funcs[opt_col] = ["mean", "min", "max"]
    
    per_x_stats = df.groupby(x_col).agg(agg_funcs)
    per_x_stats.columns = ["_".join(col).strip() for col in per_x_stats.columns]
    per_x_stats = per_x_stats.reset_index()
    
    # Apply preprocessing to column names in per_x_stats
    per_x_stats.columns = [preprocess_label(col) for col in per_x_stats.columns]

    # Compute overall statistics across all x_col values
    overall_stats = {}
    for col in y_cols + ([opt_col] if opt_col else []):
        overall_stats[col] = {
            "mean": df[col].mean(),
            "min": df[col].min(),
            "max": df[col].max(),
        }

    overall_stats_df = pd.DataFrame(overall_stats).T.reset_index()
    overall_stats_df.columns = ["Metric", "Mean", "Min", "Max"]

    # Apply preprocessing to the "Metric" column
    overall_stats_df["Metric"] = overall_stats_df["Metric"].apply(preprocess_label)

    # Apply preprocessing to column names in overall_stats_df
    overall_stats_df.columns = [preprocess_label(col) for col in overall_stats_df.columns]

    # Save per-x_col statistics to a LaTeX table
    per_x_table_path = os.path.join(output_folder, f"per_x_stats_{x_col}.tex")
    with open(per_x_table_path, "w") as f:
        f.write(per_x_stats.to_latex(index=False, float_format="%.3f"))

    # Save overall statistics to a LaTeX table
    overall_table_path = os.path.join(output_folder, f"overall_stats_{x_col}.tex")
    with open(overall_table_path, "w") as f:
        f.write(overall_stats_df.to_latex(index=False, float_format="%.3f"))

    print(f"Summary tables saved: {per_x_table_path}, {overall_table_path}")
    return per_x_stats, overall_stats_df


In [194]:
# Plotting
def plot_data(df, x_col, y_cols, title, x_label, y_label, output_file=None, opt_col=None, log_scale=False):
    # Aggregate data by x_col (compute mean, min, max for each x)
    agg_funcs = {col: ["mean", "min", "max"] for col in y_cols}
    if opt_col:
        agg_funcs[opt_col] = ["mean", "min", "max"]

    # Group by x_col and compute aggregated values
    aggregated_df = df.groupby(x_col).agg(agg_funcs)
    aggregated_df.columns = ["_".join(col).strip() for col in aggregated_df.columns]
    aggregated_df = aggregated_df.reset_index()

    # Compute the worst-case values
    if opt_col and "worst_case" in y_cols:
        aggregated_df["worst_case_mean"] *= aggregated_df[f"{opt_col}_mean"]
        aggregated_df["worst_case_min"] *= aggregated_df[f"{opt_col}_min"]
        aggregated_df["worst_case_max"] *= aggregated_df[f"{opt_col}_max"]
            

    plt.figure(figsize=(12, 6))

    # Generate up to 10 distinct colors using a colormap
    colormap = plt.get_cmap("tab10", 10)  # Use "tab10" for distinct qualitative colors
    color_list = [colormap(i) for i in range(10)]  # Generate a list of RGBA colors

    y_cols_filtered = [col for col in y_cols if col not in ["worst_case"]]
    for i, y in enumerate(y_cols_filtered):
        # Assign a color based on the index (cycling if more than 10 functions)
        base_color = color_list[i % 10]
        
        # Generate colors for mean, min, and max
        mean_color = base_color  # Base color for the mean
        min_color = (base_color[0], base_color[1], base_color[2], 0.7)  # Adjust alpha for transparency
        max_color = (base_color[0], base_color[1], base_color[2], 1.0)  # Keep max fully opaque
        
        # Columns for aggregated data
        mean_col = f"{y}_mean"
        min_col = f"{y}_min"
        max_col = f"{y}_max"
        
        # Plot mean line
        sns.lineplot(data=aggregated_df, x=x_col, y=mean_col, label=f"{preprocess_label(y)} (Mean)", color=mean_color, errorbar=None)
        
        # Plot scatter points for min and max with their respective colors
        plt.scatter(aggregated_df[x_col], aggregated_df[min_col], color=min_color, label=f"{preprocess_label(y)} (Min)", alpha=0.7)
        plt.scatter(aggregated_df[x_col], aggregated_df[max_col], color=max_color, label=f"{preprocess_label(y)} (Max)", alpha=0.7)
    
    # Plot Opt column if available
    if opt_col:
        # Assign a unique color for the `opt_col` using an additional color index
        opt_color = color_list[len(y_cols) % 10]  # Next color in the sequence
    
        # Generate colors for mean, min, and max
        opt_mean_color = opt_color  # Base color for the mean
        opt_min_color = (opt_color[0], opt_color[1], opt_color[2], 0.7)  # Adjust alpha for transparency
        opt_max_color = (opt_color[0], opt_color[1], opt_color[2], 1.0)  # Keep max fully opaque
    
        # Columns for aggregated data
        opt_mean_col = f"{opt_col}_mean"
        opt_min_col = f"{opt_col}_min"
        opt_max_col = f"{opt_col}_max"
    
        # Plot mean line
        sns.lineplot(data=aggregated_df, x=x_col, y=opt_mean_col, label=f"{opt_col} (Mean)", linestyle="--", color=opt_mean_color, errorbar=None)
    
        # Plot scatter points for min and max with their respective colors
        plt.scatter(aggregated_df[x_col], aggregated_df[opt_min_col], color=opt_min_color, label=f"{opt_col} (Min)", alpha=0.7)
        plt.scatter(aggregated_df[x_col], aggregated_df[opt_max_col], color=opt_max_color, label=f"{opt_col} (Max)", alpha=0.7)

    # print(aggregated_df)
    # Check if "worst_case" is in y_cols
    if "worst_case" in y_cols:
        # Plot the worst-case mean line
        sns.lineplot(
            data=aggregated_df,
            x=x_col,
            y="worst_case_mean",
            label="Worst Case (Mean)",
            color="black",
            linestyle="-",
            linewidth=2.5,
            errorbar=None
        )
        
        # Plot scatter points for worst-case min and max
        plt.scatter(
            aggregated_df[x_col],
            aggregated_df["worst_case_min"],
            color="gray",
            label="Worst Case (Min)",
            alpha=0.7
        )
        plt.scatter(
            aggregated_df[x_col],
            aggregated_df["worst_case_max"],
            color="gray",
            label="Worst Case (Max)",
            alpha=1.0
        )

    plt.legend()
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.savefig(output_file)
    plt.close()

In [None]:


def process_and_plot(
    testset,
    results_folder="./results/GraphInstances",
    base_url="https://steinlib.zib.de/showset.php?",
    output_folder="./overleaf-repo/images/plots",
    worst_case_param = "NumberOfTerminals",
):

    # Prepare paths and URL
    testset_url = base_url + testset
    folder_path = os.path.join(results_folder, testset)
    output_folder = os.path.join(output_folder, testset)

    os.makedirs(output_folder, exist_ok=True)

    # Fetch "Opt" values
    response = requests.get(testset_url)
    opt_values = {}
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        table_rows = soup.find_all("tr")[1:]
        for row in table_rows:
            cols = row.find_all("td")
            if len(cols) > 5:
                instance_name = cols[0].text.strip()
                opt_value = cols[-1].text.strip().replace("\xa0", "")
                opt_values[instance_name] = float(opt_value) if opt_value.isdigit() else None
    else:
        print(f"Failed to fetch 'Opt' values for testset {testset}, status code: {response.status_code}")
        return

    # Process results files
    data_frames = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".results"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            instance_name = filename.split(".")[0]
            if instance_name in opt_values:
                df["Opt"] = opt_values[instance_name]

            time_columns = [col for col in df.columns if "Time" in col or "duration" in col]
            for col in time_columns:
                df[col] = df[col] / 1_000_000
            data_frames.append(df)

    if not data_frames:
        print(f"No .results files found in {folder_path}.")
        return

    combined_df = pd.concat(data_frames, ignore_index=True)
    
    # Calculate worst-case values
    if worst_case_param in combined_df.columns:
        combined_df["worst_case"] = 2 * (1 - 1 / combined_df[worst_case_param])
    else:
        print("Column '{worst_case_param}' not found in the data.")
        return
        
    # Calculate "time to performance" for TM and KMB
    combined_df["TMTimeToPerformance"] = combined_df["TakahashiMatsuyamaCost"] / combined_df["TakahashiMatsuyamaTime"]
    combined_df["KMBTimeToPerformance"] = combined_df["KouMarkowskyBermanCost"] / combined_df["KouMarkowskyBermanTime"]


    
    # Extract the subdirectory (e.g., 'B' from './overleaf-repo/images/plots/B')
    subdirectory = get_subdirectory_from_path(output_folder)

    # Set the new output folder path for tables (e.g., './overleaf-repo/tables/B')
    tables_output_folder = os.path.join("./overleaf-repo/tables", subdirectory)

    set_of_x_axes = ["NumberOfTerminals", "NumberOfNodes", "NumberOfEdges"]

    for variable in set_of_x_axes:
        # Generate plots
        # Plot Costs by variable
        plot_data(
            combined_df,        
            x_col=variable,
            y_cols=["TakahashiMatsuyamaCost", "KouMarkowskyBermanCost", "worst_case"],
            title=f"Costs vs {preprocess_label(variable)} ({testset})",
            x_label=preprocess_label(variable),
            y_label="Cost",
            output_file=os.path.join(output_folder, f"{split_after_second_space(variable)}_cost.png"),
            opt_col="Opt"
        )
        # Generate tables with costs
        per_x_stats, overall_stats_df = create_summary_tables(
            combined_df,        
            x_col=variable,
            y_cols=["TakahashiMatsuyamaCost", "KouMarkowskyBermanCost"],
            opt_col="Opt",
            output_folder=tables_output_folder
        )
        # Generate tables with approximation ratio
        per_x_ratios, overall_ratios = create_approximation_ratio_table(
            df=combined_df,        
            x_col=variable,
            y_cols=["TakahashiMatsuyamaCost", "KouMarkowskyBermanCost", "worst_case"],
            opt_col="Opt",
            output_folder=tables_output_folder
        )
        # Plot Times by variable
        plot_data(
            combined_df,
            x_col=variable,
            y_cols=["TakahashiMatsuyamaTime", "KouMarkowskyBermanTime"],
            title=f"Times vs {preprocess_label(variable)} ({testset})",
            x_label=preprocess_label(variable),
            y_label="Time (seconds)",
            output_file=os.path.join(output_folder, f"{split_after_second_space(variable)}_time.png")
        )
        # Time to performence
        plot_data(
            combined_df,
            x_col=variable,
            y_cols=["TMTimeToPerformance", "KMBTimeToPerformance"],
            title=f"Cost/Time vs {preprocess_label(variable)} ({testset})",
            x_label=preprocess_label(variable),
            y_label="Cost/Time",
            output_file=os.path.join(output_folder, f"{split_after_second_space(variable)}_CostToTime.png")
        )

    # Additional plots for TMduration* and KMBduration*
    tm_duration_columns = [col for col in combined_df.columns if col.startswith("TMduration")]
    kmb_duration_columns = [col for col in combined_df.columns if col.startswith("KMBduration")]

    for variable in set_of_x_axes:
        # Inner Times
        plot_data(
            combined_df,        
            x_col=variable,
            y_cols=tm_duration_columns,
            title=f"TM Durations vs Number of Terminals ({testset})",
            x_label=preprocess_label(variable),
            y_label="Time (seconds)",
            output_file=os.path.join(output_folder, f"{split_after_second_space(variable)}_tm_durations.png")
        )
    
        plot_data(
            combined_df,        
            x_col=variable,
            y_cols=kmb_duration_columns,
            title=f"KMB Durations vs Number of Terminals ({testset})",
            x_label=preprocess_label(variable),
            y_label="Time (seconds)",
            output_file=os.path.join(output_folder, f"{split_after_second_space(variable)}_kmb_durations.png"),
        )
        
        plot_data(
            combined_df,        
            x_col=variable,
            y_cols=kmb_duration_columns,
            title=f"KMB Durations vs Number of Terminals ({testset})",
            x_label=preprocess_label(variable),
            y_label="Time (seconds)",
            output_file=os.path.join(output_folder, f"{split_after_second_space(variable)}_kmb_durations_log_scale.png"),
            log_scale=True
        )

    print(f"All plots for testset {testset} have been generated in {output_folder}.")
    print(combined_df)

In [185]:
process_and_plot("C")

ValueError: Length mismatch: Expected axis has 4 elements, new values have 5 elements