 ## Create hypertune report for searchspace

In [1]:
from ray.tune import ExperimentAnalysis
import ray
ray.init(ignore_reinit_error=True)

2025-02-03 12:13:58,006	INFO worker.py:1821 -- Started a local Ray instance.


0,1
Python version:,3.11.9
Ray version:,2.40.0


In [2]:
import pandas as pd
results_df_CNNGRU = pd.DataFrame()
results_df_2DCNN = pd.DataFrame()

In [62]:
from tabulate import tabulate
def create_report_md(report_df):
    # Convert to Markdown table
    markdown_table = tabulate(report_df, headers="keys", tablefmt="pipe")
    
    # Print or save the Markdown table
    print(markdown_table)
    return markdown_table

def report_top_results_md(results_df, top=5):
    report={}
    top_5 = results_df[:top]
    for col in top_5.columns:
        report[col] = "<br>".join(map(lambda x: f"{x:.4f}" if isinstance(x, float) else str(x), top_5[col]))
    # Convert report into a DataFrame if needed
    report_df = pd.DataFrame([report])
    #print(report_df.columns) 
    #report_df.columns = ['accuracy', 'iterations', 'filters', 'hidden_size' 'dropout', 'num_layers', 'units1', 'units2', 'model_type', 'experiment']
    

    markdown_table = create_report_md(report_df)
    return report_df

In [5]:
from plotly import graph_objects as go
def plot_contour(df, x, y, z, start=0.90, end=1.0, size=0.01):
    fig = go.Figure()

    fig.add_trace(
        go.Contour(
            z=df[z],
            x=df[x],
            y=df[y],
            contours=dict(
                coloring='heatmap',
                showlabels=True,  # show labels on contours
                start=start,       # start of the contour range
                end=end,          # end of the contour range
                size=size,
            ),
            colorscale="plasma",
            colorbar=dict(
                title='Accuracy'
            )
        )
    )

    fig.add_trace(
        go.Scatter(
            x=df[x],
            y=df[y],
            mode='markers',
            marker=dict(
                color='black',
                size=8,
                symbol='circle'
            ),
            customdata=df['accuracy'],  # Pass accuracy values for hover text
            hovertemplate=(
                'Hidden Size: %{x}<br>'
                'Number of Layers: %{y}<br>'
                'Accuracy: %{customdata:.4f}<extra></extra>'
            ),
            name='Data Points'
        )
    )

    fig.update_layout(
        title="Contour Plot",
        xaxis_title="Hidden Size",
        yaxis_title="Number of Layers",
        xaxis=dict(showgrid=False),  # Remove x-axis grid lines
        yaxis=dict(showgrid=False),
        plot_bgcolor='white',        # Set background color to white
        paper_bgcolor='white'
    )

    fig.show()

In [7]:
from pathlib import Path
tune_dir = Path("models/ray").resolve()
tune_dir.exists()
tunelogs = [d for d in tune_dir.iterdir()]
tunelogs.sort()


In [391]:
tunelogs = [d for d in tune_dir.iterdir()]
tunelogs.sort()
latest = tunelogs[-1]
latest
Path(latest).name

'train_2024-12-16_22-44-01'

In [16]:
import pandas as pd
from ray.tune import ExperimentAnalysis
from loguru import logger
from pathlib import Path
import ray


def load_tunelogs_data(path="models/ray") -> pd.DataFrame:
        """
        Loads the Ray Tune results from a specified directory and returns them as a DataFrame.

        Args:
            path (str): Directory path containing Ray Tune experiment logs.

        Returns:
            pd.DataFrame: Combined and cleaned results DataFrame.
        """
        tune_dir = Path(path).resolve()
        logger.info(f"Tune directory: {tune_dir}")
        if not tune_dir.exists():
            logger.warning("Model data directory does not exist. Check your tune directory path.")
            return pd.DataFrame()

        # Initialize Ray
        ray.init(ignore_reinit_error=True)

        # Collect all directories within the tune_dir
        tunelogs = sorted([d for d in tune_dir.iterdir() if d.is_dir()])
        results = []

        for logs in tunelogs:
            try:
                # Load experiment analysis
                analysis = ExperimentAnalysis(logs)

                # Convert results to DataFrame
                df = analysis.dataframe()
                df.columns = [col.lower().replace("config/", "") for col in df.columns]
                df.sort_values("accuracy", inplace=True, ascending=False)

                # Add experiment name as a column
                df["experiment"] = logs.name.replace("train_", "")

                # Optionally get best trial (for debugging/logging purposes)
                best_trial = analysis.get_best_trial(metric="test_loss", mode="min")
                if best_trial:
                    logger.info(f"Best trial for {logs.name}: {best_trial}")

                # Accumulate DataFrame
                results.append(df)

            except Exception as e:
                logger.error(f"Failed to process {logs}: {e}")

        # Combine all results into a single DataFrame
        results_df = pd.concat(results, ignore_index=True)
        results_df.sort_values("accuracy", inplace=True, ascending=False)
        results_df 

            
        return results_df

results_df=load_tunelogs_data()


[32m2025-02-03 12:23:05.579[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_tunelogs_data[0m:[36m19[0m - [1mTune directory: /Users/francesca/code_repo/MADS-exam-25/src/models/ray[0m
2025-02-03 12:23:05,595	INFO worker.py:1654 -- Calling ray.init() again after it has already been called.
[32m2025-02-03 12:23:05.613[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_tunelogs_data[0m:[36m47[0m - [1mBest trial for train_2025-01-21_20-12-30: train_85395c8b[0m
[32m2025-02-03 12:23:05.631[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_tunelogs_data[0m:[36m47[0m - [1mBest trial for train_2025-01-22_00-11-48: train_35901bdf[0m
[32m2025-02-03 12:23:05.654[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_tunelogs_data[0m:[36m47[0m - [1mBest trial for train_2025-01-22_09-31-05: train_b9a3cf06[0m
[32m2025-02-03 12:23:05.674[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_tunelogs_data[0m:[36m47[0m - [1mBest trial for train_2025-01-22_11-32-38: trai

In [173]:
def report_top_results(results_df, top=30):
    report={}
    for model in results_df.model_type.unique():
        top10_results = results_df[results_df.model_type==model].nlargest(top, "accuracy")
        report[model] = top10_results.to_dict(orient='records')
    return report

def report_config_results(report, modelname):
        # Get the top 10 rows based on accuracy
    if "recallmacro" in results_df.columns:
            #print(results_df.columns)
            top_10_df = results_df.nlargest(20, "recallmacro")
            top_10_df['trainfile'] = top_10_df['trainfile'].apply(lambda x: x.name)
            top_10_df = top_10_df[["experiment", "trial_id", "accuracy", "model_type", "test_loss", "batch", 'optimizer', 'num_blocks', "dropout", "hidden", "num_layers", "num_heads", "recallmacro", "iterations", "factor", "trainfile"]]
            top_10_df.reset_index(drop=True, inplace=True)
            #print(top_10_df)
            # Save the top 10 results to a CSV file
            top_10_df.to_csv("top10_results.csv", index=False)
            top_10_df.reset_index(drop=True, inplace=True)
            top_config = top_10_df.iloc[0].to_dict()
            print(f"Top model configurations:{top_config}")
    return top_config

# Report results for CNN 1D GRU

In [174]:
report = report_top_results(results_df)
df_cnngru = pd.DataFrame(report['CNN1DGRUResNet'])
df_cnngru_clean = df_cnngru [['iterations', 'accuracy', 'recallmacro', 'experiment',
        'batch', 'hidden', 'dropout', 'num_layers', 'num_blocks',  'factor', 
       'gru_hidden', 'trainfile']]
df_cnngru_clean['trainfile'] = df_cnngru_clean['trainfile'].apply(lambda x: x.name.split("_")[2])       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cnngru_clean['trainfile'] = df_cnngru_clean['trainfile'].apply(lambda x: x.name.split("_")[2])


In [176]:
df_cnngru_clean.sort_values(by = ['accuracy','recallmacro', 'iterations'], inplace=True, ascending=False)
for col in df_cnngru_clean.columns:
    if col not in ['trainfile', 'experiment', 'accuracy', 'recallmacro', 'iterations']:
        val = sorted(df_cnngru_clean[col].unique().tolist())
        print(f'{col} {val}')


batch [16, 32, 48]
hidden [64, 128]
dropout [0.2, 0.298894093637274, 0.3, 0.4]
num_layers [2, 3, 4]
num_blocks [2, 3, 4, 5]
factor [0.1, 0.2, 0.3, 0.4]
gru_hidden [32.0, 64.0, 128.0, 256.0, 512.0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cnngru_clean.sort_values(by = ['accuracy','recallmacro', 'iterations'], inplace=True, ascending=False)


In [170]:
df_cnngru_ovs =df_cnngru_clean[df_cnngru_clean['trainfile']=='oversampled']
report_top_results_md(df_cnngru_ovs[:1])

|    |   iterations |   accuracy |   recallmacro |   batch |   hidden |   dropout |   num_layers |   num_blocks |   factor |   gru_hidden | trainfile   |
|---:|-------------:|-----------:|--------------:|--------:|---------:|----------:|-------------:|-------------:|---------:|-------------:|:------------|
|  0 |           27 |     0.9858 |        0.9593 |      32 |       64 |       0.4 |            2 |            5 |      0.2 |          256 | oversampled |


Unnamed: 0,iterations,accuracy,recallmacro,batch,hidden,dropout,num_layers,num_blocks,factor,gru_hidden,trainfile
0,27,0.9858,0.9593,32,64,0.4,2,5,0.2,256.0,oversampled


In [165]:
df_cnngru_smote =df_cnngru_clean[df_cnngru_clean['trainfile']=='SMOTE']
report_top_results_md(df_cnngru_smote[:1])

|    |   iterations |   accuracy |   recallmacro |   batch |   hidden |   dropout |   num_layers |   num_blocks |   factor |   gru_hidden | trainfile   |
|---:|-------------:|-----------:|--------------:|--------:|---------:|----------:|-------------:|-------------:|---------:|-------------:|:------------|
|  0 |           36 |     0.9846 |        0.9576 |      32 |      128 |       0.3 |            3 |            4 |      0.3 |          256 | SMOTE       |


Unnamed: 0,iterations,accuracy,recallmacro,batch,hidden,dropout,num_layers,num_blocks,factor,gru_hidden,trainfile
0,36,0.9846,0.9576,32,128,0.3,3,4,0.3,256.0,SMOTE


In [185]:
report_top_results_md(df_cnngru_clean, top=2)

|    | iterations   | accuracy         | recallmacro      | batch    | hidden   | dropout          | num_layers   | num_blocks   | factor           | gru_hidden           | trainfile                  |
|---:|:-------------|:-----------------|:-----------------|:---------|:---------|:-----------------|:-------------|:-------------|:-----------------|:---------------------|:---------------------------|
|  0 | 27<br>25     | 0.9858<br>0.9849 | 0.9593<br>0.9769 | 32<br>16 | 64<br>64 | 0.4000<br>0.2000 | 2<br>4       | 5<br>4       | 0.2000<br>0.3000 | 256.0000<br>256.0000 | oversampled<br>oversampled |


Unnamed: 0,iterations,accuracy,recallmacro,batch,hidden,dropout,num_layers,num_blocks,factor,gru_hidden,trainfile
0,27<br>25,0.9858<br>0.9849,0.9593<br>0.9769,32<br>16,64<br>64,0.4000<br>0.2000,2<br>4,5<br>4,0.2000<br>0.3000,256.0000<br>256.0000,oversampled<br>oversampled


# Report results for CNN 2D

In [181]:
report = report_top_results(results_df)
df_cnn2D = pd.DataFrame(report['2DCNNResnet'])
df_cnn2D_clean = df_cnn2D[['iterations', 'accuracy', 'recallmacro', 'experiment',
        'batch', 'hidden', 'dropout', 'num_layers', 'num_blocks',  'factor', 'optimizer',
       'gru_hidden', 'trainfile']]
df_cnn2D_clean['trainfile'] =df_cnn2D_clean['trainfile'].apply(lambda x: x.name.split("_")[2])    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cnn2D_clean['trainfile'] =df_cnn2D_clean['trainfile'].apply(lambda x: x.name.split("_")[2])


In [182]:
df_cnn2D_clean

Unnamed: 0,iterations,accuracy,recallmacro,experiment,batch,hidden,dropout,num_layers,num_blocks,factor,optimizer,gru_hidden,trainfile
0,29,0.988782,0.974396,2025-01-25_10-39-41,16,128,0.3,3,1,0.2,,,oversampled
1,39,0.987866,0.967824,2025-01-23_20-23-12,16,128,0.338071,4,3,0.2,,,oversampled
2,39,0.987866,0.973342,2025-01-25_15-38-08,16,64,0.3,3,3,0.3,,,oversampled
3,39,0.986722,0.964716,2025-01-25_15-38-08,16,64,0.3,2,1,0.3,,,oversampled
4,14,0.985294,0.946092,2025-01-22_11-32-38,32,83,0.139081,4,1,0.610975,,,oversampled
5,14,0.985294,0.956036,2025-01-22_14-19-25,32,203,0.126891,3,1,,,,oversampled
6,14,0.984835,0.940389,2025-01-22_11-32-38,32,122,0.187109,3,1,0.433813,,,oversampled
7,29,0.983974,0.957462,2025-01-24_15-57-13,16,128,0.289253,2,1,0.3,,,oversampled
8,14,0.983456,0.948983,2025-01-22_18-54-05,32,114,0.277567,3,3,,,,oversampled
9,26,0.983288,0.962754,2025-01-25_10-39-41,16,64,0.3,2,2,0.2,,,oversampled


In [180]:
df_cnn2D_clean.sort_values(by = ['accuracy','recallmacro', 'iterations'], inplace=True, ascending=False)
for col in df_cnn2D_clean.columns:
    if col not in ['trainfile', 'experiment', 'accuracy', 'recallmacro', 'iterations']:
        val = sorted(df_cnn2D_clean[col].unique().tolist())
        print(f'{col} {val}')

batch [16, 32, 48]
hidden [64, 75, 83, 114, 122, 128, 132, 133, 139, 156, 203, 205, 241]
dropout [0.006368903682303451, 0.10267061456918729, 0.1268911146149138, 0.13908102313406687, 0.18710895448556605, 0.19966357654447664, 0.22655727436661166, 0.23777599330961882, 0.24286298668634002, 0.2601697376448717, 0.2775670047538534, 0.28925341585391184, 0.29002307872558647, 0.2923425318144807, 0.3, 0.30144113990252885, 0.3095378210754046, 0.3229150917802194, 0.3247027901091531, 0.32761369531372353, 0.3380709324005429, 0.3443683474453454, 0.3696396380836571, 0.37041431745867937, 0.3973971279888311]
num_layers [2, 3, 4]
num_blocks [1, 2, 3, 4]
factor [0.2, 0.3, nan, 0.4, 0.43381321058668637, 0.4444185961632646, 0.5182885809224532, 0.6109753507665872, 0.6377936861614196, 0.7904332095479256]
gru_hidden [nan]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cnn2D_clean.sort_values(by = ['accuracy','recallmacro', 'iterations'], inplace=True, ascending=False)


In [184]:
report_top_results_md(df_cnn2D_clean, top=2)

|    | iterations   | accuracy         | recallmacro      | batch    | hidden     | dropout          | num_layers   | num_blocks   | factor           | optimizer   | gru_hidden   | trainfile                  |
|---:|:-------------|:-----------------|:-----------------|:---------|:-----------|:-----------------|:-------------|:-------------|:-----------------|:------------|:-------------|:---------------------------|
|  0 | 29<br>39     | 0.9888<br>0.9879 | 0.9744<br>0.9678 | 16<br>16 | 128<br>128 | 0.3000<br>0.3381 | 3<br>4       | 1<br>3       | 0.2000<br>0.2000 | nan<br>nan  | nan<br>nan   | oversampled<br>oversampled |


Unnamed: 0,iterations,accuracy,recallmacro,batch,hidden,dropout,num_layers,num_blocks,factor,optimizer,gru_hidden,trainfile
0,29<br>39,0.9888<br>0.9879,0.9744<br>0.9678,16<br>16,128<br>128,0.3000<br>0.3381,3<br>4,1<br>3,0.2000<br>0.2000,nan<br>nan,nan<br>nan,oversampled<br>oversampled
