In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from patsy import dmatrices

In [2]:
# Columns of the results table that describe each dataset; used as
# explanatory variables in the cells below.
data_properties = [
    'hlr',
    'Transition',
    'Shifting',
    'Seasonality',
    'Trend',
    'Stationarity',
]

# Columns of the results table that describe the model configuration.
configurations = [
    'norm',
    'sd',
    'fusion',
    'embed',
    'ff',
]

# Response column that the analyses below try to explain.
target = 'rank'

In [3]:
# Load the processed results table. The path is relative, so the CSV must be
# in the notebook's working directory.
df = pd.read_csv("res_s_processed.csv")

In [4]:
# Show the column labels of the loaded table (for a DataFrame, `.keys()` and
# `.columns` return the same Index).
df.columns

Index(['dataset', 'hlr', 'Transition', 'Shifting', 'Seasonality', 'Trend',
       'Stationarity', 'norm', 'sd', 'fusion', 'embed', 'ff', 'rank'],
      dtype='object')

In [5]:
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature matrix: dataset properties plus configuration columns.
# Response: the column named by `target`.
X = df[data_properties + configurations]
y = df[target]

# One-hot encode the categorical config features; every other column is
# passed through unchanged (and ends up after the encoded columns).
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop=None), ['fusion', 'embed', 'ff'])
], remainder='passthrough')

# A shallow tree keeps the extracted rule set small enough to read.
# random_state is pinned because scikit-learn shuffles the candidate features
# at every split; without a fixed seed, tied splits can produce a different
# tree (and different printed rules) from one run to the next.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeRegressor(max_depth=3, random_state=0))
])

model.fit(X, y)

# Extract rules: map the transformed column names back onto the fitted tree
# and print it as nested if/else text.
tree = model.named_steps['tree']
feature_names = model.named_steps['preprocessor'].get_feature_names_out(data_properties + configurations)
rules = export_text(tree, feature_names=list(feature_names))
print(rules)


|--- cat__embed_patch <= 0.50
|   |--- cat__ff_mlp <= 0.50
|   |   |--- cat__embed_invert <= 0.50
|   |   |   |--- value: [18.00]
|   |   |--- cat__embed_invert >  0.50
|   |   |   |--- value: [16.09]
|   |--- cat__ff_mlp >  0.50
|   |   |--- remainder__norm <= 0.50
|   |   |   |--- value: [13.29]
|   |   |--- remainder__norm >  0.50
|   |   |   |--- value: [16.05]
|--- cat__embed_patch >  0.50
|   |--- cat__fusion_temporal <= 0.50
|   |   |--- remainder__Stationarity <= 0.00
|   |   |   |--- value: [8.09]
|   |   |--- remainder__Stationarity >  0.00
|   |   |   |--- value: [14.06]
|   |--- cat__fusion_temporal >  0.50
|   |   |--- remainder__norm <= 0.50
|   |   |   |--- value: [21.19]
|   |   |--- remainder__norm >  0.50
|   |   |   |--- value: [13.49]



In [6]:
import pandas as pd
import statsmodels.formula.api as smf

# Define properties and configuration features
data_properties = ['hlr', 'Transition', 'Shifting', 'Seasonality', 'Trend', 'Stationarity']
binary_features = ['norm', 'sd']
categorical_features = ['fusion', 'embed', 'ff']
target = 'rank'

# Significance threshold applied to the p-value of each interaction term.
# NOTE(review): no multiple-comparison correction is applied across the
# property x config models fitted below.
ALPHA = 0.05


def _strongest_interaction_notes(params, pvalues, prop, alpha=ALPHA):
    """Summarise the strongest significant interaction terms of one fit.

    Only terms whose name contains ':' (interaction terms) and whose p-value
    is <= ``alpha`` are considered. A negative coefficient is filed under
    "higher" and any other coefficient under "lower"; within each direction
    only the term with the largest absolute coefficient is kept.

    Parameters
    ----------
    params : pandas.Series
        Fitted coefficients indexed by term name.
    pvalues : pandas.Series
        P-values indexed by the same term names.
    prop : str
        Name of the data property, used only in the note text.
    alpha : float, optional
        Significance threshold (inclusive).

    Returns
    -------
    list of str
        Zero, one or two notes, "higher" first and then "lower".
    """
    # direction -> (largest |coef| seen so far, note for that term)
    strongest = {"higher": (0, None), "lower": (0, None)}

    for term, coef in params.items():
        if ':' not in term:
            continue
        pval = pvalues[term]
        if not pval <= alpha:  # also skips NaN p-values
            continue

        # Negative interaction: the term lowers the predicted rank more as
        # the property grows, so it is reported for the "higher" case.
        # Otherwise the term raises the predicted rank as the property grows,
        # so it is reported for the "lower" case.
        direction = "higher" if coef < 0 else "lower"

        if abs(coef) > strongest[direction][0]:
            # Text after the first ':' is the configuration part of the term;
            # splitting once keeps level labels that contain ':' intact.
            config_term = term.split(':', 1)[1]
            strongest[direction] = (
                abs(coef),
                f"When {prop} is {direction}, best to use {config_term} "
                f"to improve rank (p={pval:.3f}, coef={coef:.3f})",
            )

    return [note for _, note in strongest.values() if note]


# Store results
final_notes = []

# Fit one interaction model per (data property, config feature) pair. The
# strongest effects are selected separately for every pair, so a property can
# receive one note per configuration feature and direction.
for prop in data_properties:
    for config in binary_features + categorical_features:
        # Categorical configs are wrapped in C(...) so they are dummy-coded.
        if config in categorical_features:
            formula = f"{target} ~ Q('{prop}') * C(Q('{config}'))"
        else:
            formula = f"{target} ~ Q('{prop}') * Q('{config}')"

        # Fit OLS model
        model = smf.ols(formula=formula, data=df).fit()

        final_notes.extend(
            _strongest_interaction_notes(model.params, model.pvalues, prop)
        )

# Output to console and file. The encoding is explicit so the file does not
# depend on the platform's default locale.
with open("best_config_by_property_s.txt", "w", encoding="utf-8") as f:
    for note in final_notes:
        print(note)
        f.write(note + '\n')


When hlr is higher, best to use C(Q('fusion'))[T.temporal] to improve rank (p=0.022, coef=-0.411)
When Transition is higher, best to use Q('sd')[T.True] to improve rank (p=0.000, coef=-34.721)
When Shifting is higher, best to use Q('sd')[T.True] to improve rank (p=0.000, coef=-5.152)
When Shifting is higher, best to use C(Q('fusion'))[T.temporal] to improve rank (p=0.011, coef=-3.016)
When Seasonality is lower, best to use C(Q('fusion'))[T.temporal] to improve rank (p=0.008, coef=6.117)
When Seasonality is higher, best to use C(Q('ff'))[T.trans] to improve rank (p=0.000, coef=-12.388)
When Trend is higher, best to use Q('sd')[T.True] to improve rank (p=0.000, coef=-5.884)
When Trend is higher, best to use C(Q('fusion'))[T.temporal] to improve rank (p=0.000, coef=-3.826)
When Trend is higher, best to use C(Q('embed'))[T.none] to improve rank (p=0.044, coef=-4.635)
When Stationarity is higher, best to use Q('sd')[T.True] to improve rank (p=0.002, coef=-5.103)
