This notebook requires pre-computed data. You can get this data by running:

`python3 -m analysis download`

and then

`python3 -m analysis compute-features`

These two commands create the `data.json` and `features.csv` files in the root of this directory.

In [None]:
import pandas as pd
from analysis.models.data import Data
from analysis.performance_gap import top_performers, unresolved_instances

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Number of systems requested from top_performers when picking comparison targets.
TOP_K = 3

# Read the pre-computed JSON dump and validate it into a Data model.
with open("data.json") as data_file:
    raw_json = data_file.read()
data = Data.model_validate_json(raw_json)

# Load the pre-computed feature table (one `instance_id` column plus features).
df = pd.read_csv("features.csv")

def good_metric(metric: str) -> bool:
    """Decide whether a feature column should be used as an analysis metric.

    A column qualifies when its name starts with ``instance`` or ``patch``,
    or ends with ``diff``. The identifier column ``instance_id`` is always
    rejected, even though it matches the ``instance`` prefix.

    Args:
        metric: Name of the column to check.

    Returns:
        True if the column is a usable metric, False otherwise.
    """
    # The identifier column must be excluded before the prefix check below.
    if metric == "instance_id":
        return False

    has_metric_prefix = metric.startswith(("instance", "patch"))
    return has_metric_prefix or metric.endswith("diff")


In [None]:
# A feature that is zero for every instance carries no information: zero is the
# default value and is indicative of some kind of failure, so drop such columns.
has_nonzero_value = (df != 0).any(axis=0)
df = df.loc[:, has_nonzero_value]

# Of the remaining columns, keep only the ones accepted by good_metric.
metrics = list(filter(good_metric, df.columns))

In [None]:
# Summarize the size of the loaded data and of the (filtered) feature table.
model_count = len(data.systems)
instance_count = len(data.dataset.instances)
feature_count = len(df.columns)

print(f"Number of models: {model_count}")
print(f"Number of instances: {instance_count}")
print(f"Number of features: {feature_count}")

In [None]:
# The source model is always OpenHands: resolve its key first, then fetch the system.
source_key = data.closest_system("OpenHands")
source = data.systems[source_key]

# The comparison targets are the TOP_K systems selected by top_performers.
targets = top_performers(data.systems.values(), k=TOP_K)

In [None]:
# Build the bullet list outside the f-string: a backslash ("\n") inside an
# f-string expression is a SyntaxError on Python versions before 3.12.
target_lines = "\n".join(f"  - {t.metadata.name}" for t in targets)

print(f"Source model: {source.metadata.name}")
print(f"Target models: \n{target_lines}")

In [None]:
# Compute the performance gap between the source system and the targets.
gap = unresolved_instances(source, targets, threshold=1)

# Label every row: 1 when its instance belongs to the gap, 0 otherwise.
df['gap'] = df['instance_id'].apply(lambda instance_id: int(instance_id in gap))

In [None]:
# Report how many instances make up the performance gap.
gap_size = len(gap)
print(f"Number of instances with performance gap: {gap_size}")

In [None]:
from scipy import stats # type: ignore

# Split the rows once instead of re-filtering the frame for every metric.
no_gap_rows = df[df['gap'] == 0]
gap_rows = df[df['gap'] == 1]

# The U statistic ranges from 0 to n1 * n2, where n1 and n2 are the sizes of the
# two samples actually passed to the test. Derive the bound from the rows in
# `df` rather than from the dataset size, so the normalization stays correct
# even when the feature table does not cover every instance.
max_stat = len(no_gap_rows) * len(gap_rows)

# Run a Mann-Whitney U test per metric, comparing non-gap rows against gap rows.
results = []
for metric in metrics:
    result = stats.mannwhitneyu(no_gap_rows[metric], gap_rows[metric])
    results.append({
        "metric": metric,
        "p value": result.pvalue,
        # Rescale U to [-0.5, 0.5]; 0 means neither group tends to rank higher.
        "relative statistic": (result.statistic / max_stat) - 0.5
    })

# Show the ten metrics whose distributions differ most between the two groups.
print("Mann-Whitney U Test Results")
results = sorted(results, key=lambda x: abs(x["relative statistic"]), reverse=True)
for result in results[:10]:
    print(f"Metric: {result['metric']}, relative statistic: {result['relative statistic']*100:0.2f}%, p: {result['p value']:.2f}")

In [None]:
from sklearn.feature_selection import f_classif

# F-scores and p-values for each feature with respect to the gap label.
# (A stray bare `f_scores` expression was removed here: in the middle of a cell
# it is neither displayed nor used.)
f_scores, p_values = f_classif(df[metrics], df['gap'])
feature_scores = pd.DataFrame({
    'feature': df[metrics].columns,
    'F_score': f_scores,
    'p_value': p_values
})

# Show the ten features with the highest F-score.
top_features = feature_scores.sort_values('F_score', ascending=False)[:10]
print(top_features)

In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[metrics], df['gap'], test_size=0.2, random_state=42
)

# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=None,      # None leaves the tree depth unlimited
    min_samples_leaf=2,  # to prevent overfitting
    random_state=42      # seed the forest so a re-run reproduces the same model
)
clf.fit(X_train, y_train)

In [None]:
# Predict labels and class probabilities for the held-out rows.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

# Print the per-class evaluation summary.
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import numpy as np # type: ignore

# Stack the per-tree importances so their spread across trees can be computed.
per_tree_importances = np.array([tree.feature_importances_ for tree in clf.estimators_])

# Forest-level importance of every metric plus its standard deviation across trees.
feature_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'importance': clf.feature_importances_,
    'std': per_tree_importances.std(axis=0)
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print(feature_importance.head(10))

In [None]:
from sklearn.inspection import permutation_importance # type: ignore

# Permutation importance on the held-out rows. The shuffles are random, so a
# fixed seed is passed to make the ranking reproducible across re-runs.
r = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
perm_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'perm_importance': r.importances_mean,
    'perm_std': r.importances_std
}).sort_values('perm_importance', ascending=False)

# Show the ten features with the highest mean permutation importance.
print(perm_importance.head(10))


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a class-balanced logistic regression on the same scaled training split.
lr = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Predict labels and class probabilities for the held-out rows.
y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)

# Print the per-class evaluation summary.
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Label every row: 0 when the source system resolved the instance, 1 otherwise.
df['unresolved'] = df['instance_id'].apply(
    lambda instance_id: int(instance_id not in source.results.resolved)
)

# Correlate each metric with the label, leaving out metrics whose correlation is NaN.
correlations = {}
for metric in metrics:
    corr = df[metric].corr(df["unresolved"])
    if not pd.isna(corr):
        correlations[metric] = corr

# The top 5 metrics with the highest absolute correlation
top_metrics = sorted(
    correlations,
    key=lambda name: abs(correlations[name]),
    reverse=True,
)[:5]
for top_metric in top_metrics:
    print(f"{top_metric}: {correlations[top_metric]}")

In [None]:
# Split the rows once instead of re-filtering the frame for every metric.
resolved_rows = df[df['unresolved'] == 0]
unresolved_rows = df[df['unresolved'] == 1]

# The U statistic ranges from 0 to n1 * n2, where n1 and n2 are the sizes of the
# two samples actually passed to the test. Derive the bound from the rows in
# `df` rather than from the dataset size, so the normalization stays correct
# even when the feature table does not cover every instance.
max_stat = len(resolved_rows) * len(unresolved_rows)

# Run a Mann-Whitney U test per metric, comparing resolved against unresolved rows.
results = []
for metric in metrics:
    result = stats.mannwhitneyu(resolved_rows[metric], unresolved_rows[metric])
    results.append({
        "metric": metric,
        "p value": result.pvalue,
        # Rescale U to [-0.5, 0.5]; 0 means neither group tends to rank higher.
        "relative statistic": (result.statistic / max_stat) - 0.5
    })

# Show the ten metrics whose distributions differ most between the two groups.
print("Mann-Whitney U Test Results")
results = sorted(results, key=lambda x: abs(x["relative statistic"]), reverse=True)
for result in results[:10]:
    print(f"Metric: {result['metric']}, relative statistic: {result['relative statistic']*100:0.2f}%, p: {result['p value']:.10f}")

In [None]:
# F-scores and p-values for each feature with respect to the unresolved label.
# (A stray bare `f_scores` expression was removed here: in the middle of a cell
# it is neither displayed nor used.)
f_scores, p_values = f_classif(df[metrics], df['unresolved'])
feature_scores = pd.DataFrame({
    'feature': df[metrics].columns,
    'F_score': f_scores,
    'p_value': p_values
})

# Show the ten features with the highest F-score.
top_features = feature_scores.sort_values('F_score', ascending=False)[:10]
print(top_features)

In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[metrics], df['unresolved'], test_size=0.2, random_state=42
)

# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=None,      # None leaves the tree depth unlimited
    min_samples_leaf=2,  # to prevent overfitting
    random_state=42      # seed the forest so a re-run reproduces the same model
)
clf.fit(X_train, y_train)

# Predict labels and class probabilities for the held-out rows.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Forest-level importance of every metric plus its standard deviation across trees.
feature_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'importance': clf.feature_importances_,
    'std': np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

# Permutation importance on the held-out rows. The shuffles are random, so a
# fixed seed is passed to make the ranking reproducible across re-runs.
r = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
perm_importance = pd.DataFrame({
    'feature': df[metrics].columns,
    'perm_importance': r.importances_mean,
    'perm_std': r.importances_std
}).sort_values('perm_importance', ascending=False)

print(perm_importance.head(10))