In [1]:
import pandas as pd
import numpy as np
from typing import List
import optuna

# Raise Optuna's log threshold to ERROR so that only error-level messages
# from the library reach the notebook output.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [2]:
# Load one predictions CSV per model; the numeric suffix of each variable is
# the experiment id used later as the model's name.
# The trailing number on each line is presumably that model's standalone
# out-of-fold score (the same values annotate `oofs_by_score` below) — confirm.
# Downstream code reads the `less_toxic_score` and `more_toxic_score` columns.
oof_28 = pd.read_csv("../input/ooofs/deberta_large_28_preds.csv") # 0.713
oof_29 = pd.read_csv("../input/ooofs/deberta_large_29_preds.csv") # 0.712
oof_30 = pd.read_csv("../input/ranking-30-deberta-large-oof/oof.csv") # 0.714
oof_43 = pd.read_csv("../input/ranking-43-roberta-base-oof/oof.csv") # 0.711
oof_52 = pd.read_csv("../input/ranking-52-deberta-large-oof/oof.csv") # 0.714
oof_57 = pd.read_csv("../input/ranking-57-roberta-large-oof/oof.csv") # 0.708
oof_58 = pd.read_csv("../input/ranking-58-rembert-oof/oof.csv") # 0.713
oof_60 = pd.read_csv("../input/ranking-60-roberta-base-with-oof/oof.csv") # 0.711
oof_61 = pd.read_csv("../input/ranking-61-deberta-v3-base-with-oof/oof.csv") # 0.715
oof_62 = pd.read_csv("../input/ranking-62-distilroberta-base-oof/oof.csv") # 0.714
oof_63 = pd.read_csv("../input/ranking-63-deberta-v3-large-oof/oof.csv") # 0.714
oof_64 = pd.read_csv("../input/ranking-64-distilroberta-base-with-oof/oof.csv") # 0.714

In [3]:
def get_weighted_mean(oofs: List[pd.DataFrame], weights: List[float]) -> float:
    """Score a weighted blend of per-model predictions.

    Despite the name, the return value is not a mean of predictions: each
    model's ``less_toxic_score`` and ``more_toxic_score`` columns are combined
    as a weighted sum across models, and the result is the fraction of rows
    where the blended ``less_toxic_score`` is strictly below the blended
    ``more_toxic_score``.

    Args:
        oofs: One DataFrame per model, each exposing ``less_toxic_score`` and
            ``more_toxic_score`` columns. The Series are added with pandas
            arithmetic, so the frames are combined by index label.
        weights: One numeric weight per frame, in the same order as ``oofs``.

    Returns:
        Fraction (0.0 to 1.0) of rows ranked correctly by the blend. An empty
        ``oofs``/``weights`` pair yields 0.0.

    Raises:
        ValueError: If ``oofs`` and ``weights`` differ in length. ``zip`` would
            otherwise silently drop the surplus entries and score a different
            ensemble than the caller asked for.
    """
    if len(oofs) != len(weights):
        raise ValueError(
            f"Expected one weight per OOF frame, got {len(oofs)} frames "
            f"and {len(weights)} weights"
        )

    less_toxic = sum(w * oof.less_toxic_score for oof, w in zip(oofs, weights))
    more_toxic = sum(w * oof.more_toxic_score for oof, w in zip(oofs, weights))
    # Ties count as incorrect because the comparison is strict.
    return np.mean(less_toxic < more_toxic)

## Greedy model selection

In [4]:
def objective(trial):
    """Optuna objective for the current candidate ensemble.

    Asks the trial for one weight in [0, 1] per entry of the module-level
    ``selected_names`` (parameter name ``weight_<name>``), then returns the
    score that ``get_weighted_mean`` gives the module-level ``selected_models``
    under those weights.
    """
    blend_weights = []
    for model_name in selected_names:
        param_key = f"weight_{model_name}"
        blend_weights.append(trial.suggest_float(param_key, 0, 1))
    return get_weighted_mean(selected_models, blend_weights)


# Candidate pool for the greedy forward selection as (experiment id, OOF frame)
# pairs, ordered by the score noted beside each entry. The first entry seeds
# the ensemble.
oofs_by_score = [
    (61, oof_61), # 0.715 <-- start
    (62, oof_62), # 0.714
    (64, oof_64), # 0.714
    (30, oof_30), # 0.714
    (63, oof_63), # 0.714
    (52, oof_52), # 0.714
    (28, oof_28), # 0.713
    (58, oof_58), # 0.713
    (29, oof_29), # 0.712
    (60, oof_60), # 0.711
    (43, oof_43), # 0.711
    (57, oof_57), # 0.708
]

# Optuna budget per candidate, and a fixed sampler seed so that a fresh
# Restart & Run All reproduces the same selection and weights.
N_TRIALS = 1000
SEED = 42

# objective() reads these two module-level lists, so they always describe the
# ensemble currently being scored.
selected_models = []
selected_names = []

# Seed the ensemble with the first entry of the pool.
name, data = oofs_by_score.pop(0)
selected_names.append(name)
selected_models.append(data)

# Baseline is the seed model on its own. For a single model any positive
# weight gives the same ranking, so 1.0 is used. Starting from this score
# (rather than 0) means a model is only added if it beats the seed model.
best_oof = get_weighted_mean(selected_models, [1.0])
best_params = {f"weight_{name}": 1.0}
best_names = list(selected_names)
best_models = list(selected_models)
print(f"Baseline {selected_names}: {best_oof}")

while len(oofs_by_score) > 0:
    round_best_oof = best_oof
    round_best_params = None
    round_best_idx = None

    print(f"Round start with {selected_names}")

    for idx, (name, data) in enumerate(oofs_by_score):
        # Add the candidate only for the duration of its study; try/finally
        # guarantees it is removed again even if the study raises.
        selected_names.append(name)
        selected_models.append(data)
        try:
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=SEED),
            )
            study.optimize(objective, n_trials=N_TRIALS)
        finally:
            selected_names.pop()
            selected_models.pop()

        if study.best_trial.value > round_best_oof:
            round_best_oof = study.best_trial.value
            round_best_params = study.best_trial.params
            round_best_idx = idx
            print(f"trial add {name}, score improved to {round_best_oof}")
        else:
            print(f"trial add {name}, no improvement")

    if round_best_idx is None:
        print("Round did not improve overall score. Stopping")
        break

    # Remove the winner by position: no equality comparison on tuples that
    # contain DataFrames is needed.
    round_best_name, round_best_model = oofs_by_score.pop(round_best_idx)
    print(f"Successfully improved score with {round_best_name} to get score {round_best_oof}")
    selected_names.append(round_best_name)
    selected_models.append(round_best_model)
    best_names = list(selected_names)
    best_models = list(selected_models)
    best_oof = round_best_oof
    best_params = round_best_params

print(f"Selected: {selected_names}")
print(f"OOF: {best_oof}")
print(f"Best hyperparameters: {best_params}")