In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor

In [None]:
# Load the raw table from disk.
df = pd.read_csv('../../data/data_ez.csv')

# Print the shape separately and leave the preview as the cell's last
# expression: a DataFrame packed inside a tuple is shown with its plain-text
# repr, whereas a bare DataFrame gets the notebook's rich table rendering.
print(df.shape)
df.head()

In [None]:
# Modelling frame: the grouping key, every column whose name starts with "B",
# and the target column.
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[["gene_id"] + seq_cols + ["label"]].copy()

# Encode the target as 0/1. Values are stringified and lower-cased first, so
# both booleans and "True"/"False" strings map; anything else becomes NaN.
df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)

# This statement is not the last one in its cell, so a bare expression would
# be evaluated and thrown away. Print it so the class balance (and any NaN
# bucket from unmapped labels, thanks to dropna=False) is actually shown.
print(df_model["label"].value_counts(dropna=False))


def split_by_gene(data, test_size=0.2, random_state=42):
    """Split ``data`` into two frames using a single GroupShuffleSplit draw.

    Rows are grouped by the ``gene_id`` column, so the split is made on genes
    rather than on individual rows.

    Parameters
    ----------
    data : DataFrame containing a ``gene_id`` column.
    test_size : forwarded to ``GroupShuffleSplit(test_size=...)``.
    random_state : forwarded to ``GroupShuffleSplit(random_state=...)``.

    Returns
    -------
    (train_part, test_part) : independent copies of the selected rows.
    """
    gss = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_rows, test_rows = next(gss.split(data, groups=data["gene_id"]))

    train_part = data.iloc[train_rows].copy()
    test_part = data.iloc[test_rows].copy()
    return train_part, test_part



# Two grouped splits in sequence: first carve out the test genes, then split
# the remainder into train and validation. 0.25 of the remaining 0.8 is 0.2,
# so the result is roughly a 60/20/20 train/val/test partition.
train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)

# gene_id is only the grouping key; drop it so it is not used as a feature.
train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"])

# Report the shapes of the frames that are actually fed to the model.
# (Previously this showed test_data.shape, which still includes gene_id and
# therefore has one more column than train/val.)
train.shape, val.shape, test.shape

In [None]:
# Binary classifier on the `label` column, scored with F1; `path` is the
# location handed to AutoGluon for this predictor.
predictor = TabularPredictor(
    path="../models/autogluon_ez",
    label="label",
    problem_type="binary",
    eval_metric="f1",
)

# Fit on the training split, passing the validation split as tuning data.
# time_limit and presets are forwarded to AutoGluon unchanged.
predictor.fit(
    presets="medium_quality_faster_train",
    time_limit=200,
    tuning_data=val,
    train_data=train,
)

In [None]:
# Score the fitted predictor on the test split, which was not passed to fit().
perf = predictor.evaluate(test, silent=True)
# Bare name as the last expression so the notebook displays the result.
perf

In [None]:
# First 10 rows of the model leaderboard, computed on the validation split.
# NOTE(review): `val` was also passed to fit() as tuning_data, so these scores
# are not from unseen data — confirm this is intended (vs. using `test`).
predictor.leaderboard(val, silent=True).head(10)

In [None]:
# Feature importance computed on the validation split; only the first 10 rows
# of the returned table are shown.
# subsample_size is capped at len(val) so it never asks for more rows than exist.
# num_shuffle_sets=3: presumably the number of shuffle repetitions — see the
# AutoGluon feature_importance documentation to confirm.
predictor.feature_importance(
    val,
    subsample_size=min(200, len(val)),
    num_shuffle_sets=3
).head(10)