In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../../data/data_ie.csv')
df.head(), df.shape

(             gene_id  chromosome  global_position  ...  B104 B105 label
 0  ENSG00000260861.6          20          1556582  ...     a    g  True
 1  ENSG00000260861.6          20          1610358  ...     t    t  True
 2  ENSG00000260861.6          20          1577339  ...     g    t  True
 3  ENSG00000260861.6          20          1587561  ...     g    g  True
 4  ENSG00000260861.6          20          1577339  ...     g    t  True
 
 [5 rows x 110 columns],
 (22391, 110))

In [3]:
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[['gene_id'] +seq_cols + ["label"]].copy()

df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)
df_model["label"].value_counts(dropna=False)


def split_by_gene(data, test_size=0.2, random_state=42):
    
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    idx_train, idx_test = next(splitter.split(data, groups=data["gene_id"]))

    return data.iloc[idx_train].copy(), data.iloc[idx_test].copy()



train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)


train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"]) 

train.shape, val.shape, test_data.shape





((13849, 106), (3154, 106), (5388, 107))

In [4]:
predictor = TabularPredictor(
    label="label", 
    problem_type="binary",
    eval_metric="f1",
    path="../models/autogluon_ie")


predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets="medium_quality_faster_train",
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       7.86 GB / 11.55 GB (68.0%)
Disk Space Avail:   638.99 GB / 951.65 GB (67.1%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ie"
Train Data Rows:    13849
Train Data Columns: 105
Tuning Data Rows:    3154
Tuning Data Columns: 105
Label Column:     

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x753e7a02e3d0>

In [10]:
perf = predictor.evaluate(test, silent=True)
perf

{'f1': 0.9767168851496771,
 'accuracy': 0.9779138827023014,
 'balanced_accuracy': np.float64(0.9774042283064694),
 'mcc': 0.9559968694092162,
 'roc_auc': np.float64(0.9965095771693531),
 'precision': 0.9892984542211652,
 'recall': 0.964451313755796}

In [6]:
predictor.leaderboard(val, silent=True).head(10)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.973568,0.973568,f1,2.016205,2.181388,143.609399,0.005102,0.001382,0.311671,2,True,12
1,NeuralNetFastAI,0.971002,0.971002,f1,0.93048,1.42777,120.991065,0.93048,1.42777,120.991065,1,True,8
2,LightGBMLarge,0.970149,0.970149,f1,0.063488,0.014173,0.914926,0.063488,0.014173,0.914926,1,True,11
3,CatBoost,0.969938,0.969938,f1,0.05163,0.027426,21.631268,0.05163,0.027426,21.631268,1,True,5
4,LightGBMXT,0.969483,0.969483,f1,0.033091,0.017784,1.274711,0.033091,0.017784,1.274711,1,True,1
5,LightGBM,0.967656,0.967656,f1,0.022226,0.018217,1.139483,0.022226,0.018217,1.139483,1,True,2
6,XGBoost,0.965152,0.965152,f1,0.124654,0.055743,6.887184,0.124654,0.055743,6.887184,1,True,9
7,ExtraTreesGini,0.952894,0.952894,f1,0.518789,0.09075,0.953722,0.518789,0.09075,0.953722,1,True,6
8,ExtraTreesEntr,0.95054,0.95054,f1,0.449573,0.088817,0.948494,0.449573,0.088817,0.948494,1,True,7
9,RandomForestGini,0.94139,0.94139,f1,0.311267,0.125074,1.138407,0.311267,0.125074,1.138407,1,True,3


In [8]:
predictor.feature_importance(
    val,
    subsample_size=min(500, len(val)),
    num_shuffle_sets=3
).head(10)

Computing feature importance via permutation shuffling for 105 features using 500 rows with 3 shuffle sets...


	121.42s	= Expected runtime (40.47s per shuffle set)
	79.77s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B100,0.211811,0.020583,0.001566,3,0.329751,0.09387
B99,0.154696,0.003514,8.6e-05,3,0.174831,0.134561
B98,0.018641,0.01538,0.085314,3,0.10677,-0.069487
B95,0.009645,0.00153,0.004141,3,0.018411,0.000879
B36,0.004856,0.001867,0.022942,3,0.015552,-0.00584
B80,0.004574,0.003183,0.06525,3,0.022811,-0.013662
B62,0.003953,0.003427,0.091895,3,0.023593,-0.015686
B101,0.003839,0.001683,0.02925,3,0.013482,-0.005805
B96,0.003795,0.00329,0.091871,3,0.022647,-0.015056
B94,0.003772,0.001414,0.021899,3,0.011875,-0.004331
