In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor

In [None]:
# Read the raw dataset from a path relative to the notebook's working directory.
data_path = "../../data/data_ei.csv"
df = pd.read_csv(data_path)

# Quick sanity check: first rows together with the (rows, columns) shape.
df.head(), df.shape

In [None]:
# Feature columns are those whose name starts with "B"; together with the
# "label" target they form the modelling frame.
seq_cols = [col for col in df.columns if col.startswith("B")]

# Take an explicit copy: writing a column into a column-subset of `df` is
# chained assignment and triggers pandas' SettingWithCopyWarning.
df_model = df[seq_cols + ["label"]].copy()

# Encode the target as 1/0. Casting to str and lower-casing first accepts both
# boolean labels and "True"/"False"-style strings.
df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)

# Series.map leaves NaN for every value missing from the mapping. Stop here
# rather than handing unlabeled rows to the stratified split and the model.
n_unmapped = int(df_model["label"].isna().sum())
if n_unmapped:
    raise ValueError(
        f"{n_unmapped} row(s) have a label that is neither 'true' nor 'false'"
    )

# Two-stage stratified split on the label:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% as validation (0.25 * 0.80 = 20% overall).
# The result is a 60/20/20 train/val/test partition.
split_seed = 42

train_val, test_data = train_test_split(
    df_model, test_size=0.2, random_state=split_seed, stratify=df_model["label"]
)
train, val = train_test_split(
    train_val, test_size=0.25, random_state=split_seed, stratify=train_val["label"]
)

# Shapes of every partition, shown as the cell output.
train_val.shape, train.shape, val.shape, test_data.shape


In [29]:
# Binary classifier on "label", with models selected by F1.
# Fitted artifacts are written under the relative `path` below.
predictor = TabularPredictor(
    label="label",
    problem_type="binary",
    eval_metric="f1",
    path="../models/autogluon_ei",
)

# Fit with the explicit validation split as tuning data and a 200-second
# time budget.
predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets="medium_quality_faster_train",
)


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       7.30 GB / 11.55 GB (63.2%)
Disk Space Avail:   642.65 GB / 951.65 GB (67.5%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ei"
Train Data Rows:    13434
Train Data Columns: 12
Tuning Data Rows:    4478
Tuning Data Columns: 12
Label Column:       

[1000]	valid_set's binary_logloss: 0.0150239	valid_set's f1: 0.995318


	0.9956	 = Validation score   (f1)
	2.37s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 197.37s of the 197.37s of remaining time.
	Fitting with cpus=6, gpus=0, mem=0.0/7.3 GB
	0.9959	 = Validation score   (f1)
	1.97s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestGini ... Training model for up to 195.28s of the 195.28s of remaining time.
	Fitting with cpus=12, gpus=0, mem=0.1/7.3 GB
	0.9953	 = Validation score   (f1)
	0.72s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: RandomForestEntr ... Training model for up to 194.30s of the 194.29s of remaining time.
	Fitting with cpus=12, gpus=0, mem=0.1/7.4 GB
	0.995	 = Validation score   (f1)
	0.71s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 193.33s of the 193.33s of remaining time.
	Fitting with cpus=6, gpus=0
	0.9953	 = Validation score   (f1)
	16.01s	 = Training   runtime
	0

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x77a2f5fca9d0>

In [30]:
# Score the fitted predictor on the held-out test split; the returned dict of
# metrics is kept in `perf` and shown as the cell output.
perf = predictor.evaluate(data=test_data, silent=True)
perf


{'f1': 0.9961559582646897,
 'accuracy': 0.9968743022996205,
 'balanced_accuracy': np.float64(0.9970208971291719),
 'mcc': 0.9935262396956116,
 'roc_auc': np.float64(0.9995334557882696),
 'precision': 0.9945175438596491,
 'recall': 0.9977997799779978}

In [32]:
# Per-model comparison scored on the held-out test split; show the first ten rows.
leaderboard_test = predictor.leaderboard(test_data, silent=True)
leaderboard_test.head(10)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesGini,0.99643,0.995321,f1,0.22946,0.105277,0.711324,0.22946,0.105277,0.711324,1,True,6
1,ExtraTreesEntr,0.99643,0.995597,f1,0.281597,0.091305,0.736462,0.281597,0.091305,0.736462,1,True,7
2,XGBoost,0.996156,0.996145,f1,0.079401,0.016662,2.148482,0.079401,0.016662,2.148482,1,True,9
3,WeightedEnsemble_L2,0.996156,0.996145,f1,0.107996,0.018913,2.48056,0.028595,0.00225,0.332078,2,True,11
4,RandomForestEntr,0.996156,0.995044,f1,0.202919,0.092142,0.711756,0.202919,0.092142,0.711756,1,True,4
5,RandomForestGini,0.996156,0.995318,f1,0.240264,0.09415,0.719666,0.240264,0.09415,0.719666,1,True,3
6,CatBoost,0.995878,0.995323,f1,0.032505,0.008015,16.014644,0.032505,0.008015,16.014644,1,True,5
7,LightGBM,0.995878,0.995869,f1,0.043969,0.027948,1.966027,0.043969,0.027948,1.966027,1,True,2
8,LightGBMXT,0.995331,0.995592,f1,0.058749,0.033338,2.366506,0.058749,0.033338,2.366506,1,True,1
9,NeuralNetFastAI,0.993392,0.994496,f1,0.795324,0.933927,117.057591,0.795324,0.933927,117.057591,1,True,8


In [33]:
# Feature importance computed on the held-out test split; show the first ten rows.
importance_test = predictor.feature_importance(test_data)
importance_test.head(10)

Computing feature importance via permutation shuffling for 12 features using 4479 rows with 5 shuffle sets...
	5.15s	= Expected runtime (1.03s per shuffle set)
	1.82s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B6,0.204047,0.003725,1.332259e-08,5,0.211716,0.196377
B7,0.180694,0.003593,1.876022e-08,5,0.188093,0.173295
B10,0.045968,0.002359,8.289351e-07,5,0.050825,0.041111
B5,0.029088,0.00221,3.964542e-06,5,0.033638,0.024539
B8,0.015514,0.001577,1.263168e-05,5,0.018761,0.012268
B9,0.009002,0.001431,7.413761e-05,5,0.011949,0.006055
B4,0.007499,0.000579,4.238192e-06,5,0.008692,0.006306
B11,0.003956,0.000534,3.882673e-05,5,0.005054,0.002857
B1,0.001709,0.000901,0.006623258,5,0.003564,-0.000146
B12,0.001539,0.000571,0.001902406,5,0.002714,0.000365
