In [1]:
import sys
import os

sys.path.append(os.path.abspath(".."))  # sube de notebooks a training
sys.path.append(os.path.abspath("../.."))  # sube a ra√≠z del proyecto
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor
from training.src.print_baseline_metrics import BaselineMetrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../../data/data_ez.csv')
df.head(), df.shape

(             gene_id  chromosome  global_position  Last_Exon_End B1 B2 B3 B4  \
 0  ENSG00000260861.6          20          1610564          72419  c  t  a  a   
 1  ENSG00000260861.6          20          1587670          49525  g  t  g  c   
 2  ENSG00000260861.6          20          1619010          80865  c  c  t  g   
 3  ENSG00000171873.8          20          4248288          29657  c  t  g  t   
 4  ENSG00000101282.9          20          1001312          44859  c  c  t  a   
 
   B5 B6  ... B542 B543 B544 B545 B546 B547 B548 B549 B550 label  
 0  a  c  ...    a    a    a    t    t    t    t    t    a  True  
 1  c  a  ...    g    a    c    c    c    t    c    a    t  True  
 2  g  c  ...    a    g    a    a    g    t    g    a    a  True  
 3  c  a  ...    c    t    g    g    g    g    g    t    g  True  
 4  a  c  ...    t    t    a    a    a    t    c    t    a  True  
 
 [5 rows x 555 columns],
 (2606, 555))

In [3]:
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[['gene_id'] +seq_cols + ["label"]].copy()

df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)
df_model["label"].value_counts(dropna=False)


def split_by_gene(data, test_size=0.2, random_state=42):
    
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    idx_train, idx_test = next(splitter.split(data, groups=data["gene_id"]))

    return data.iloc[idx_train].copy(), data.iloc[idx_test].copy()



train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)


train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"]) 

train.shape, val.shape, test_data.shape

((1439, 551), (509, 551), (658, 552))

In [4]:
predictor = TabularPredictor(
    label='label',
    problem_type='binary',
    eval_metric='f1',
    path='../models/autogluon_ez'
)

predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets='medium_quality_faster_train'
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       8.86 GB / 11.55 GB (76.7%)
Disk Space Avail:   632.56 GB / 951.65 GB (66.5%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ez"
Train Data Rows:    1439
Train Data Columns: 550
Tuning Data Rows:    509
Tuning Data Columns: 550
Label Column:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7d35cd87ab10>

In [5]:
perf = predictor.evaluate(test, silent=True)
perf

{'f1': 0.580046403712297,
 'accuracy': 0.44984802431610943,
 'balanced_accuracy': np.float64(0.536861711881255),
 'mcc': 0.1282172149347223,
 'roc_auc': np.float64(0.5571093546425779),
 'precision': 0.415973377703827,
 'recall': 0.9578544061302682}

In [6]:
predictor.leaderboard(val, silent=True).head(10)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.64598,0.607945,f1,-0.28458,0.988379,51.914707,0.045424,0.002946,0.335131,2,True,11
1,ExtraTreesEntr,0.639785,0.40281,f1,0.391818,0.367981,1.257439,0.391818,0.367981,1.257439,1,True,7
2,RandomForestGini,0.639785,0.49467,f1,0.394217,0.191067,1.546069,0.394217,0.191067,1.546069,1,True,3
3,ExtraTreesGini,0.63807,0.48203,f1,0.619668,0.198271,1.372426,0.619668,0.198271,1.372426,1,True,6
4,RandomForestEntr,0.635135,0.538793,f1,-1.457696,0.318492,1.371618,-1.457696,0.318492,1.371618,1,True,4
5,NeuralNetTorch,0.623907,0.603066,f1,0.508025,0.46867,48.835533,0.508025,0.46867,48.835533,1,True,10
6,CatBoost,0.608069,0.495327,f1,0.205046,0.169325,44.409977,0.205046,0.169325,44.409977,1,True,5
7,LightGBM,0.587302,0.557769,f1,0.163373,0.082548,2.648232,0.163373,0.082548,2.648232,1,True,2
8,LightGBMXT,0.582064,0.534884,f1,0.157953,0.203115,5.371449,0.157953,0.203115,5.371449,1,True,1
9,XGBoost,0.570922,0.533066,f1,0.318974,0.142267,7.545178,0.318974,0.142267,7.545178,1,True,9


In [7]:
predictor.feature_importance(
    val,
    subsample_size=min(200, len(val)),
    num_shuffle_sets=3
).head(10)

Computing feature importance via permutation shuffling for 550 features using 200 rows with 3 shuffle sets...
	2780.81s	= Expected runtime (926.94s per shuffle set)
	207.51s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B18,0.029636,0.005965,0.006619,3,0.063817,-0.004546
B373,0.027247,0.007675,0.012722,3,0.071225,-0.016732
B63,0.023601,0.013614,0.047662,3,0.101612,-0.05441
B448,0.023094,0.010963,0.033796,3,0.085913,-0.039726
B406,0.020923,0.0082,0.023786,3,0.06791,-0.026063
B465,0.018823,0.001505,0.001062,3,0.027449,0.010198
B51,0.01817,0.01038,0.04687,3,0.077649,-0.041309
B230,0.017909,0.00277,0.003939,3,0.033779,0.002039
B499,0.017699,0.001367,0.000992,3,0.025533,0.009865
B371,0.017095,0.00225,0.002862,3,0.029987,0.004203


In [8]:
save_output = BaselineMetrics(transition="EZ",
    perf=perf,
    train_df=train,
    val_df=val,
    test_df=test_data,)

save_output.save_metrics()