In [1]:
import sys
import os

sys.path.append(os.path.abspath(".."))  # sube de notebooks a training
sys.path.append(os.path.abspath("../.."))  # sube a ra√≠z del proyecto

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor
from training.src.print_baseline_metrics import BaselineMetrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../../data/data_ze.csv')
df.head(), df.shape

(             gene_id  chromosome  global_position  First_Exon_Start B1 B2 B3  \
 0  ENSG00000260861.6          20          1539145              1000  c  t  g   
 1  ENSG00000260861.6          20          1550939             12794  g  c  c   
 2  ENSG00000260861.6          20          1550961             12816  a  g  g   
 3  ENSG00000171873.8          20          4219631              1000  g  g  t   
 4  ENSG00000101282.9          20           957453              1000  g  a  g   
 
   B4 B5 B6  ... B542 B543 B544 B545 B546 B547 B548 B549 B550 label  
 0  a  a  t  ...    g    a    g    c    a    g    g    c    t  True  
 1  a  a  g  ...    g    t    c    g    t    a    a    g    c  True  
 2  t  c  t  ...    t    g    t    a    a    a    t    c    a  True  
 3  g  c  g  ...    g    a    g    c    c    g    c    c    g  True  
 4  g  a  c  ...    c    g    g    c    g    c    g    c    c  True  
 
 [5 rows x 555 columns],
 (2606, 555))

In [3]:
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[['gene_id'] +seq_cols + ["label"]].copy()

df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)
df_model["label"].value_counts(dropna=False)


def split_by_gene(data, test_size=0.2, random_state=42):
    
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    idx_train, idx_test = next(splitter.split(data, groups=data["gene_id"]))

    return data.iloc[idx_train].copy(), data.iloc[idx_test].copy()



train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)


train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"]) 

train.shape, val.shape, test_data.shape

((1439, 551), (509, 551), (658, 552))

In [4]:
predictor = TabularPredictor(
    label='label',
    problem_type='binary',
    eval_metric='f1',
    path='../models/autogluon_ze'
)

predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets='medium_quality_faster_train'
)


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       7.75 GB / 11.55 GB (67.1%)
Disk Space Avail:   633.50 GB / 951.65 GB (66.6%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ze"
Train Data Rows:    1439
Train Data Columns: 550
Tuning Data Rows:    509
Tuning Data Columns: 550
Label Column:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f4cc5289190>

In [5]:
perf = predictor.evaluate(test,silent=True)
perf

{'f1': 0.6099071207430341,
 'accuracy': 0.6170212765957447,
 'balanced_accuracy': np.float64(0.6406188173755272),
 'mcc': 0.27923975968306297,
 'roc_auc': np.float64(0.7121514809345957),
 'precision': 0.5116883116883116,
 'recall': 0.7547892720306514}

In [6]:
predictor.leaderboard(val, silent=True)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.683398,0.676409,f1,1.285807,0.633705,37.989351,0.028223,0.002022,0.375506,2,True,12
1,NeuralNetTorch,0.676806,0.676349,f1,0.810001,0.415512,29.403759,0.810001,0.415512,29.403759,1,True,10
2,LightGBMXT,0.662651,0.675159,f1,0.125366,0.217503,3.680164,0.125366,0.217503,3.680164,1,True,1
3,CatBoost,0.65996,0.64,f1,0.230654,0.225452,61.679798,0.230654,0.225452,61.679798,1,True,5
4,ExtraTreesGini,0.658065,0.491329,f1,0.452857,0.213091,1.385273,0.452857,0.213091,1.385273,1,True,6
5,RandomForestEntr,0.646018,0.5,f1,0.433289,0.304788,1.298502,0.433289,0.304788,1.298502,1,True,4
6,LightGBMLarge,0.641732,0.649789,f1,0.174835,0.099684,2.442453,0.174835,0.099684,2.442453,1,True,11
7,LightGBM,0.638889,0.639659,f1,0.125185,0.111491,0.631747,0.125185,0.111491,0.631747,1,True,2
8,RandomForestGini,0.638831,0.529412,f1,0.450982,0.198405,1.599874,0.450982,0.198405,1.599874,1,True,3
9,XGBoost,0.627957,0.628571,f1,0.447583,0.216171,8.210086,0.447583,0.216171,8.210086,1,True,9


In [7]:
predictor.feature_importance(
    val,
    subsample_size=min(200, len(val)),
    num_shuffle_sets=3
).head(10)

Computing feature importance via permutation shuffling for 550 features using 200 rows with 3 shuffle sets...
	3068.9s	= Expected runtime (1022.97s per shuffle set)
	200.95s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B391,0.031624,0.007334,0.008729,3,0.073648,-0.0104
B197,0.029132,0.006103,0.007158,3,0.064104,-0.00584
B182,0.027766,0.010661,0.022895,3,0.088853,-0.033321
B263,0.027109,0.007653,0.012775,3,0.070961,-0.016742
B238,0.026578,0.010346,0.02349,3,0.085861,-0.032706
B59,0.025607,0.001658,0.000697,3,0.035107,0.016107
B341,0.025582,0.0069,0.011701,3,0.065119,-0.013955
B435,0.025304,0.014146,0.045145,3,0.10636,-0.055753
B186,0.025265,0.005045,0.006516,3,0.054172,-0.003643
B112,0.0252,0.008681,0.018675,3,0.07494,-0.02454


In [8]:
save_output = BaselineMetrics(transition="ZE",
    perf=perf,
    train_df=train,
    val_df=val,
    test_df=test_data,)

save_output.save_metrics()