In [2]:
import sys
import os

sys.path.append(os.path.abspath(".."))  # sube de notebooks a training
sys.path.append(os.path.abspath("../.."))  # sube a ra√≠z del proyecto

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor
from training.src.print_baseline_metrics import BaselineMetrics

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('../../data/data_ie.csv')
df.head(), df.shape

(             gene_id  chromosome  global_position  Exon_Start B1 B2 B3 B4 B5  \
 0  ENSG00000260861.6          20          1556582       18437  t  t  t  t  a   
 1  ENSG00000260861.6          20          1610358       72213  a  g  c  t  g   
 2  ENSG00000260861.6          20          1577339       39194  a  t  t  g  a   
 3  ENSG00000260861.6          20          1587561       49416  g  c  c  c  a   
 4  ENSG00000260861.6          20          1577339       39194  a  t  t  g  a   
 
   B6  ... B97 B98 B99 B100 B101 B102 B103 B104 B105 label  
 0  c  ...   a   a   a    a    t    g    g    a    g  True  
 1  a  ...   t   a   g    c    a    a    c    t    t  True  
 2  t  ...   c   a   t    c    a    g    g    g    t  True  
 3  c  ...   g   c   a    c    c    t    a    g    g  True  
 4  t  ...   c   a   t    c    a    g    g    g    t  True  
 
 [5 rows x 110 columns],
 (22391, 110))

In [4]:
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[['gene_id'] +seq_cols + ["label"]].copy()

df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)
df_model["label"].value_counts(dropna=False)


def split_by_gene(data, test_size=0.2, random_state=42):
    
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    idx_train, idx_test = next(splitter.split(data, groups=data["gene_id"]))

    return data.iloc[idx_train].copy(), data.iloc[idx_test].copy()



train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)


train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"]) 

train.shape, val.shape, test_data.shape





((13849, 106), (3154, 106), (5388, 107))

In [5]:
predictor = TabularPredictor(
    label="label", 
    problem_type="binary",
    eval_metric="f1",
    path="../models/autogluon_ie")


predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets="medium_quality_faster_train",
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       8.67 GB / 11.55 GB (75.1%)
Disk Space Avail:   633.64 GB / 951.65 GB (66.6%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ie"
Train Data Rows:    13849
Train Data Columns: 105
Tuning Data Rows:    3154
Tuning Data Columns: 105
Label Column:     

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x74fac3272d10>

In [6]:
perf = predictor.evaluate(test, silent=True)
perf

{'f1': 0.9745795854517012,
 'accuracy': 0.9758723088344469,
 'balanced_accuracy': np.float64(0.9753814307794215),
 'mcc': 0.9518792789249667,
 'roc_auc': np.float64(0.9975451258555973),
 'precision': 0.9865399841646872,
 'recall': 0.9629057187017002}

In [7]:
predictor.leaderboard(val, silent=True).head(10)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.977816,0.977816,f1,2.089034,1.526821,122.161103,0.130306,0.00467,0.458946,2,True,11
1,NeuralNetFastAI,0.97561,0.97561,f1,1.765712,1.439432,111.541734,1.765712,1.439432,111.541734,1,True,8
2,LightGBMLarge,0.970149,0.970149,f1,0.041678,0.026344,1.629374,0.041678,0.026344,1.629374,1,True,10
3,CatBoost,0.969938,0.969938,f1,0.080118,0.037147,29.935992,0.080118,0.037147,29.935992,1,True,5
4,LightGBMXT,0.969483,0.969483,f1,0.048928,0.035391,2.601035,0.048928,0.035391,2.601035,1,True,1
5,LightGBM,0.967656,0.967656,f1,0.223286,0.024591,1.831339,0.223286,0.024591,1.831339,1,True,2
6,XGBoost,0.965152,0.965152,f1,0.193016,0.082719,10.160422,0.193016,0.082719,10.160422,1,True,9
7,ExtraTreesGini,0.952894,0.952894,f1,0.877841,0.151179,1.408628,0.877841,0.151179,1.408628,1,True,6
8,ExtraTreesEntr,0.95054,0.95054,f1,0.81229,0.253863,1.617816,0.81229,0.253863,1.617816,1,True,7
9,RandomForestGini,0.94139,0.94139,f1,0.649788,0.139753,1.535313,0.649788,0.139753,1.535313,1,True,3


In [8]:
predictor.feature_importance(
    val,
    subsample_size=min(500, len(val)),
    num_shuffle_sets=3
).head(10)

Computing feature importance via permutation shuffling for 105 features using 500 rows with 3 shuffle sets...
	193.64s	= Expected runtime (64.55s per shuffle set)
	62.81s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B100,0.195267,0.011749,0.000602,3,0.262592,0.127942
B99,0.139149,0.016632,0.002364,3,0.234451,0.043847
B98,0.032643,0.011614,0.019848,3,0.099191,-0.033904
B95,0.016675,0.007551,0.031031,3,0.059943,-0.026594
B91,0.012956,0.004097,0.015878,3,0.036434,-0.010521
B94,0.011234,0.005118,0.031369,3,0.040561,-0.018092
B96,0.011166,0.000228,6.9e-05,3,0.012472,0.009859
B92,0.010635,0.006386,0.051049,3,0.047225,-0.025955
B89,0.0098,0.009695,0.111049,3,0.065354,-0.045755
B93,0.008747,0.005998,0.063724,3,0.043118,-0.025623


In [9]:
save_output = BaselineMetrics(transition="IE",
    perf=perf,
    train_df=train,
    val_df=val,
    test_df=test_data,)

save_output.save_metrics()