In [None]:
import sys
import os

sys.path.append(os.path.abspath(".."))  # sube de notebooks a training
sys.path.append(os.path.abspath("../.."))  # sube a ra√≠z del proyecto

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor
from training.src.print_baseline_metrics import BaselineMetrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../../data/data_ei.csv")
df.head() , df.shape

(             gene_id  chromosome  global_position  Intron_Start B1 B2 B3 B4  \
 0  ENSG00000260861.6          20          1539337          1192  a  t  g  g   
 1  ENSG00000260861.6          20          1556746         18601  c  c  t  c   
 2  ENSG00000260861.6          20          1551040         12895  a  a  g  t   
 3  ENSG00000260861.6          20          1577696         39551  g  g  t  g   
 4  ENSG00000260861.6          20          1551051         12906  t  a  t  t   
 
   B5 B6 B7 B8 B9 B10 B11 B12  label  
 0  c  c  t  g  t   c   c   c   True  
 1  a  a  t  a  g   t   t   t   True  
 2  g  a  g  g  t   g   t   t   True  
 3  g  a  g  g  a   c   a   c   True  
 4  t  c  a  a  g   a   t   c   True  ,
 (22391, 17))

In [3]:
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[['gene_id'] +seq_cols + ["label"]].copy()

df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)
df_model["label"].value_counts(dropna=False)


def split_by_gene(data, test_size=0.2, random_state=42):
    
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    idx_train, idx_test = next(splitter.split(data, groups=data["gene_id"]))

    return data.iloc[idx_train].copy(), data.iloc[idx_test].copy()



train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)


train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"]) 

train.shape, val.shape, test_data.shape


((13849, 13), (3154, 13), (5388, 14))

In [4]:
predictor = TabularPredictor(
    label="label", 
    problem_type="binary",
    eval_metric="f1",
    path="../models/autogluon_ei")


predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets="medium_quality_faster_train",
)


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       7.48 GB / 11.55 GB (64.8%)
Disk Space Avail:   631.63 GB / 951.65 GB (66.4%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ei"
Train Data Rows:    13849
Train Data Columns: 12
Tuning Data Rows:    3154
Tuning Data Columns: 12
Label Column:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7096225bf490>

In [5]:
perf = predictor.evaluate(test, silent=True)
perf


{'f1': 0.9839781164517389,
 'accuracy': 0.9847809948032665,
 'balanced_accuracy': np.float64(0.9843331861338043),
 'mcc': 0.9697147512040935,
 'roc_auc': np.float64(0.9980722841686906),
 'precision': 0.9952569169960475,
 'recall': 0.972952086553323}

In [6]:
predictor.leaderboard(val,silent=True).head(10)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI,0.995701,0.995701,f1,0.972857,0.575451,122.805197,0.972857,0.575451,122.805197,1,True,8
1,WeightedEnsemble_L2,0.995701,0.995701,f1,0.983109,0.577041,123.089109,0.010251,0.00159,0.283912,2,True,12
2,XGBoost,0.994845,0.994845,f1,0.070047,0.014824,2.118175,0.070047,0.014824,2.118175,1,True,9
3,LightGBM,0.994399,0.994399,f1,0.01754,0.00781,0.966798,0.01754,0.00781,0.966798,1,True,2
4,LightGBMXT,0.993971,0.993971,f1,0.017663,0.009269,1.501227,0.017663,0.009269,1.501227,1,True,1
5,CatBoost,0.992235,0.992235,f1,0.028214,0.008202,7.622237,0.028214,0.008202,7.622237,1,True,5
6,LightGBMLarge,0.992228,0.992228,f1,0.063461,0.008888,-0.761776,0.063461,0.008888,-0.761776,1,True,11
7,ExtraTreesGini,0.991387,0.991387,f1,0.220205,0.093733,0.694851,0.220205,0.093733,0.694851,1,True,6
8,RandomForestGini,0.990525,0.990525,f1,0.205544,0.104852,1.036091,0.205544,0.104852,1.036091,1,True,3
9,ExtraTreesEntr,0.990082,0.990082,f1,0.242629,0.090863,0.65099,0.242629,0.090863,0.65099,1,True,7


In [7]:
predictor.feature_importance(
    val,
    subsample_size=min(500, len(val)),
    num_shuffle_sets=3
).head(10)

Computing feature importance via permutation shuffling for 12 features using 500 rows with 3 shuffle sets...
	6.78s	= Expected runtime (2.26s per shuffle set)
	4.17s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B6,0.206883,0.013609,0.00072,3,0.284863,0.128903
B7,0.143493,0.022602,0.004085,3,0.273007,0.013979
B5,0.036782,0.013606,0.021356,3,0.114749,-0.041184
B10,0.032324,0.006749,0.00711,3,0.070994,-0.006346
B9,0.006369,0.004149,0.058552,3,0.030142,-0.017404
B4,0.005475,7.6e-05,3.2e-05,3,0.005908,0.005042
B8,0.005415,0.00469,0.09176,3,0.03229,-0.02146
B1,0.002736,7.6e-05,0.000128,3,0.00317,0.002302
B2,0.000939,0.001626,0.211325,3,0.010258,-0.00838
B11,0.000928,0.001608,0.211325,3,0.010144,-0.008287


In [8]:
save_output = BaselineMetrics(transition="EI",
    perf=perf,
    train_df=train,
    val_df=val,
    test_df=test_data,)

save_output.save_metrics()