In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from autogluon.tabular import TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../../data/data_ei.csv")
df.head() , df.shape

(             gene_id  chromosome  global_position  Intron_Start B1 B2 B3 B4  \
 0  ENSG00000260861.6          20          1539337          1192  a  t  g  g   
 1  ENSG00000260861.6          20          1556746         18601  c  c  t  c   
 2  ENSG00000260861.6          20          1551040         12895  a  a  g  t   
 3  ENSG00000260861.6          20          1577696         39551  g  g  t  g   
 4  ENSG00000260861.6          20          1551051         12906  t  a  t  t   
 
   B5 B6 B7 B8 B9 B10 B11 B12  label  
 0  c  c  t  g  t   c   c   c   True  
 1  a  a  t  a  g   t   t   t   True  
 2  g  a  g  g  t   g   t   t   True  
 3  g  a  g  g  a   c   a   c   True  
 4  t  c  a  a  g   a   t   c   True  ,
 (22391, 17))

In [3]:
seq_cols = [col for col in df.columns if col.startswith("B")]
df_model = df[['gene_id'] +seq_cols + ["label"]].copy()

df_model["label"] = (
    df_model["label"].astype(str).str.lower().map({"true": 1, "false": 0})
)
df_model["label"].value_counts(dropna=False)


def split_by_gene(data, test_size=0.2, random_state=42):
    
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    idx_train, idx_test = next(splitter.split(data, groups=data["gene_id"]))

    return data.iloc[idx_train].copy(), data.iloc[idx_test].copy()



train_val, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
train_data, val_data = split_by_gene(train_val, test_size=0.25, random_state=42)


train = train_data.drop(columns=["gene_id"])
val = val_data.drop(columns=["gene_id"])
test = test_data.drop(columns=["gene_id"]) 

train.shape, val.shape, test_data.shape


((13849, 13), (3154, 13), (5388, 14))

In [4]:
predictor = TabularPredictor(
    label="label", 
    problem_type="binary",
    eval_metric="f1",
    path="../models/autogluon_ei")


predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets="medium_quality_faster_train",
)


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       9.04 GB / 11.55 GB (78.3%)
Disk Space Avail:   639.10 GB / 951.65 GB (67.2%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ei"
Train Data Rows:    13849
Train Data Columns: 12
Tuning Data Rows:    3154
Tuning Data Columns: 12
Label Column:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7416812b1ad0>

In [None]:
perf = predictor.evaluate(test, silent=True)
perf


{'f1': 0.9858334950514264,
 'accuracy': 0.9864513734224202,
 'balanced_accuracy': np.float64(0.9862621439611393),
 'mcc': 0.9728871878567557,
 'roc_auc': np.float64(0.9971952969750497),
 'precision': 0.9902534113060428,
 'recall': 0.98145285935085}

In [6]:
predictor.leaderboard(val,silent=True).head(10)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.994845,0.994845,f1,0.069634,0.013392,-0.581863,0.069634,0.013392,-0.581863,1,True,9
1,WeightedEnsemble_L2,0.994845,0.994845,f1,0.075816,0.014846,-0.333156,0.006182,0.001453,0.248706,2,True,11
2,LightGBM,0.994399,0.994399,f1,0.014329,0.006821,0.831948,0.014329,0.006821,0.831948,1,True,2
3,NeuralNetFastAI,0.994399,0.994399,f1,0.815529,0.564623,178.46544,0.815529,0.564623,178.46544,1,True,8
4,LightGBMXT,0.993971,0.993971,f1,0.017037,0.00732,1.040491,0.017037,0.00732,1.040491,1,True,1
5,CatBoost,0.992235,0.992235,f1,0.020585,0.007098,8.606702,0.020585,0.007098,8.606702,1,True,5
6,LightGBMLarge,0.992228,0.992228,f1,0.018979,0.009421,1.000651,0.018979,0.009421,1.000651,1,True,10
7,ExtraTreesGini,0.991387,0.991387,f1,0.223761,0.080109,0.740456,0.223761,0.080109,0.740456,1,True,6
8,RandomForestGini,0.990525,0.990525,f1,0.181598,0.113909,0.789417,0.181598,0.113909,0.789417,1,True,3
9,ExtraTreesEntr,0.990082,0.990082,f1,0.235775,0.091054,0.686477,0.235775,0.091054,0.686477,1,True,7


In [7]:
predictor.feature_importance(
    val,
    subsample_size=min(500, len(val)),
    num_shuffle_sets=3
).head(10)

Computing feature importance via permutation shuffling for 12 features using 500 rows with 3 shuffle sets...
	2.1s	= Expected runtime (0.7s per shuffle set)
	0.46s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B6,0.21575,0.020028,0.00143,3,0.330512,0.100987
B7,0.205641,0.018119,0.001289,3,0.309466,0.101816
B10,0.032244,0.002943,0.001383,3,0.049109,0.015379
B5,0.027203,0.008533,0.015633,3,0.076097,-0.021691
B8,0.017152,0.007591,0.02976,3,0.060649,-0.026345
B9,0.008148,0.002902,0.019893,3,0.02478,-0.008483
B4,0.003694,0.001666,0.030805,3,0.013242,-0.005853
B3,0.003689,0.001591,0.028378,3,0.012804,-0.005426
B11,0.000928,0.001608,0.211325,3,0.010143,-0.008286
B12,0.000903,0.001565,0.211325,3,0.009868,-0.008062
