In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the labelled sequence dataset from its relative CSV path.
df = pd.read_csv(filepath_or_buffer='../../data/data_ze.csv')

# Quick sanity check: the first rows together with (n_rows, n_columns).
(df.head(), df.shape)

(             gene_id  chromosome  global_position  First_Exon_Start B1 B2 B3  \
 0  ENSG00000260861.6          20          1539145              1000  c  t  g   
 1  ENSG00000260861.6          20          1550939             12794  g  c  c   
 2  ENSG00000260861.6          20          1550961             12816  a  g  g   
 3  ENSG00000171873.8          20          4219631              1000  g  g  t   
 4  ENSG00000101282.9          20           957453              1000  g  a  g   
 
   B4 B5 B6  ... B542 B543 B544 B545 B546 B547 B548 B549 B550 label  
 0  a  a  t  ...    g    a    g    c    a    g    g    c    t  True  
 1  a  a  g  ...    g    t    c    g    t    a    a    g    c  True  
 2  t  c  t  ...    t    g    t    a    a    a    t    c    a  True  
 3  g  c  g  ...    g    a    g    c    c    g    c    c    g  True  
 4  g  a  c  ...    c    g    g    c    g    c    g    c    c  True  
 
 [5 rows x 555 columns],
 (2606, 555))

In [3]:
# Model inputs: every column whose name starts with 'B' (the per-position
# base columns) plus the target column.
seq_cols = [col for col in df.columns if col.startswith('B')]

# .copy() makes df_model an independent frame, so the label assignment below
# neither raises pandas' SettingWithCopyWarning nor risks writing into `df`.
df_model = df[seq_cols + ['label']].copy()

# Normalise the label (bool or 'True'/'False' text) to 1/0.
df_model['label'] = (
    df_model['label'].astype(str).str.lower().map({'true': 1, 'false': 0})
)

# Any value other than true/false maps to NaN; fail loudly here rather than
# letting a NaN class leak into the stratified splits below.
assert df_model['label'].notna().all(), "label column contains values other than true/false"

# First split: hold out 20% of the rows as the test set.
train_val, test = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model['label']
)

# Second split: 25% of the remaining 80% becomes validation,
# giving roughly 60/20/20 train/val/test overall.
train, val = train_test_split(
    train_val,
    test_size=0.25,
    random_state=42,
    stratify=train_val['label']
)

train.shape, val.shape, test.shape, train_val.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['label'] = (


((1563, 551), (521, 551), (522, 551), (2084, 551))

In [4]:
# Binary classifier on the 'label' column, scored with F1; fitted models are
# written under `path`.
predictor = TabularPredictor(
    label='label',
    problem_type='binary',
    eval_metric='f1',
    path='../models/autogluon_ez'
)

# The held-out `val` split is supplied as tuning data and training is capped
# at 200 seconds. 'medium_quality' is the canonical preset name; the former
# 'medium_quality_faster_train' is only an alias that AutoGluon resolves to it.
predictor.fit(
    train_data=train,
    tuning_data=val,
    time_limit=200,
    presets='medium_quality'
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       7.55 GB / 11.55 GB (65.4%)
Disk Space Avail:   639.93 GB / 951.65 GB (67.2%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ez"
Train Data Rows:    1563
Train Data Columns: 550
Tuning Data Rows:    521
Tuning Data Columns: 550
Label Column:       

[1000]	valid_set's binary_logloss: 0.552784	valid_set's f1: 0.707207


	0.7136	 = Validation score   (f1)
	3.19s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: RandomForestGini ... Training model for up to 192.29s of the 192.29s of remaining time.
	Fitting with cpus=12, gpus=0
	0.6489	 = Validation score   (f1)
	0.92s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: RandomForestEntr ... Training model for up to 191.09s of the 191.09s of remaining time.
	Fitting with cpus=12, gpus=0
	0.6368	 = Validation score   (f1)
	0.83s	 = Training   runtime
	0.22s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 189.85s of the 189.85s of remaining time.
	Fitting with cpus=6, gpus=0
	0.6841	 = Validation score   (f1)
	76.75s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: ExtraTreesGini ... Training model for up to 112.93s of the 112.93s of remaining time.
	Fitting with cpus=12, gpus=0
	0.6558	 = Validation score   (f1)
	0.94s	 = Training   runtime
	0.14s	 = Validation runtime
Fitting model

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7acd8ef5ba90>

In [5]:
# Score the fitted predictor on the held-out test split; the resulting
# metrics dict is shown as the cell output.
perf = predictor.evaluate(data=test)
perf

{'f1': 0.6832579185520362,
 'accuracy': 0.7318007662835249,
 'balanced_accuracy': np.float64(0.7236912393162394),
 'mcc': 0.454450569890237,
 'roc_auc': np.float64(0.8326210826210826),
 'precision': 0.7259615384615384,
 'recall': 0.6452991452991453}

In [6]:
# Per-model comparison on the test split: one row per trained model with its
# test score, validation score and timing columns.
predictor.leaderboard(data=test, silent=True)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.694444,0.721839,f1,0.310297,0.09102,6.495535,0.310297,0.09102,6.495535,1,True,9
1,CatBoost,0.687225,0.684096,f1,0.104743,0.105341,76.745447,0.104743,0.105341,76.745447,1,True,5
2,LightGBMXT,0.686364,0.710112,f1,0.07106,0.137655,2.12019,0.07106,0.137655,2.12019,1,True,1
3,WeightedEnsemble_L2,0.683258,0.756757,f1,1.786216,1.42948,191.194976,0.016836,0.001744,0.208805,2,True,11
4,NeuralNetTorch,0.678937,0.673428,f1,0.3139,0.409123,28.549455,0.3139,0.409123,28.549455,1,True,10
5,NeuralNetFastAI,0.677966,0.713978,f1,0.319626,0.399984,74.14628,0.319626,0.399984,74.14628,1,True,8
6,LightGBM,0.675676,0.713636,f1,0.216685,0.153826,3.187908,0.216685,0.153826,3.187908,1,True,2
7,RandomForestEntr,0.645333,0.636842,f1,0.219728,0.224266,0.829021,0.219728,0.224266,0.829021,1,True,4
8,RandomForestGini,0.641509,0.648936,f1,0.232565,0.133038,0.917915,0.232565,0.133038,0.917915,1,True,3
9,ExtraTreesEntr,0.607242,0.601671,f1,0.228487,0.194955,0.923465,0.228487,0.194955,0.923465,1,True,7


In [7]:
# Permutation importance over all 550 features is slow (the recorded run took
# about 19 minutes), so keep the full table in a variable instead of
# discarding everything but the displayed rows.
feature_importance_test = predictor.feature_importance(test)

# Show the ten features with the highest importance.
feature_importance_test.head(10)

Computing feature importance via permutation shuffling for 550 features using 522 rows with 5 shuffle sets...
	5364.84s	= Expected runtime (1072.97s per shuffle set)
	1157.97s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B431,0.012648,0.004389,0.001493,5,0.021686,0.00361
B415,0.010818,0.005392,0.005468,5,0.021919,-0.000284
B206,0.010809,0.006742,0.011532,5,0.02469,-0.003073
B306,0.010587,0.002126,0.000185,5,0.014964,0.006209
B541,0.010237,0.006553,0.012528,5,0.023731,-0.003256
B200,0.010217,0.001269,2.8e-05,5,0.012828,0.007605
B142,0.01,0.003446,0.001454,5,0.017095,0.002905
B547,0.008749,0.005029,0.008846,5,0.019104,-0.001606
B383,0.008723,0.004472,0.006023,5,0.01793,-0.000485
B424,0.008455,0.003325,0.002363,5,0.015301,0.001608
