In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw table from disk; each row carries gene metadata, the
# per-position base columns (B1, B2, ...) and a boolean `label` column.
df = pd.read_csv(
    '../../data/data_ze.csv'
)

# Show a five-row preview together with the (rows, columns) shape.
(df.head(), df.shape)

(             gene_id  chromosome  global_position  First_Exon_Start B1 B2 B3  \
 0  ENSG00000260861.6          20          1539145              1000  c  t  g   
 1  ENSG00000260861.6          20          1550939             12794  g  c  c   
 2  ENSG00000260861.6          20          1550961             12816  a  g  g   
 3  ENSG00000171873.8          20          4219631              1000  g  g  t   
 4  ENSG00000101282.9          20           957453              1000  g  a  g   
 
   B4 B5 B6  ... B542 B543 B544 B545 B546 B547 B548 B549 B550 label  
 0  a  a  t  ...    g    a    g    c    a    g    g    c    t  True  
 1  a  a  g  ...    g    t    c    g    t    a    a    g    c  True  
 2  t  c  t  ...    t    g    t    a    a    a    t    c    a  True  
 3  g  c  g  ...    g    a    g    c    c    g    c    c    g  True  
 4  g  a  c  ...    c    g    g    c    g    c    g    c    c  True  
 
 [5 rows x 555 columns],
 (2606, 555))

In [4]:
# Model inputs: every column whose name starts with 'B' (the per-position
# base columns in the loaded table) plus the target column.
seq_cols = [col for col in df.columns if col.startswith('B')]

# Take an explicit copy so df_model is an independent frame; assigning a
# column on a bare slice of df triggers pandas' SettingWithCopyWarning.
df_model = df[seq_cols + ['label']].copy()

# Encode the target as 1/0. Going through astype(str).str.lower() accepts
# both boolean values and 'True'/'False' strings.
df_model['label'] = (
    df_model['label'].astype(str).str.lower().map({'true': 1, 'false': 0})
)

# Series.map() turns every value missing from the mapping into NaN. Stop here
# rather than letting NaN labels flow into the stratified splits below.
n_unmapped = int(df_model['label'].isna().sum())
if n_unmapped:
    raise ValueError(
        f"{n_unmapped} row(s) have a label that is neither true nor false"
    )

# First split: hold out 20% of all rows as the test set, stratified on label.
train_val, test = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model['label']
)

# Second split: 25% of the remaining 80% becomes validation, i.e. roughly a
# 60/20/20 train/val/test partition overall.
train, val = train_test_split(
    train_val,
    test_size=0.25,
    random_state=42,
    stratify=train_val['label']
)

train.shape, val.shape, test.shape, train_val.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['label'] = (


((1563, 551), (521, 551), (522, 551), (2084, 551))

In [5]:
# Binary classifier on the `label` column, scored with F1; model artifacts
# are written under ../models/autogluon_ze.
predictor = TabularPredictor(
    path='../models/autogluon_ze',
    label='label',
    eval_metric='f1',
    problem_type='binary',
)

# Fit on the training split and pass the validation split as tuning data.
# time_limit is a budget in seconds (the training log reports it as 200s).
predictor.fit(
    train,
    tuning_data=val,
    presets='medium_quality_faster_train',
    time_limit=200,
)


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.6.0
CUDA Version:       12.6
GPU Memory:         GPU 0: 6.00/6.00 GB
Total GPU Memory:   Free: 6.00 GB, Allocated: 0.00 GB, Total: 6.00 GB
GPU Count:          1
Memory Avail:       8.65 GB / 11.55 GB (74.9%)
Disk Space Avail:   640.43 GB / 951.65 GB (67.3%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 200s
AutoGluon will save models to "/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ze"
Train Data Rows:    1563
Train Data Columns: 550
Tuning Data Rows:    521
Tuning Data Columns: 550
Label Column:       

[1000]	valid_set's binary_logloss: 0.552784	valid_set's f1: 0.707207


	0.7136	 = Validation score   (f1)
	6.81s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: RandomForestGini ... Training model for up to 185.32s of the 185.32s of remaining time.
	Fitting with cpus=12, gpus=0
	0.6489	 = Validation score   (f1)
	1.59s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: RandomForestEntr ... Training model for up to 183.26s of the 183.26s of remaining time.
	Fitting with cpus=12, gpus=0
	0.6368	 = Validation score   (f1)
	1.73s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 181.10s of the 181.09s of remaining time.
	Fitting with cpus=6, gpus=0
	0.6841	 = Validation score   (f1)
	122.12s	 = Training   runtime
	0.29s	 = Validation runtime
Fitting model: ExtraTreesGini ... Training model for up to 58.55s of the 58.55s of remaining time.
	Fitting with cpus=12, gpus=0
	0.6558	 = Validation score   (f1)
	1.55s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model:

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f3921708a10>

In [6]:
# Score the fitted predictor on the held-out test split; keep the returned
# metrics in `perf` and show them as the cell output.
perf = predictor.evaluate(data=test)
perf

{'f1': 0.6828193832599119,
 'accuracy': 0.7241379310344828,
 'balanced_accuracy': np.float64(0.718349358974359),
 'mcc': 0.4398162563985052,
 'roc_auc': np.float64(0.8229166666666665),
 'precision': 0.7045454545454546,
 'recall': 0.6623931623931624}

In [7]:
# Per-model comparison on the test split; silent=True suppresses the printed
# copy so only the returned table is rendered as the cell output.
predictor.leaderboard(data=test, silent=True)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.687225,0.684096,f1,0.192505,0.290768,122.122854,0.192505,0.290768,122.122854,1,True,5
1,LightGBMXT,0.686364,0.710112,f1,0.319709,0.226084,3.434854,0.319709,0.226084,3.434854,1,True,1
2,WeightedEnsemble_L2,0.682819,0.751678,f1,1.097463,1.162435,184.019024,0.022453,0.00211,0.305174,2,True,9
3,NeuralNetFastAI,0.677966,0.713978,f1,0.722315,0.613078,54.781353,0.722315,0.613078,54.781353,1,True,8
4,LightGBM,0.675676,0.713636,f1,0.16019,0.256478,6.809643,0.16019,0.256478,6.809643,1,True,2
5,RandomForestEntr,0.645333,0.636842,f1,-1.592195,0.208427,1.731489,-1.592195,0.208427,1.731489,1,True,4
6,RandomForestGini,0.641509,0.648936,f1,0.405172,0.246695,1.587389,0.405172,0.246695,1.587389,1,True,3
7,ExtraTreesEntr,0.607242,0.601671,f1,0.55704,0.327561,1.192445,0.55704,0.327561,1.192445,1,True,7
8,ExtraTreesGini,0.606557,0.655827,f1,0.423162,0.19375,1.545378,0.423162,0.19375,1.545378,1,True,6


In [8]:
# Permutation feature importance computed on the test split; show the first
# ten rows of the returned table. NOTE: the recorded run took ~650s.
predictor.feature_importance(data=test).head(n=10)

Computing feature importance via permutation shuffling for 550 features using 522 rows with 5 shuffle sets...
	4729.31s	= Expected runtime (945.86s per shuffle set)
	647.38s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B317,0.013497,0.008415,0.011517,5,0.030823,-0.003829
B493,0.011705,0.007655,0.013401,5,0.027468,-0.004057
B469,0.00965,0.003061,0.001067,5,0.015952,0.003348
B533,0.008763,0.002941,0.001318,5,0.014818,0.002708
B142,0.008549,0.002422,0.000696,5,0.013535,0.003563
B523,0.008471,0.005655,0.014292,5,0.020116,-0.003173
B520,0.008446,0.006327,0.020266,5,0.021472,-0.00458
B365,0.008419,0.001263,5.9e-05,5,0.011019,0.00582
B219,0.008178,0.003961,0.004955,5,0.016334,2.1e-05
B415,0.007601,0.002821,0.001912,5,0.01341,0.001793
