In [1]:
import flexynesis 
import torch, os
# Use up to 12 CPU threads for PyTorch, but never more than the machine
# reports, so smaller hosts are not oversubscribed. `os.cpu_count()` can return
# None, hence the fallback to 1.
torch.set_num_threads(min(12, os.cpu_count() or 1))

# Modeling Breast Cancer Subtypes 

In [2]:
# Fetch and unpack the pre-processed METABRIC archive, but only when the
# extracted directory is not already present, so re-running this cell does not
# download it again.
if not os.path.exists("brca_metabric_processed"):
    # IPython shell escape: download to brca_metabric.tgz and, only if wget succeeds (&&), extract it here.
    !wget -O brca_metabric.tgz "https://bimsbstatic.mdc-berlin.de/akalin/buyar/flexynesis-benchmark-datasets/brca_metabric_processed.tgz" && tar -xzvf brca_metabric.tgz

In [16]:
# Configure the importer for the extracted dataset directory, using the 'gex'
# and 'cna' data types and keeping them as separate inputs (concatenate=False).
# NOTE(review): min_features / top_percentile look like feature-selection
# thresholds -- confirm their exact meaning against the flexynesis docs.
data_importer = flexynesis.DataImporter(
    path="./brca_metabric_processed/",
    data_types=["gex", "cna"],
    concatenate=False,
    min_features=1000,
    top_percentile=0.1,
)

In [None]:
# Run the configured importer; the call returns two objects, unpacked here as
# the training and test datasets used by every later cell.
train_dataset, test_dataset = data_importer.import_data()

importing  ./brca_metabric_processed/train/clin.csv
importing  ./brca_metabric_processed/train/gex.csv
importing  ./brca_metabric_processed/train/cna.csv
importing  ./brca_metabric_processed/test/clin.csv
importing  ./brca_metabric_processed/test/gex.csv
importing  ./brca_metabric_processed/test/cna.csv
Imputing NA values to median of features, affected # of features  12  # of rows: 9


100%|██████████| 9/9 [00:00<00:00, 1847.44it/s]

Number of NA values:  0
DataFrame gex - Removed 0 features.





Imputing NA values to median of features, affected # of features  208  # of rows: 162


 81%|████████▏ | 132/162 [00:09<00:02, 13.57it/s]

In [None]:
# Show the keys of the training dataset's `ann` mapping.
# Presumably these are the clinical variables usable as targets -- confirm.
train_dataset.ann.keys()

In [None]:
# Set up a hyperparameter search for a DirectPred model on the training data.
# The three target variables are passed as one comma-separated string, and the
# search-space entry "DirectPred" is read from ./conf.yaml.
# NOTE(review): n_iter presumably is the number of configurations tried -- confirm.
tuner = flexynesis.HyperparameterTuning(
    train_dataset,
    model_class=flexynesis.DirectPred,
    target_variables="CLAUDIN_SUBTYPE,CHEMOTHERAPY,THREEGENE",
    batch_variables=None,
    config_name="DirectPred",
    config_path="./conf.yaml",
    n_iter=10,
    plot_losses=True,
)

In [None]:
# Run the search configured on `tuner`; the call returns two objects, unpacked
# here as the resulting model and its hyperparameters.
model, best_params = tuner.perform_tuning()
# Bare last expression so the notebook displays the selected hyperparameters.
best_params

In [None]:
# Display the tuned model's repr for inspection.
model

In [None]:
# Predict on the held-out test dataset.
# Presumably the result is a dict keyed by target variable (the name suggests so) -- confirm.
y_pred_dict = model.predict(test_dataset)

In [None]:
# Score the test-set predictions against the test dataset.
metrics_df = flexynesis.evaluate_wrapper(y_pred_dict, test_dataset)
# Bare last expression so the notebook displays the metrics.
metrics_df

In [None]:
# Compute feature importances for every target variable of the model. Later
# cells read the results from `model.feature_importances`.
# NOTE(review): `steps` presumably is the number of attribution steps -- confirm.
for target_var in model.target_variables:
    model.compute_feature_importance(target_var, steps=30)

In [None]:
def get_important_features(model, var, top=20):
    """Return the `top` highest-importance rows per target class for `var`.

    Parameters
    ----------
    model : object
        Must expose `feature_importances`, a mapping from variable name to a
        DataFrame with at least the columns `target_class` and `importance`.
    var : str
        Key to look up in `model.feature_importances`.
    top : int, default 20
        Maximum number of rows kept for each distinct `target_class`.

    Returns
    -------
    pandas.DataFrame or None
        Rows grouped by `target_class` (groups in sorted key order), each group
        ordered by descending `importance`, with a fresh 0..n-1 index and all
        original columns kept. Returns None, after printing a message, when
        `var` has no entry in `model.feature_importances`.
    """
    # Local import: the notebook's import cell does not bring pandas into scope.
    import pandas as pd

    # Ensure that the variable exists in the model's feature importances
    if var not in model.feature_importances:
        print(f"No feature importances found for variable: {var}")
        return None

    # Fetch the dataframe for the specified variable
    df_imp = model.feature_importances[var]

    # Select per group explicitly rather than via groupby(...).apply(...).
    # apply() on the grouping column is deprecated in recent pandas, and once
    # grouping columns are excluded, `target_class` would survive only in the
    # index and be discarded by reset_index(drop=True).
    per_class = [
        group.nlargest(top, 'importance')
        for _, group in df_imp.groupby('target_class')
    ]

    # pd.concat raises on an empty list; return an empty frame with the same columns.
    if not per_class:
        return df_imp.iloc[0:0].reset_index(drop=True)

    return pd.concat(per_class).reset_index(drop=True)


In [None]:
# Target variable to inspect; later cells reuse `f` to colour the plots.
f = 'THREEGENE'
# Keep the 5 most important features per class of that target.
# NOTE: get_important_features returns None when no importances exist for `f`,
# which would break the later cell that calls top_features.groupby(...).
top_features = get_important_features(model, f, top=5)

In [None]:
# Alias the test dataset; later plotting cells read labels from `ds.ann`.
ds = test_dataset
# Pass the dataset through the model's transform.
# Presumably E holds per-sample embeddings -- confirm against flexynesis.
E = model.transform(ds)

In [None]:
# UMAP plot of the model-transformed test data, coloured by the labels of the
# selected target `f`, treated as categorical.
flexynesis.plot_dim_reduced(E, ds.ann[f].numpy(), color_type = 'categorical', method='umap')

In [None]:
# Collect the selected feature names per layer as {layer: [name, ...]}, then
# subset the test dataset to those features.
features_by_layer = top_features.groupby('layer')['name'].apply(list).to_dict()
df = flexynesis.subset_assays_by_features(test_dataset, features_by_layer)

In [None]:
# Display the selected top features (at most 5 rows per target class).
top_features

In [None]:
# Display the test data restricted to the selected features.
# NOTE(review): if this is a large DataFrame, consider df.head() -- confirm the type first.
df

In [None]:
# Same UMAP plot as for the model embeddings, but computed from the
# top-feature subset `df`, coloured by the same target `f`.
flexynesis.plot_dim_reduced(df, ds.ann[f].numpy(), color_type = 'categorical', method='umap')