In [1]:
import os
import numpy as np
import pickle
import pandas as pd
import gc

### Convert pickle files into .parquet batches (100 files per batch)

In [4]:
def create_parquet_batches(batch_size, in_folder, out_folder):
    """Convert a folder of pickle files into batched .parquet files.

    Every ``.pkl`` file in ``in_folder`` is unpickled and turned into one row:
    the file's base name (column ``"file"``), the first key of the unpickled
    mapping (column ``"id"``), and every item of the value stored under that
    key as further columns. Items named ``"file"`` or ``"id"`` in the inner
    mapping overwrite the two bookkeeping columns.

    Rows are written to ``out_folder/batch_<k>.parquet`` each time
    ``batch_size`` files have been read, plus one final partial batch for any
    remainder. A progress line is printed for each file written.

    Parameters
    ----------
    batch_size : int
        Number of pickle files per parquet file; must be positive.
    in_folder : str
        Directory scanned (non-recursively) for ``*.pkl`` files.
    out_folder : str
        Destination directory; created if it does not exist.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive, or a pickle file holds an empty
        container (so there is no first key to use as the id).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    os.makedirs(out_folder, exist_ok=True)
    # os.listdir() returns entries in arbitrary order; sort so that the
    # contents of each batch are reproducible across runs and machines.
    files = sorted(
        os.path.join(in_folder, f) for f in os.listdir(in_folder) if f.endswith(".pkl")
    )

    results = []

    for i, file in enumerate(files):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only run this on pickles from a trusted source.
        with open(file, "rb") as f:
            data = pickle.load(f)

        try:
            outer_key = next(iter(data))
        except StopIteration:
            # A bare StopIteration would be confusing; say which file is bad.
            raise ValueError(f"{file} contains no entries") from None
        inner_dict = data[outer_key]

        row = {"file": os.path.basename(file), "id": outer_key}
        row.update(inner_dict.items())
        results.append(row)

        # Flush on every full batch and once more for the trailing remainder.
        if (i + 1) % batch_size == 0 or (i + 1) == len(files):
            df = pd.DataFrame(results)
            output_path = os.path.join(out_folder, f"batch_{i // batch_size}.parquet")
            df.to_parquet(output_path, index=False)
            print(f"Сохранён {output_path} ({len(results)} строк)")

            results.clear()
            # Drop the frame reference first so the collection below can
            # actually reclaim the batch's memory.
            del df
            gc.collect()

In [39]:
# Convert the training pickles into parquet files of 100 rows each.
# NOTE(review): this writes to "train_parquet", but the loading cell further
# down reads from "train_parquet/subset", and no equivalent call for the test
# set is visible in this notebook — confirm the folder layout.
create_parquet_batches(100, "barcodes/train/subset", "train_parquet")

Сохранён train_parquet\batch_0.parquet (100 строк)
Сохранён train_parquet\batch_1.parquet (100 строк)
Сохранён train_parquet\batch_2.parquet (100 строк)
Сохранён train_parquet\batch_3.parquet (100 строк)
Сохранён train_parquet\batch_4.parquet (100 строк)
Сохранён train_parquet\batch_5.parquet (100 строк)
Сохранён train_parquet\batch_6.parquet (100 строк)
Сохранён train_parquet\batch_7.parquet (100 строк)
Сохранён train_parquet\batch_8.parquet (100 строк)
Сохранён train_parquet\batch_9.parquet (100 строк)
Сохранён train_parquet\batch_10.parquet (100 строк)
Сохранён train_parquet\batch_11.parquet (100 строк)
Сохранён train_parquet\batch_12.parquet (100 строк)
Сохранён train_parquet\batch_13.parquet (100 строк)
Сохранён train_parquet\batch_14.parquet (100 строк)
Сохранён train_parquet\batch_15.parquet (100 строк)
Сохранён train_parquet\batch_16.parquet (100 строк)
Сохранён train_parquet\batch_17.parquet (100 строк)
Сохранён train_parquet\batch_18.parquet (100 строк)
Сохранён train_parquet

### Concatenate several .parquet files into one DataFrame

In [4]:
def concat_parquet(folder):
    """Read every ``.parquet`` file in ``folder`` into a single DataFrame.

    Files are read in sorted (lexicographic) path order and stacked row-wise
    with a fresh 0..n-1 index. Note that lexicographic order places
    ``batch_10`` before ``batch_2``; the order is deterministic but not
    numeric.

    Parameters
    ----------
    folder : str
        Directory scanned (non-recursively) for ``*.parquet`` files.

    Returns
    -------
    pandas.DataFrame
        All rows from all parquet files found.

    Raises
    ------
    ValueError
        If ``folder`` contains no ``.parquet`` files.
    """
    # os.listdir() order is arbitrary; sorting keeps row order reproducible.
    files = sorted(
        os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".parquet")
    )
    if not files:
        # pd.concat([]) would also raise ValueError, but without naming the folder.
        raise ValueError(f"No .parquet files found in {folder!r}")

    dfs = [pd.read_parquet(f) for f in files]
    return pd.concat(dfs, ignore_index=True)

In [6]:
# Load all training parquet batches into one DataFrame.
# NOTE(review): reads "train_parquet/subset", whereas the conversion cell
# above writes to "train_parquet" — confirm which folder holds the data.
train = concat_parquet("train_parquet/subset")

In [5]:
# Load all test parquet batches into one DataFrame.
# NOTE(review): no cell in this notebook creates "test_parquet/subset";
# presumably it was produced by an earlier, unsaved run — confirm.
test = concat_parquet("test_parquet/subset")

### Separate barcodes and labels

In [6]:
# Pull the barcode column and the three label columns (BP = Biological
# Process, MF = Molecular Function, CC = Cellular Component) out of each split.
barcodes_train, BP_train, MF_train, CC_train = (
    train[column] for column in ("barcodes", "label_BP", "label_MF", "label_CC")
)

barcodes_test, BP_test, MF_test, CC_test = (
    test[column] for column in ("barcodes", "label_BP", "label_MF", "label_CC")
)

### Vectorize barcodes using Persistence Landscapes

In [9]:
from gtda.diagrams import PersistenceLandscape

def vectorization(barcodes, n_layers, n_bins):
    """Turn each barcode into a flattened persistence-landscape vector.

    For every element of ``barcodes`` a new ``PersistenceLandscape``
    transformer (configured with ``n_layers`` and ``n_bins``) is fitted on
    that element alone, and its transformed output is flattened to 1-D.
    A progress line is printed for every 100th sample, starting at index 0.

    Returns a NumPy array built from the per-sample vectors (2-D when all
    vectors have the same length — presumably always the case; TODO confirm).
    """
    landscapes = []
    for i, raw in enumerate(barcodes):
        # Rebuild the nested per-sample structure as one dense ndarray before
        # handing it to the transformer.
        rows = np.vstack(raw)
        diagram = np.array([np.stack(row) for row in rows])
        transformer = PersistenceLandscape(n_layers=n_layers, n_bins=n_bins)
        landscapes.append(transformer.fit_transform(diagram).flatten())
        if not i % 100:
            print(f"✅ Обработано {i} образцов")
    # Round-trip through Python lists so the final dtype is inferred from
    # plain Python numbers rather than inherited from the landscape arrays.
    return np.array([vec.tolist() for vec in landscapes])

In [14]:
# Landscape features for the training barcodes (n_layers=2, n_bins=10).
train_tda = vectorization(barcodes_train,2,10)

✅ Обработано 0 образцов
✅ Обработано 100 образцов
✅ Обработано 200 образцов
✅ Обработано 300 образцов
✅ Обработано 400 образцов
✅ Обработано 500 образцов
✅ Обработано 600 образцов
✅ Обработано 700 образцов
✅ Обработано 800 образцов
✅ Обработано 900 образцов
✅ Обработано 1000 образцов


In [10]:
# Landscape features for the test barcodes, using the same n_layers/n_bins
# as the training call so the feature layout matches.
test_tda = vectorization(barcodes_test,2,10)

✅ Обработано 0 образцов
✅ Обработано 100 образцов
✅ Обработано 200 образцов
✅ Обработано 300 образцов


In [19]:
# Stack the CC label column of each split into one NumPy array
# (assumes every entry has the same length — TODO confirm).
CC_train, CC_test = (np.vstack(labels) for labels in (CC_train, CC_test))

In [12]:
# Stack the MF label column of each split into one NumPy array
# (assumes every entry has the same length — TODO confirm).
MF_train, MF_test = (np.vstack(labels) for labels in (MF_train, MF_test))

In [22]:
# Stack the BP label column of each split into one NumPy array
# (assumes every entry has the same length — TODO confirm).
BP_train, BP_test = (np.vstack(labels) for labels in (BP_train, BP_test))

### LightGBM for Cellular Component (CC) label

In [25]:
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier

# Base LightGBM classifier; MultiOutputClassifier fits one copy per column of
# CC_train. The base model is kept single-threaded (n_jobs=1) while the
# wrapper parallelises across label columns (n_jobs=-1).
model = LGBMClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_depth=3,
    class_weight='balanced',
    n_jobs=1,
    random_state=42,
)
clf_cc = MultiOutputClassifier(model, n_jobs=-1)
clf_cc.fit(train_tda, CC_train)

In [28]:
import joblib

# Persist the fitted CC classifier to the current working directory.
joblib.dump(clf_cc, "cc_clf.pkl")

['cc_clf.pkl']

In [29]:
from metric.f1_max_score import count_f1_max

# Predict CC labels for the test features and score them against CC_test.
# NOTE(review): predict() yields hard class labels. If count_f1_max sweeps
# decision thresholds over scores (as a "max-F1" metric usually does),
# predict_proba output may be what it expects — confirm against the metric.
preds_cc = clf_cc.predict(test_tda)
f1_CC = count_f1_max(preds_cc,CC_test)

In [30]:
# Show the CC score returned by count_f1_max.
f1_CC

0.34513187408447266

### LightGBM for Molecular Function (MF) label

In [31]:
# Base LightGBM classifier; MultiOutputClassifier fits one copy per column of
# MF_train. The base model is kept single-threaded (n_jobs=1) while the
# wrapper parallelises across label columns (n_jobs=-1).
model = LGBMClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_depth=3,
    class_weight='balanced',
    n_jobs=1,
    random_state=42,
)
clf_mf = MultiOutputClassifier(model, n_jobs=-1)
clf_mf.fit(train_tda, MF_train)

In [32]:
# Persist the fitted MF classifier to the current working directory.
joblib.dump(clf_mf, "mf_clf.pkl")

['mf_clf.pkl']

In [33]:
# Predict MF labels for the test features and score them against MF_test.
# NOTE(review): predict() yields hard class labels. If count_f1_max sweeps
# decision thresholds over scores (as a "max-F1" metric usually does),
# predict_proba output may be what it expects — confirm against the metric.
preds_mf = clf_mf.predict(test_tda)
f1_MF = count_f1_max(preds_mf,MF_test)

In [34]:
# Show the MF score returned by count_f1_max.
f1_MF

0.24193865060806274

### LightGBM for Biological Process (BP) label

In [35]:
# Base LightGBM classifier; MultiOutputClassifier fits one copy per column of
# BP_train. The base model is kept single-threaded (n_jobs=1) while the
# wrapper parallelises across label columns (n_jobs=-1).
model = LGBMClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_depth=3,
    class_weight='balanced',
    n_jobs=1,
    random_state=42,
)
clf_bp = MultiOutputClassifier(model, n_jobs=-1)
clf_bp.fit(train_tda, BP_train)



In [36]:
# Persist the fitted BP classifier to the current working directory.
joblib.dump(clf_bp, "bp_clf.pkl")

['bp_clf.pkl']

In [37]:
# Predict BP labels for the test features and score them against BP_test.
# NOTE(review): predict() yields hard class labels. If count_f1_max sweeps
# decision thresholds over scores (as a "max-F1" metric usually does),
# predict_proba output may be what it expects — confirm against the metric.
preds_bp = clf_bp.predict(test_tda)
f1_BP = count_f1_max(preds_bp,BP_test)

In [38]:
# Show the BP score returned by count_f1_max.
f1_BP

0.22964271903038025