In [None]:
import os
from collections import Counter
import datetime
import pickle
import numpy as np
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tqdm import tqdm

# Load datasets
train_dataset = load_from_disk("example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset")
eval_dataset = load_from_disk("example_input_files/cell_classification/cell_type_annotation/cell_type_test_data.dataset")

dataset_list, evalset_list, organ_list, target_dict_list = [], [], [], []

for organ in Counter(train_dataset["organ_major"]).keys():
    if organ in ["bone_marrow"]:  
        continue
    elif organ == "immune":
        organ_ids = ["immune", "bone_marrow"]
        organ_list += ["immune"]
    else:
        organ_ids = [organ]
        organ_list += [organ]
    
    def if_organ(example):
        return example["organ_major"] in organ_ids
    trainset_organ = train_dataset.filter(if_organ, num_proc=16)
    
    celltype_counter = Counter(trainset_organ["cell_type"])
    total_cells = sum(celltype_counter.values())
    cells_to_keep = [k for k, v in celltype_counter.items() if v > (0.005 * total_cells)]
    
    def if_not_rare_celltype(example):
        return example["cell_type"] in cells_to_keep
    trainset_organ_subset = trainset_organ.filter(if_not_rare_celltype, num_proc=16)
    
    trainset_organ_shuffled = trainset_organ_subset.shuffle(seed=42)
    trainset_organ_shuffled = trainset_organ_shuffled.rename_column("cell_type", "label")
    trainset_organ_shuffled = trainset_organ_shuffled.remove_columns("organ_major")
    
    target_names = list(Counter(trainset_organ_shuffled["label"]).keys())
    target_name_id_dict = dict(zip(target_names, range(len(target_names))))
    target_dict_list.append(target_name_id_dict)
    
    def classes_to_ids(example):
        example["label"] = target_name_id_dict[example["label"]]
        return example
    labeled_trainset = trainset_organ_shuffled.map(classes_to_ids, num_proc=16)
    
    labeled_train_split = labeled_trainset.select(range(0, round(len(labeled_trainset) * 0.8)))
    labeled_eval_split = labeled_trainset.select(range(round(len(labeled_trainset) * 0.8), len(labeled_trainset)))
    
    trained_labels = list(Counter(labeled_train_split["label"]).keys())
    def if_trained_label(example):
        return example["label"] in trained_labels
    labeled_eval_split_subset = labeled_eval_split.filter(if_trained_label, num_proc=16)
    
    dataset_list.append(labeled_train_split)
    evalset_list.append(labeled_eval_split_subset)

trainset_dict = dict(zip(organ_list, dataset_list))
traintargetdict_dict = dict(zip(organ_list, target_dict_list))
evalset_dict = dict(zip(organ_list, evalset_list))

In [2]:
def extract_features(dataset):
    seqs = dataset["input_ids"]
    max_len = max(len(s) for s in seqs)
    padded = np.zeros((len(seqs), max_len), dtype=np.int64)
    for i, s in tqdm(enumerate(seqs), desc="padding...", colour="blue"):
        padded[i, :len(s)] = s
    X = np.mean(padded, axis=1)[:, None]  # simple mean pooling
    y = np.array(dataset["label"])
    return X, y

results = {}

for organ in organ_list:
    print(f"\n===== Organ: {organ} =====")
    organ_trainset = trainset_dict[organ]
    organ_evalset = evalset_dict[organ]
    
    X_train, y_train = extract_features(organ_trainset)
    X_test, y_test = extract_features(organ_evalset)
    
    classifiers = {
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
        # "SVM": make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True, random_state=42)),
        "LogisticRegression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, multi_class="multinomial"))
    }
    
    organ_results = {}
    for clf_name, clf in tqdm(classifiers.items(), desc=f"{organ} models", leave=False):
        print(f"Training {clf_name}...")
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        acc = accuracy_score(y_test, preds)
        macro_f1 = f1_score(y_test, preds, average="macro")
        weighted_f1 = f1_score(y_test, preds, average="weighted")
        organ_results[clf_name] = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "weighted_f1": weighted_f1
        }
        print(f"{clf_name} - Acc: {acc:.4f}, Macro F1: {macro_f1:.4f}, Weighted F1: {weighted_f1:.4f}")
    
    results[organ] = organ_results



===== Organ: spleen =====


padding...: 12330it [00:00, 76763.11it/s]
padding...: 3083it [00:00, 75593.59it/s]
spleen models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


                                                            

RandomForest - Acc: 0.5864, Macro F1: 0.1947, Weighted F1: 0.5845
Training LogisticRegression...
LogisticRegression - Acc: 0.7415, Macro F1: 0.1419, Weighted F1: 0.6331

===== Organ: kidney =====


padding...: 35199it [00:00, 54605.10it/s]
padding...: 8800it [00:00, 57420.64it/s]
kidney models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


kidney models:  50%|█████     | 1/2 [00:01<00:01,  1.65s/it]

RandomForest - Acc: 0.1755, Macro F1: 0.0826, Weighted F1: 0.1772
Training LogisticRegression...


                                                            

LogisticRegression - Acc: 0.3287, Macro F1: 0.0713, Weighted F1: 0.2267

===== Organ: lung =====


padding...: 26098it [00:00, 63650.72it/s]
padding...: 6525it [00:00, 61571.18it/s]
lung models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


lung models:  50%|█████     | 1/2 [00:00<00:00,  1.05it/s]

RandomForest - Acc: 0.2077, Macro F1: 0.0910, Weighted F1: 0.2066
Training LogisticRegression...


                                                          

LogisticRegression - Acc: 0.3099, Macro F1: 0.0761, Weighted F1: 0.2399

===== Organ: brain =====


padding...: 10656it [00:00, 67287.79it/s]
padding...: 2664it [00:00, 75149.65it/s]
brain models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


brain models:  50%|█████     | 1/2 [00:00<00:00,  2.21it/s]

RandomForest - Acc: 0.7459, Macro F1: 0.1863, Weighted F1: 0.7495
Training LogisticRegression...


                                                           

LogisticRegression - Acc: 0.8622, Macro F1: 0.1543, Weighted F1: 0.7985

===== Organ: placenta =====


padding...: 7415it [00:00, 54391.55it/s]
padding...: 1854it [00:00, 57379.91it/s]
placenta models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


                                                              

RandomForest - Acc: 0.6009, Macro F1: 0.3471, Weighted F1: 0.5959
Training LogisticRegression...
LogisticRegression - Acc: 0.7406, Macro F1: 0.2836, Weighted F1: 0.6302

===== Organ: immune =====


padding...: 20562it [00:00, 74370.86it/s]
padding...: 5140it [00:00, 70895.86it/s]
immune models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


immune models:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s]

RandomForest - Acc: 0.2008, Macro F1: 0.1312, Weighted F1: 0.2005
Training LogisticRegression...


                                                            

LogisticRegression - Acc: 0.2749, Macro F1: 0.0921, Weighted F1: 0.1488

===== Organ: large_intestine =====


padding...: 39678it [00:00, 74202.67it/s]
padding...: 9920it [00:00, 77582.36it/s]
large_intestine models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


large_intestine models:  50%|█████     | 1/2 [00:01<00:01,  1.47s/it]

RandomForest - Acc: 0.2541, Macro F1: 0.0983, Weighted F1: 0.2556
Training LogisticRegression...


                                                                     

LogisticRegression - Acc: 0.3095, Macro F1: 0.0843, Weighted F1: 0.2555

===== Organ: pancreas =====


padding...: 21934it [00:00, 63776.95it/s]
padding...: 5484it [00:00, 71125.95it/s]
pancreas models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


pancreas models:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s]

RandomForest - Acc: 0.2438, Macro F1: 0.1438, Weighted F1: 0.2424
Training LogisticRegression...


                                                              

LogisticRegression - Acc: 0.3485, Macro F1: 0.1330, Weighted F1: 0.2601

===== Organ: liver =====


padding...: 22427it [00:00, 64230.25it/s]
padding...: 5607it [00:00, 62494.75it/s]
liver models:   0%|          | 0/2 [00:00<?, ?it/s]

Training RandomForest...


liver models:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s]

RandomForest - Acc: 0.2814, Macro F1: 0.1262, Weighted F1: 0.2809
Training LogisticRegression...


                                                           

LogisticRegression - Acc: 0.3512, Macro F1: 0.0738, Weighted F1: 0.2633




In [4]:
def extract_features(dataset):
    seqs = dataset["input_ids"]
    max_len = max(len(s) for s in seqs)
    padded = np.zeros((len(seqs), max_len), dtype=np.int64)
    for i, s in tqdm(enumerate(seqs), desc="padding...", colour="blue"):
        padded[i, :len(s)] = s
    X = np.mean(padded, axis=1)[:, None]  # simple mean pooling
    y = np.array(dataset["label"])
    return X, y

results = {}

for organ in organ_list:
    print(f"\n===== Organ: {organ} =====")
    organ_trainset = trainset_dict[organ]
    organ_evalset = evalset_dict[organ]
    
    X_train, y_train = extract_features(organ_trainset)
    X_test, y_test = extract_features(organ_evalset)
    
    classifiers = {
        # "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
        "SVM": make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True, random_state=42)),
        # "LogisticRegression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, multi_class="multinomial"))
    }
    
    organ_results = {}
    for clf_name, clf in tqdm(classifiers.items(), desc=f"{organ} models", leave=False):
        print(f"Training {clf_name}...")
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        acc = accuracy_score(y_test, preds)
        macro_f1 = f1_score(y_test, preds, average="macro")
        weighted_f1 = f1_score(y_test, preds, average="weighted")
        organ_results[clf_name] = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "weighted_f1": weighted_f1
        }
        print(f"{clf_name} - Acc: {acc:.4f}, Macro F1: {macro_f1:.4f}, Weighted F1: {weighted_f1:.4f}")
    
    results[organ] = organ_results



===== Organ: spleen =====


padding...: 12330it [00:00, 74149.68it/s]
padding...: 3083it [00:00, 79566.32it/s]
spleen models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                            

SVM - Acc: 0.7434, Macro F1: 0.1421, Weighted F1: 0.6340

===== Organ: kidney =====


padding...: 35199it [00:00, 54654.42it/s]
padding...: 8800it [00:00, 54786.08it/s]
kidney models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                             

SVM - Acc: 0.3340, Macro F1: 0.0731, Weighted F1: 0.2334

===== Organ: lung =====


padding...: 26098it [00:00, 63652.31it/s]
padding...: 6525it [00:00, 63915.46it/s]
lung models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                           

SVM - Acc: 0.3137, Macro F1: 0.0773, Weighted F1: 0.2438

===== Organ: brain =====


padding...: 10656it [00:00, 73057.45it/s]
padding...: 2664it [00:00, 75210.35it/s]
brain models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                           

SVM - Acc: 0.8622, Macro F1: 0.1543, Weighted F1: 0.7985

===== Organ: placenta =====


padding...: 7415it [00:00, 54724.23it/s]
padding...: 1854it [00:00, 57124.05it/s]
placenta models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                              

SVM - Acc: 0.7406, Macro F1: 0.2836, Weighted F1: 0.6302

===== Organ: immune =====


padding...: 20562it [00:00, 74360.35it/s]
padding...: 5140it [00:00, 73610.91it/s]
immune models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                            

SVM - Acc: 0.2969, Macro F1: 0.1286, Weighted F1: 0.2058

===== Organ: large_intestine =====


padding...: 39678it [00:00, 78336.69it/s]
padding...: 9920it [00:00, 77432.63it/s]
large_intestine models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                                      

SVM - Acc: 0.3850, Macro F1: 0.1027, Weighted F1: 0.3283

===== Organ: pancreas =====


padding...: 21934it [00:00, 76007.99it/s]
padding...: 5484it [00:00, 75661.05it/s]
pancreas models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                              

SVM - Acc: 0.3769, Macro F1: 0.1398, Weighted F1: 0.2843

===== Organ: liver =====


padding...: 22427it [00:00, 65347.56it/s]
padding...: 5607it [00:00, 66067.53it/s]
liver models:   0%|          | 0/1 [00:00<?, ?it/s]

Training SVM...


                                                           

SVM - Acc: 0.3820, Macro F1: 0.1061, Weighted F1: 0.3183


