In [1]:
import argparse
from utils import prepare_save_dir
from models.STELLAR import STELLAR
import numpy as np
import os
import torch
import pandas as pd
from datasets.datasets import GraphDataset, load_tonsilbe_data, load_hubmap_data
from datasets.load_d4ls import load_full_anndata
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Lift the column limit so wide DataFrames (e.g. the grid-search results table) are displayed in full.
pd.set_option('display.max_columns', None)

In [9]:
def get_edge_index(pos, sample_ids, distance_thres):
    """Build proximity edges separately inside every sample.

    Rows that share a value in ``sample_ids`` are compared pairwise with
    ``pairwise_distances``; two distinct rows are linked when their distance
    is strictly below ``distance_thres``.

    Args:
        pos: Coordinate array indexed as ``pos[rows, :]``, one row per cell.
        sample_ids: Per-row sample identifier, aligned positionally with ``pos``.
        distance_thres: Exclusive upper bound on the distance between linked rows.

    Returns:
        A list of ``[source, target]`` pairs holding row positions into ``pos``.
        Pairs are read from the full distance matrix, so a link can show up in
        both directions; a row is never linked to itself.
    """
    edges = []
    for current_id in np.unique(sample_ids):
        # Positions (into the full ``pos`` array) of the rows in this sample.
        member_rows = np.where(sample_ids == current_id)[0]
        close_enough = pairwise_distances(pos[member_rows, :]) < distance_thres
        # Every row trivially matches itself; clear the diagonal to drop self-loops.
        np.fill_diagonal(close_enough, False)
        # Translate sample-local indices back to positions in ``pos``.
        edges.extend(
            [member_rows[src], member_rows[dst]]
            for src, dst in np.argwhere(close_enough).tolist()
        )
    return edges

In [10]:
def get_train_test_masks(train_anndata, test_count=0, random_state=None):
    """Split cells into train/test by holding out whole samples.

    Args:
        train_anndata: Object whose ``obs["sample_id"]`` is a pandas Series with
            one sample identifier per cell.
        test_count: Number of distinct samples to hold out for testing. With the
            default of 0 every cell ends up in the training mask.
        random_state: Optional integer seed. When given, the held-out samples
            are drawn from a dedicated ``np.random.RandomState`` so the split is
            reproducible; when ``None`` the global NumPy random state is used
            (the previous behaviour).

    Returns:
        ``(train_mask, test_mask)``: boolean Series aligned with
        ``train_anndata.obs["sample_id"]``; each is the complement of the other.

    Raises:
        ValueError: If ``test_count`` is negative or larger than the number of
            distinct samples.
    """
    sample_ids = train_anndata.obs["sample_id"]
    sample_ids_unique = np.unique(sample_ids)

    if not 0 <= test_count <= len(sample_ids_unique):
        raise ValueError(
            f"test_count must be between 0 and {len(sample_ids_unique)} "
            f"(the number of distinct samples), got {test_count}"
        )

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Draw sample *indices* without replacement so a whole sample is either
    # entirely in the test split or entirely in the training split.
    held_out_idx = rng.choice(np.arange(len(sample_ids_unique)), test_count, replace=False)

    test_mask = sample_ids.isin(sample_ids_unique[held_out_idx])
    train_mask = ~test_mask

    return train_mask, test_mask

In [11]:
def prepare_data(train_anndata, make_graph=False, test_count=0, distance_thres=10):
    """Turn an annotated dataset into train/test feature and label arrays.

    Args:
        train_anndata: Object exposing ``layers['exprs']`` (feature matrix) and
            ``obs`` with the columns ``sample_id`` and ``cell_labels``; the
            columns ``Pos_X`` / ``Pos_Y`` are read only when ``make_graph`` is set.
        make_graph: When true, also build proximity edges for each split with
            ``get_edge_index``; otherwise both edge results are ``None``.
        test_count: Number of whole samples to hold out as the test split
            (forwarded to ``get_train_test_masks``). The default of 0 keeps the
            previous behaviour of an empty test split.
        distance_thres: Distance threshold forwarded to ``get_edge_index``;
            defaults to the previously hard-coded value of 10.

    Returns:
        ``(X_train, Y_train, edges_train, X_test, Y_test, edges_test, inverse_dict)``
        where the ``Y_*`` arrays hold integer class codes and ``inverse_dict``
        maps each code back to its original label.
    """
    train_mask, test_mask = get_train_test_masks(train_anndata, test_count)
    # Plain boolean arrays select rows positionally, independent of the obs index.
    train_rows = np.asarray(train_mask)
    test_rows = np.asarray(test_mask)

    X = train_anndata.layers['exprs']
    X_train = X[train_rows]
    X_test = X[test_rows]

    if make_graph:
        # Positions are only needed for edge construction.
        pos = train_anndata.obs[["Pos_X", "Pos_Y"]].values
        sample_ids = train_anndata.obs["sample_id"]

        edges_train = get_edge_index(pos[train_rows], sample_ids[train_mask], distance_thres)
        edges_test = get_edge_index(pos[test_rows], sample_ids[test_mask], distance_thres)
    else:
        edges_train = None
        edges_test = None

    labels = train_anndata.obs["cell_labels"].values
    cell_types = np.sort(list(set(labels))).tolist()
    # Map text classes to integer codes, and keep the inverse mapping so
    # predictions can be translated back to text.
    cell_type_dict = {cell_type: code for code, cell_type in enumerate(cell_types)}
    inverse_dict = dict(enumerate(cell_types))

    labels = np.asarray(labels)
    # dtype=int keeps an empty split integer-typed (np.array([]) would be float64).
    Y_train = np.array([cell_type_dict[x] for x in labels[train_rows]], dtype=int)
    Y_test = np.array([cell_type_dict[x] for x in labels[test_rows]], dtype=int)

    return X_train, Y_train, edges_train, X_test, Y_test, edges_test, inverse_dict



In [12]:
# Load the dataset through the project's load_full_anndata helper.
train_anndata = load_full_anndata()

# prepare_data is called with its defaults: no graph edges are built (both edge
# results are None) and zero samples are held out, so X_test / Y_test are empty.
X_train, Y_train, edges_train, X_test, Y_test, edges_test, inverse_dict = prepare_data(train_anndata)

In [20]:
# Sanity check: (rows, feature columns) of the training matrix.
X_train.shape

(236791, 40)

In [21]:
# Sanity check: one integer class code per training row.
Y_train.shape

(236791,)

In [34]:
# Held-out feature matrix; empty in this run because no samples were held out
# (get_train_test_masks defaults to test_count=0).
X_test

array([], shape=(0, 40), dtype=float64)

In [22]:
# Learn the min-max scaling from the training matrix, then apply it to that same matrix.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [30]:
# Hyper-parameter grid handed to GridSearchCV; every entry lists one value,
# so the search evaluates a single candidate.
# NOTE(review): the recorded fit output further down reports 3 candidates
# (n_estimators 30/40/50) — presumably saved from an earlier version of this
# cell; re-run the notebook to refresh the outputs.
best_xgb_param_grid = dict(
    n_estimators=[50],
    max_depth=[3],
    learning_rate=[0.3],
    objective=['multi:softmax'],
)

# 5-fold cross-validated search on all available cores; train scores are kept
# so they can be compared with the validation scores.
xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=best_xgb_param_grid,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=3,
)

In [31]:
# Run the cross-validated grid search on the scaled training features and integer labels.
xgb_grid_search.fit(X_train_scaled, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [33]:
# Tabulate the search results: one row per candidate with timings plus per-split
# test scores and (because return_train_score=True) train scores.
pd.DataFrame(xgb_grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_objective,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,50.036386,4.905789,0.65866,0.103495,0.3,3,30,multi:softmax,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.913702,0.948879,0.951434,0.947971,0.799738,0.912345,0.05799,3,0.958793,0.953656,0.953509,0.954306,0.962652,0.956583,0.003603
1,63.673606,3.357773,0.780867,0.135105,0.3,3,40,multi:softmax,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.912414,0.950737,0.950399,0.949174,0.800625,0.91267,0.057895,2,0.963237,0.957975,0.958286,0.95912,0.9666,0.961044,0.003359
2,88.284336,2.29457,1.133108,0.185735,0.3,3,50,multi:softmax,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.913871,0.951307,0.949723,0.949407,0.800118,0.912885,0.058111,1,0.966616,0.961849,0.961907,0.96281,0.969398,0.964516,0.003002


In [35]:
# Predict class codes for the same scaled training matrix the search was fitted on;
# this is not a held-out evaluation. Codes map back to labels via inverse_dict.
xgb_grid_search.predict(X_train_scaled)

array([ 6,  7,  6, ...,  3,  7, 11])