In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import squidpy as sp

from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [2]:
# Input datasets (.h5ad files), given as paths relative to the notebook's working directory.
filename_mouse_embryo = '../Downloads/E9.5_E1S1.MOSTA.h5ad'
filename_mouse_brain = '../Downloads/Mouse_brain_cell_bin.h5ad'

# Candidate values of k for SelectKBest feature selection.
K_best_features = [10, 20, 50, 100, 200, 500, 1000, 2000]

In [3]:
def read_file(filename):
    """Read the ``.h5ad`` file at ``filename`` with scanpy and return the loaded object."""
    return sc.read_h5ad(filename)

In [4]:
def preprocess_file(adata, min_genes=200, min_cells=3, target_sum=1e4,
                    unknown_labels=('Unknown',)):
    """Filter, de-noise and normalise an annotated expression matrix.

    Steps, in order:
      1. drop cells expressing fewer than ``min_genes`` genes,
      2. drop genes detected in fewer than ``min_cells`` cells,
      3. drop cells whose ``obs.annotation`` is one of ``unknown_labels``,
      4. normalise every cell to ``target_sum`` total counts and apply log1p.

    Parameters
    ----------
    adata : AnnData
        Object with an ``annotation`` column in ``.obs``. Steps 1 and 2 filter
        this object in place.
    min_genes, min_cells : int
        Thresholds for the cell / gene filters.
    target_sum : float
        Per-cell total after normalisation.
    unknown_labels : iterable of str
        Annotation values treated as "unlabelled" and removed.

    Returns
    -------
    AnnData
        A new, independent object (not a view) holding the preprocessed data.
    """
    # get rid of cells with too few detected genes
    sc.pp.filter_cells(adata, min_genes=min_genes)
    # get rid of genes that are found in too few cells
    sc.pp.filter_genes(adata, min_cells=min_cells)
    # get rid of cells whose annotation is unknown; boolean subsetting returns
    # a *view*, so copy explicitly before the in-place normalisation below
    # (otherwise the view is silently converted and a warning is emitted).
    keep = ~adata.obs.annotation.isin(list(unknown_labels))
    adata = adata[keep].copy()
    # data normalization
    sc.pp.normalize_total(adata, target_sum=target_sum)
    sc.pp.log1p(adata)
    return adata

In [5]:
def select_best_features(adata, k_best_value):
    """Build a standardised feature matrix from spatial coordinates and top genes.

    The ``k_best_value`` highest-scoring columns of ``adata.X`` (scored by
    ``SelectKBest`` with its default scoring function against
    ``adata.obs.annotation``) are appended to the first two columns of
    ``adata.obsm['spatial']``; the combined matrix is then standardised with
    ``StandardScaler``.

    The result, of shape ``(n_cells, 2 + k_best_value)``, is stored on the
    object as the ad-hoc attribute ``adata.feature_selected``.

    NOTE: both the selector and the scaler are fitted on *all* cells using
    their labels, so any cross-validation run on ``feature_selected``
    afterwards has already seen the held-out labels during feature selection.

    Parameters
    ----------
    adata : AnnData
        Must provide ``X``, ``obs.annotation`` and ``obsm['spatial']``.
    k_best_value : int
        Number of columns of ``X`` to keep.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``feature_selected`` set.
    """
    y = adata.obs.annotation

    selected = SelectKBest(k=k_best_value).fit_transform(adata.X, y)
    # X may be sparse or dense; only sparse results expose ``toarray``.
    if hasattr(selected, "toarray"):
        selected = selected.toarray()

    # First two spatial coordinates, as float to match the original zero-filled buffer.
    coords = np.asarray(adata.obsm['spatial'])[:, :2].astype(float)

    combined = np.hstack((coords, selected))
    adata.feature_selected = StandardScaler().fit_transform(combined)

    return adata

In [7]:
# Load the mouse-embryo dataset and run the filtering / normalisation pipeline.
adata = read_file(filename_mouse_embryo)
adata = preprocess_file(adata)


  view_to_actual(adata)


In [8]:
# Only the last expression of a cell is displayed, so the first line's value is discarded.
adata.obsm['spatial'][:,0].shape
adata.shape

(5870, 23014)

In [9]:
# Keep the 100 best-scoring genes; the function sets adata.feature_selected on the object it is given.
select_best_features(adata, 100)
adata.feature_selected.shape

(5870, 102)

In [None]:
# Feature vector of the first cell (two spatial columns followed by the selected genes).
adata.feature_selected[0,:]

In [17]:
# Sanity-check the stratified 10-fold split: print the number of training
# indices, the shape of the training feature matrix, and the number of test
# indices for every fold. (StratifiedKFold is imported in the first cell.)
skf = StratifiedKFold(n_splits=10)
for i, (train_index, test_index) in enumerate(skf.split(adata.feature_selected, adata.obs.annotation)):
    print(f"Fold {i}:")
    print(train_index.shape)
    # Rows picked by the *training* indices.
    train_X = adata.feature_selected[train_index]
    print(train_X.shape)
    print(test_index.shape)


Fold 0:
(5283,)
(5283, 102)
(587,)
Fold 1:
(5283,)
(5283, 102)
(587,)
Fold 2:
(5283,)
(5283, 102)
(587,)
Fold 3:
(5283,)
(5283, 102)
(587,)
Fold 4:
(5283,)
(5283, 102)
(587,)
Fold 5:
(5283,)
(5283, 102)
(587,)
Fold 6:
(5283,)
(5283, 102)
(587,)
Fold 7:
(5283,)
(5283, 102)
(587,)
Fold 8:
(5283,)
(5283, 102)
(587,)
Fold 9:
(5283,)
(5283, 102)
(587,)


In [21]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data between epochs; pin the seed so
# that repeated runs of the notebook give the same predictions.
sgd = SGDClassifier(random_state=0)


In [37]:
def get_all_predictions(X, y, clf, k):
    """Fit ``clf`` on each stratified training fold and predict every sample.

    For each of the ``k`` folds produced by ``StratifiedKFold``, ``clf`` is
    re-fitted on that fold's training rows and then asked to predict the
    labels of *all* rows of ``X`` (training rows included).

    Parameters
    ----------
    X : array supporting integer-array row indexing
        Feature matrix, one row per sample.
    y : array-like
        Labels, one per row of ``X``. A pandas Series is accepted regardless
        of its index; labels are always taken by position.
    clf : estimator with ``fit`` / ``predict``
        Re-fitted in place on every fold; after the call it holds the model
        of the last fold.
    k : int
        Number of folds.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, n_samples)``; row ``i`` holds the predictions of
        the model trained on fold ``i``'s training split.
    """
    # Work on a plain array so that ``labels[train]`` is unambiguously
    # positional (integer keys on a label-indexed Series are not).
    labels = np.asarray(y)

    splitter = StratifiedKFold(n_splits=k)
    per_fold = []
    for train, _test in splitter.split(X, labels):
        clf.fit(X[train], labels[train])
        per_fold.append(clf.predict(X))

    # Stack once instead of growing the array inside the loop.
    return np.vstack(per_fold)

In [54]:
def func(predictions, y, l):
    """Relabel samples that more than ``l`` classifiers agree are mislabelled.

    ``predictions`` holds one row per classifier and one column per sample.
    For every sample, the predictions that differ from its current label are
    counted per predicted class. If the most frequent differing class was
    predicted more than ``l`` times, the sample is reassigned to that class
    and a line ``<position> <old label> <new label>`` is printed.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array of predicted labels, indexed ``[classifier, sample]``.
    y : pandas.Series, numpy.ndarray or list
        Current labels, one per column of ``predictions``. Not modified.
        Elements are always addressed by position, whatever the Series index.
    l : int
        Vote threshold; strictly more than ``l`` identical differing
        predictions are required to relabel a sample.

    Returns
    -------
    Same type as ``y``
        A copy of ``y`` with the relabelled samples updated.
    """
    from collections import Counter

    predictions = np.asarray(predictions)
    # Positional, read-only view of the labels for comparisons.
    labels = np.asarray(y)

    new_annotations = y.copy()
    # Integer keys on a label-indexed Series are not reliably positional;
    # write through ``.iloc`` when it exists, plain indexing otherwise.
    writer = new_annotations.iloc if hasattr(new_annotations, "iloc") else new_annotations

    for col_index in range(predictions.shape[1]):
        column = predictions[:, col_index]
        current = labels[col_index]
        disagreeing = column[column != current]
        if len(disagreeing) <= l:
            continue
        # Ties resolve to the label encountered first, as with a plain dict + max().
        label_with_max, max_value = Counter(disagreeing).most_common(1)[0]
        if max_value > l:
            print(col_index, current, label_with_max)
            writer[col_index] = label_with_max
    return new_annotations

In [52]:
# NOTE(review): in the visible notebook `result` is only assigned in a later
# cell (the get_all_predictions call), so a fresh top-to-bottom run would raise
# NameError here unless it is defined elsewhere — confirm the cell order.
func(result, adata.obs.annotation, 2)

2 Cavity Mesenchyme


In [53]:
# Label of the third cell; use .iloc so the integer is always a position, never an index label.
adata.obs.annotation.iloc[2]

'Cavity'

In [43]:
# Column 0 of `result`: one prediction per row for the first cell (.T leaves a 1-D array unchanged).
result[:,0].T

array(['Mesenchyme', 'Mesenchyme', 'Mesenchyme', 'Mesenchyme',
       'Mesenchyme', 'Mesenchyme', 'Mesenchyme', 'Mesenchyme',
       'Mesenchyme', 'Mesenchyme'], dtype='<U17')

In [31]:
# Fit `sgd` on each of the 10 stratified training folds and predict every cell each time.
result = get_all_predictions(adata.feature_selected, adata.obs.annotation, sgd, 10)

In [32]:
# One row per fold, one column per cell.
result.shape

(10, 5870)

In [33]:
# Display the per-fold prediction matrix.
result

array([['Mesenchyme', 'Cavity', 'Mesenchyme', ..., 'Cavity', 'Cavity',
        'Cavity'],
       ['Mesenchyme', 'Cavity', 'Mesenchyme', ..., 'Cavity', 'Cavity',
        'Cavity'],
       ['Mesenchyme', 'Cavity', 'Cavity', ..., 'Cavity', 'Cavity',
        'Cavity'],
       ...,
       ['Mesenchyme', 'Cavity', 'Cavity', ..., 'Cavity', 'Cavity',
        'Cavity'],
       ['Mesenchyme', 'Cavity', 'Cavity', ..., 'Cavity', 'Cavity',
        'Cavity'],
       ['Mesenchyme', 'Cavity', 'Mesenchyme', ..., 'Cavity', 'Cavity',
        'Cavity']], dtype='<U17')

In [None]:
from sklearn.model_selection import cross_val_predict

# NOTE: despite its name, `cv` holds the value returned by cross_val_predict
# (run with 10 folds), not a cross-validation splitter.
cv = cross_val_predict(sgd, adata.feature_selected, adata.obs.annotation, cv=10)


In [None]:
# Store the cross-validated predictions next to the original annotation.
adata.obs['new_ann'] =cv;
# Reuse the annotation palette for the new column
# (presumably so both panels share colours — confirm against the plotting docs).
adata.uns['new_ann_colors']=adata.uns['annotation_colors']
# Spatial scatter coloured by the predicted labels and by the original labels.
sp.pl.spatial_scatter(adata,shape=None, color=["new_ann","annotation"])

In [None]:
# Only the last expression (`cv`) is displayed; the first line's value is discarded.
adata.obs['new_ann']
cv

In [None]:
# Number of annotated cells.
adata.obs.annotation.shape

In [None]:
# Call the method: the bare attribute only displays the bound method, not the labels.
adata.obs.annotation.unique()

In [None]:
# Show the per-cell multi-dimensional annotations (this notebook reads the 'spatial' entry).
adata.obsm