In [1]:
import pandas as pd
import scanpy as sc
import numpy as np

  from pandas.core.index import RangeIndex


In [2]:
# Load the raw count table from CSV. The file's first, unnamed column is read as an
# ordinary column (pandas labels it 'Unnamed: 0'); the remaining columns are matched
# against the cell metadata further down.
raw_counts = pd.read_csv(
    filepath_or_buffer='/nfs/team283/brainData/human_fetal/Polioudakis2019/raw_counts_mat.csv',
)

In [3]:
# Promote the unnamed leading CSV column ('Unnamed: 0') to the row index and remove it
# from the data columns. The column is dropped by name rather than by position, and the
# guard makes re-running this cell a no-op instead of a KeyError on the missing column.
if 'Unnamed: 0' in raw_counts.columns:
    # rename_axis(None) keeps the index unnamed, as a plain array assignment would.
    raw_counts = raw_counts.set_index('Unnamed: 0').rename_axis(None)

In [4]:
# Load the per-cell annotation table; it is indexed by its 'Cell' column and aligned
# to the count table in the next cell.
metadata = pd.read_csv(
    filepath_or_buffer='/nfs/team283/brainData/human_fetal/Polioudakis2019/cell_metadata.csv',
)

In [5]:
# Align the annotation table with the count table:
#   1. index the rows by cell identifier (the 'Cell' column is kept as a column too),
#   2. reorder the rows to follow the column order of raw_counts,
#   3. store the cluster labels as strings.
metadata = (
    metadata
    .set_index('Cell', drop=False)
    .reindex(raw_counts.columns.to_numpy())
    .assign(Cluster=lambda frame: frame['Cluster'].astype(str))
)

In [6]:
# Build the AnnData object. The count table is transposed, so the rows of X follow the
# columns of raw_counts (the labels `metadata` was reindexed to), and `metadata` is
# attached as the per-observation table.
# NOTE(review): `var` is given as a bare array of the raw_counts row labels rather than a
# DataFrame indexed by them. The object summaries printed below list `var: 0`, i.e. the
# labels end up in a column named 0 -- presumably var_names then fall back to positional
# strings ('0', '1', ...). Confirm before relying on var_names as feature identifiers.
adata = sc.AnnData(X=np.array(raw_counts).T, obs=metadata, var=np.array(raw_counts.index))

In [7]:
def selectFeatures(adata, groupName, n_features = 2000):
    """Subset ``adata`` to the features that best distinguish the groups in ``adata.obs[groupName]``.

    Features are taken from ``adata.uns['rank_genes_groups']['names']``. A stored
    ranking is reused when it was computed for ``groupName`` (or when it does not
    record which grouping it was computed for). Otherwise
    ``sc.tl.rank_genes_groups`` is run on ``groupName``, keeping the top 10% of
    all features per group.

    Parameters
    ----------
    adata
        Annotated data object. Side effect: ``adata.uns['rank_genes_groups']`` is
        written whenever a ranking has to be computed.
    groupName
        Name of the ``adata.obs`` column that defines the groups.
    n_features
        Maximum number of features to keep. If it exceeds the number of distinct
        ranked features, all ranked features are kept and that maximum is printed.

    Returns
    -------
    A column subset ``adata[:, mask]`` with the selected features in their
    original column order.
    """
    ranking = adata.uns['rank_genes_groups'] if 'rank_genes_groups' in adata.uns else None

    # Find out which grouping a stored ranking belongs to, if it says so.
    stored_groupby = None
    if isinstance(ranking, dict) and isinstance(ranking.get('params'), dict):
        stored_groupby = ranking['params'].get('groupby')

    if ranking is not None and stored_groupby in (None, groupName):
        print('Using existing ranked genes...')
    else:
        if ranking is not None:
            # A ranking for a different grouping would silently select the wrong markers.
            print('Existing ranked genes were computed for ' + repr(stored_groupby)
                  + '; re-ranking by ' + repr(groupName) + '...')
        sc.tl.rank_genes_groups(adata, groupName, n_genes=int(np.round(len(adata.var)/10)))

    # Each record of 'names' holds one rank position across all groups, so iterating
    # record by record walks the features best-ranked first. dict.fromkeys removes
    # duplicates while keeping that order.
    names = adata.uns['rank_genes_groups']['names']
    ranked_features = list(dict.fromkeys(str(name) for row in names for name in row))

    if n_features > len(ranked_features):
        print('Maximum number of features: ' + str(len(ranked_features)))
        selected_features = ranked_features
    else:
        # Cut in rank order, so the features dropped are the worst-ranked ones rather
        # than whichever happen to have the largest column index.
        selected_features = ranked_features[:max(n_features, 0)]

    # Select by name through a boolean mask: works for any var_names (not only
    # integer-like ones) and keeps the columns in their original order.
    return adata[:, adata.var_names.isin(selected_features)]

In [8]:
# Ask for more features than the ranking provides; per the output below this takes the
# "Maximum number of features" path.
test1 = selectFeatures(adata, groupName='Cluster', n_features=30000)
test1

... storing 'Cell' as categorical
... storing 'Cluster' as categorical
... storing 'Subcluster' as categorical
... storing 'Layer' as categorical
... storing 'Index' as categorical
... storing 'Library' as categorical
... storing 'Phase' as categorical


Maximum number of features: 25563


View of AnnData object with n_obs × n_vars = 33986 × 25563 
    obs: 'Cell', 'Cluster', 'Subcluster', 'Donor', 'Layer', 'Gestation_week', 'Index', 'Library', 'Number_genes_detected', 'Number_UMI', 'Percentage_mitochondrial', 'S_phase_score', 'G2M_phase_score', 'Phase'
    var: 0
    uns: 'rank_genes_groups'

In [9]:
# Ask for a smaller feature set; per the output below the ranking already stored in
# adata.uns is reused.
test2 = selectFeatures(adata, groupName='Cluster', n_features=3000)
test2

Using existing ranked genes...


View of AnnData object with n_obs × n_vars = 33986 × 3000 
    obs: 'Cell', 'Cluster', 'Subcluster', 'Donor', 'Layer', 'Gestation_week', 'Index', 'Library', 'Number_genes_detected', 'Number_UMI', 'Percentage_mitochondrial', 'S_phase_score', 'G2M_phase_score', 'Phase'
    var: 0
    uns: 'rank_genes_groups'