# Cell-cell communication analysis

### Preparing the single cell RNA sequencing (scRNA-seq) reference dataset for CellPhoneDB

From [Young et al., Science 2018](https://science.sciencemag.org/content/361/6402/594.editor-summary)

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from scipy.io import mmread
import seaborn as sns
import scanpy as sc
import anndata
import matplotlib as mpl
import scipy
import matplotlib.pyplot as plt

# Scanpy logging level; the scale is: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
#sc.logging.print_versions()
# Figure defaults for the inline plots of this notebook
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

In [2]:
# Directory with the Young et al. (Science 2018) files read below
# (count matrix, row/column labels and the supplementary tables workbook).
# Kept as a plain string with a trailing slash: file names are appended with `+`.
reference_data_path = './Young_Science_2018_data/'
# Directory from which Cell_Types_for_Spatial_Decon.txt is read (same `+` convention).
nanostring_data_path = '../KidneyDataset/'

## Reading the data downloaded from [here](https://science.sciencemag.org/highwire/filestream/713964/field_highwire_adjunct_files/6/aat1699_DataS1.gz.zip)

In [3]:
# raw count data
# The matrix is transposed after reading so that its rows match the column
# labels (used as obs further down) and its columns match the row labels
# (used as var); it is then converted to CSR format.
kidney_data = mmread(reference_data_path + 'tableOfCounts.mtx').T.tocsr()

# column data, indexed by the 'DropletID' column
kidney_data_cols = (
    pd.read_csv(reference_data_path + 'tableOfCounts_colLabels.tsv',
                sep='\t', index_col=0)
    .set_index('DropletID')
)

# row data, indexed by the 'Symbol' column
kidney_data_rows = (
    pd.read_csv(reference_data_path + 'tableOfCounts_rowLabels.tsv',
                sep='\t', index_col=0)
    .set_index('Symbol')
)

# read in metadata to retrieve cell type labels and match up barcodes
# downloaded from here: https://science.sciencemag.org/highwire/filestream/713964/field_highwire_adjunct_files/5/aat1699-Young-TablesS1-S12-revision2.xlsx
# sheet_name=10 is a zero-based position (i.e. the 11th sheet of the workbook);
# header=1 takes the column names from the second row of that sheet.
kidney_data_meta = (
    pd.read_excel(reference_data_path + 'aat1699-Young-TablesS1-S12-revision2.xlsx',
                  engine='openpyxl', header=1,
                  sheet_name=10)
    .set_index('DropletID')
)

# get rid of NaNs in ClusterID column: missing values become the literal label 'NA'
kidney_data_meta['ClusterID'] = kidney_data_meta['ClusterID'].fillna('NA')

In [4]:
# reading in the cell type expression profiles for cell types used in NanoString ROI deconvolution
# (tab-separated file; the 'ClusterID' and 'cell_type_general' columns of this
# table are the ones used by the later cells of this notebook)
Cell_Types_for_Spatial_Decon = pd.read_csv(nanostring_data_path + 'Cell_Types_for_Spatial_Decon.txt',
                                          sep='\t')

In [5]:
# making an adata object of the kidney scRNA-seq, all cells
# X is the transposed count matrix, obs the column labels (indexed by DropletID)
# and var the row labels (indexed by Symbol) loaded above.
adata_kidney = anndata.AnnData(X = kidney_data,
                               obs = kidney_data_cols,
                               var = kidney_data_rows
                              )

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [6]:
# The 'Symbol' values used as variable names are not unique (see the warning
# printed when the AnnData object was created); make them unique in place,
# as that warning suggests.
adata_kidney.var_names_make_unique()

In [7]:
# adding metadata
# Align the metadata rows with the cells of adata_kidney once, rather than once
# per column. `.loc` raises a KeyError if a cell is absent from the metadata.
meta_for_cells = kidney_data_meta.loc[adata_kidney.obs_names]
for col in meta_for_cells.columns:
    adata_kidney.obs[col] = meta_for_cells[col]
# get rid of NaNs in ClusterID column: missing values become the literal label 'NA'
adata_kidney.obs['ClusterID'] = adata_kidney.obs['ClusterID'].fillna('NA')

In [8]:
# Keep only the cells whose ClusterID occurs in the table of cell types used
# for the NanoString deconvolution; `.copy()` turns the subset into a
# standalone AnnData object.
deconvolution_cluster_ids = np.unique(Cell_Types_for_Spatial_Decon['ClusterID'])
in_deconvolution_cluster = adata_kidney.obs['ClusterID'].isin(deconvolution_cluster_ids)
adata_kidney = adata_kidney[in_deconvolution_cluster].copy()

In [9]:
# look up values of Cell_Types_for_Spatial_Decon['cell_type_general'] for the cells in adata_obj.obs
def add_cell_type_general(adata_obj, barcode, cell_type_table=None):
    """Return the general cell type of a single cell.

    Parameters
    ----------
    adata_obj
        Object whose ``.obs`` table is indexed by barcode and has a
        'ClusterID' column.
    barcode
        Index label of the cell in ``adata_obj.obs``.
    cell_type_table
        Table with 'ClusterID' and 'cell_type_general' columns. Defaults to
        the notebook-level ``Cell_Types_for_Spatial_Decon``.

    Returns
    -------
    The 'cell_type_general' value listed for the cell's ClusterID.

    Raises
    ------
    KeyError
        If the cell's ClusterID is not listed in the table.
    ValueError
        If the table lists several different 'cell_type_general' values for
        the cell's ClusterID.
    """
    if cell_type_table is None:
        cell_type_table = Cell_Types_for_Spatial_Decon

    curr_ClusterID = adata_obj.obs.loc[barcode, 'ClusterID']

    # Select with a boolean mask instead of rebuilding an index on every call,
    # and collapse repeated rows so that a scalar is always returned.
    is_curr_cluster = cell_type_table['ClusterID'] == curr_ClusterID
    ct_general = cell_type_table.loc[is_curr_cluster, 'cell_type_general'].unique()

    if len(ct_general) == 0:
        raise KeyError(curr_ClusterID)
    if len(ct_general) > 1:
        raise ValueError('ClusterID {!r} maps to several cell_type_general values: {}'
                         .format(curr_ClusterID, list(ct_general)))

    return ct_general[0]

adata_kidney.obs['barcode'] = adata_kidney.obs.index

# Build the ClusterID -> cell_type_general lookup once and map the whole
# 'ClusterID' column through it, instead of re-indexing the table for every cell.
cluster_to_general = (
    Cell_Types_for_Spatial_Decon[['ClusterID', 'cell_type_general']]
    .drop_duplicates()
    .set_index('ClusterID')['cell_type_general']
)
if not cluster_to_general.index.is_unique:
    raise ValueError('Cell_Types_for_Spatial_Decon lists several cell_type_general values for one ClusterID')

# Fail loudly (as a per-cell `.loc` lookup would) if a cluster has no entry in the table.
unknown_clusters = set(adata_kidney.obs['ClusterID']) - set(cluster_to_general.index)
if unknown_clusters:
    raise KeyError('ClusterID(s) missing from Cell_Types_for_Spatial_Decon: {}'.format(list(unknown_clusters)))

adata_kidney.obs['cell_type_general'] = adata_kidney.obs['ClusterID'].map(cluster_to_general)

## Processing and saving scRNA-seq count data for CellPhoneDB

In [10]:
# normalising raw counts: adata_kidney is modified in place, every cell being
# scaled to a total of 1e4 counts
# NOTE(review): sc.pp.normalize_per_cell is deprecated in scanpy in favour of
# sc.pp.normalize_total; it is kept here so that the results stay unchanged.
sc.pp.normalize_per_cell(adata_kidney, counts_per_cell_after=1e4)

# normalised count values, wrapped in a separate AnnData object
adata_count = anndata.AnnData(
    X=adata_kidney.X,
    obs=adata_kidney.obs,
    var=adata_kidney.var,
)

# cell type assignment: one row per cell, the label being the general cell
# type prefixed with 'celltype_'
cell_barcodes = list(adata_count.obs.index)
cell_type_labels = ['celltype_' + label
                    for label in map(str, adata_kidney.obs['cell_type_general'])]
df_meta = pd.DataFrame(data={'Cell': cell_barcodes,
                             'cell_type': cell_type_labels})

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [11]:
# saving the .h5ad file with counts (adata_count holds the normalised values)
adata_count.write('./adata_for_cellphone.h5ad')

# saving the metadata file with cell type assignment (tab-separated)
# NOTE(review): with the default index=True, to_csv also writes the integer row
# index of df_meta as an unnamed first column, ahead of 'Cell' and 'cell_type'.
# Confirm that the consumer of this file expects that, or pass index=False.
df_meta.to_csv('./cellphonedb_meta.tsv', sep = '\t')

... storing 'SangerID' as categorical
... storing 'Barcode' as categorical
... storing 'ClusterID' as categorical
... storing 'Compartment' as categorical
... storing 'Source' as categorical
... storing 'cell_type_general' as categorical


# Preparing a table of expression proportions

Matrix of genes (rows) by cell types (columns) containing the proportion [0-1] of cells of each cell type that express the gene

In [20]:
# Dense table of the values in adata_count.X, transposed so that the rows are
# the variables (var_names) and the columns are the cells (obs_names).
# Note that `todense()` materialises the full matrix in memory.
df_expr_matrix = pd.DataFrame(
    data=adata_count.X.T.todense(),
    index=adata_count.var_names,
    columns=adata_count.obs_names,
)

In [21]:
# One sub-table of df_expr_matrix per general cell type: keyed by the cell type
# name and restricted to the columns (cells) annotated with that cell type.
cell_type_of_cell = adata_count.obs['cell_type_general']
df_expr_matrix_per_cell_type = {
    ct: df_expr_matrix.loc[:, list(adata_count.obs_names[(cell_type_of_cell == ct).values])]
    for ct in np.unique(cell_type_of_cell)
}

In [22]:
# Empty result table: one row per row of df_expr_matrix, one column per
# general cell type. dtype=float makes the columns numeric (NaN-filled) from
# the start; without it an empty frame gets object-typed columns.
df_percentage_expressed = pd.DataFrame(index=df_expr_matrix.index,
                                       columns=np.unique(adata_count.obs['cell_type_general']),
                                       dtype=float)

In [23]:
# For every cell type: count, per gene (row), the cells with a non-zero value
# and divide by the number of cells of that type.
for col in df_percentage_expressed.columns:
    cells_of_type = df_expr_matrix_per_cell_type[col]
    n_expressing = cells_of_type.astype(bool).sum(axis=1)
    df_percentage_expressed.loc[:, col] = n_expressing / cells_of_type.shape[1]

In [25]:
# save the table of expression proportions (rows: index of df_expr_matrix,
# columns: general cell types) as a comma-separated file
df_percentage_expressed.to_csv('./PercentExpressed_for_cellphone.csv')