In [None]:
# To be used on cloud platforms: uncomment the two commands below.
# Each command runs `pip install` only if importing the package fails.

# ! python -c "import pykeen" || pip install pykeen
# ! python -c "import scprep" || pip install scprep

In [1]:
import os, pandas as pd, numpy as np, networkx, scprep
import rpy2, rpy2.situation


In [2]:
# Show the current working directory; both folders below are resolved against it.
print(os.getcwd())

# `data` and `KGEs` are sub-folders of the working directory.
data_dir, kge_dir = (f"{os.getcwd()}/{folder}" for folder in ("data", "KGEs"))

/Users/ferran/PhD/yaleCollab


# DataBases

The integrated database is the object ingested by pykeen's *TriplesFactory*.

In [3]:
# Preview the first rows of the CellChat table (tab-separated, with a header row).
cellchat_preview = pd.read_csv(f"{data_dir}/dCellChat.tsv", sep="\t").head()
print(cellchat_preview)

# Preview the first rows of the STRING table (same file format).
string_preview = pd.read_csv(f"{data_dir}/dSTRING.tsv", sep="\t").head()
print(string_preview)

    head       relation    tail pathway
0  Tgfb1  LRinteraction  Tgfbr1    TGFb
1  Tgfb1  LRinteraction  Tgfbr2    TGFb
2  Tgfb2  LRinteraction  Tgfbr1    TGFb
3  Tgfb2  LRinteraction  Tgfbr2    TGFb
4  Tgfb3  LRinteraction  Tgfbr1    TGFb
    head  relation   tail
0  Gnai3  FIstring  Prkca
1  Gnai3  FIstring  Rgs18
2  Gnai3  FIstring  Gpsm2
3  Gnai3  FIstring  Adrb3
4  Gnai3  FIstring  Gnat2


In [4]:
# Integrated triples table; the file has no header row, so columns are labelled 0, 1, 2.
integrated_triples = pd.read_csv(
    f"{data_dir}/dINT_simpl.tsv",
    sep="\t",
    header=None,
)
integrated_triples

Unnamed: 0,0,1,2
0,Tgfb1,LRinteraction,Tgfbr1
1,Tgfb1,LRinteraction,Tgfbr2
2,Tgfb2,LRinteraction,Tgfbr1
3,Tgfb2,LRinteraction,Tgfbr2
4,Tgfb3,LRinteraction,Tgfbr1
...,...,...,...
439018,Gm28635,FIstring,Arhgap35
439019,Gm28635,FIstring,Cdh17
439020,Gm28635,FIstring,Kif5b
439021,Gm28635,FIstring,Tjp3


From the integrated database, let's make a pandas DataFrame that has only the 
source and target columns, and then use that to make a NetworkX Graph object 
with `from_pandas_edgelist`.

Once the scRNA-seq data has been loaded, we also subset the genes in this 
DataFrame to those present in the scRNA-seq count matrix. Note that the cell 
below uses `dGenesSEQ`, which is exported in the *scRNAseq data* section 
further down, so that section has to be run first.

In [20]:
# This cell needs `dGenesSEQ`, the gene names exported by the `%%R -o dGenesSEQ`
# cell in the "scRNAseq data" section further down. Fail early with an
# actionable message if that section has not been run yet.
if "dGenesSEQ" not in globals():
    raise NameError(
        "dGenesSEQ is not defined: run the '%%R -o dGenesSEQ' cell "
        "(section 'scRNAseq data') before filtering the edge list."
    )

# Keep only the head (0) and tail (2) columns of the header-less triples table.
edges_all = pd.read_csv(f"{data_dir}/dINT_simpl.tsv", sep="\t", header=None)[[0, 2]]
print(edges_all.shape)

# Convert the gene vector once and reuse it for both endpoint checks.
genes_in_seq = list(dGenesSEQ)

# Keep an edge only when both of its endpoints are genes present in dGenesSEQ.
dEdgeList = edges_all.loc[
    edges_all[0].isin(genes_in_seq) & edges_all[2].isin(genes_in_seq)
]
dEdgeList

(439023, 2)


Unnamed: 0,0,2
0,Tgfb1,Tgfbr1
1,Tgfb1,Tgfbr2
2,Tgfb2,Tgfbr1
3,Tgfb2,Tgfbr2
4,Tgfb3,Tgfbr1
...,...,...
438929,Paqr8,Efhc1
438930,Teddm3,Tmem198b
438931,Teddm3,Slc7a15
438943,Cys1,Fut2


# scRNAseq data

We use rpy2 to import the Seurat object that holds the data. We already have code (from the RNA Velo notebooks) to extract the metadata; the count matrices still need to be extracted.

In [6]:
# Print rpy2's diagnostic report one entry at a time: rpy2, Python and R
# versions, where R's home was found, and the C-extension build settings.
for i in rpy2.situation.iter_info(): # Print Rpy2 info
    print(i)

[1mrpy2 version:[0m
3.4.5
[1mPython version:[0m
3.9.13 (main, Aug  7 2022, 01:33:23) 
[Clang 13.1.6 (clang-1316.0.21.2.5)]
[1mLooking for R's HOME:[0m
    Environment variable R_HOME: None
    Calling `R RHOME`: /usr/local/Cellar/r/4.2.1_2/lib/R
    Environment variable R_LIBS_USER: None
[1mR's additions to LD_LIBRARY_PATH:[0m
/usr/local/lib/R/library/stats/libs/
[1mR version:[0m
    In the PATH: R version 4.2.1 (2022-06-23) -- "Funny-Looking Kid"
    Loading R library from rpy2: OK
[1mAdditional directories to load R packages from:[0m
None
[1mC extension compilation:[0m
  include:
  ['/usr/local/Cellar/r/4.2.1_2/lib/R/include']
  libraries:
  ['R', 'pcre2-8', 'lzma', 'bz2', 'z', 'icucore', 'dl', 'm', 'iconv']
  library_dirs:
  ['/usr/local/opt/gettext/lib', '/usr/local/opt/readline/lib', '/usr/local/opt/xz/lib', '/usr/local/lib', '/usr/local/opt/gettext/lib', '/usr/local/opt/readline/lib', '/usr/local/opt/xz/lib', '/usr/local/lib', '/usr/local/Cellar/r/4.2.1_2/lib/R/lib'

## Prep data

In [3]:
# Load rpy2's IPython extension; it provides the %%R cell magic used in the cells below.
%load_ext rpy2.ipython

In [4]:
%%R -i data_dir
# `-i data_dir` passes the Python variable data_dir into the R session.
library(tidyverse)
library(Seurat)

# Echo data_dir to confirm that the path arrived in R.
paste0(data_dir)

R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

R[write to console]: Attaching SeuratObject

R[write to console]: Attaching sp



[1] "/Users/ferran/PhD/yaleCollab/data"


In [5]:
%%R

# Read the Seurat object stored at <data_dir>/Omics/d_fibWT.rds.
d_fibWT = readRDS(paste0(data_dir,"/Omics/","d_fibWT.rds"))
# Printing the object shows its summary (assays, feature counts, reductions).
d_fibWT

An object of class Seurat 
40940 features across 3698 samples within 2 assays 
Active assay: SCT (20073 features, 10000 variable features)
 1 other assay present: RNA
 3 dimensional reductions calculated: pca, umap, phate


Export the list of genes detected in the scRNA-seq dataset.
This list will be used to subset the Knowledge Graph.

In [6]:
%%R -o dGenesSEQ

# Gene names are the row names of the RNA assay's `data` slot;
# `-o dGenesSEQ` exports the resulting vector to the Python session.
# NOTE(review): the object's active assay is SCT, but the RNA assay is used
# here -- confirm that this is the intended gene set.
dGenesSEQ = rownames(d_fibWT@assays$RNA@data)
# d_fibWT@assays$RNA@data
# d_fibWT@assays$RNA@counts

In [11]:
# Number of genes exported from the Seurat object.
len(dGenesSEQ)

20867

### Normalise and denoise gene expression data

First subset to the genes in the KG, then use MAGIC to smooth the expression 
values, and then normalise so that the gene expression distribution of every 
cell sums to the same total.

# Project cell data on gene graph

First we'll need to build an adjacency matrix from the KG. 
Then we will compute DEMD with the adjacency matrix and the distributions of 
normalised gene expression for each cell in our dataset.

## Adjacency matrix from KG

Using the [networkx](https://networkx.org/documentation/stable/install.html) package.

In [23]:
# Build an undirected graph: each row of dEdgeList becomes an edge joining the
# gene in column 0 to the gene in column 2.
G = networkx.from_pandas_edgelist(dEdgeList, source=0, target=2)
G

<networkx.classes.graph.Graph at 0x192362670>

In [24]:
#Adjacency matrix
# Sparse gene-by-gene adjacency matrix of G. No nodelist is passed, so per the
# networkx documentation the rows/columns follow the order of G.nodes(); keep
# that order when aligning expression data to this matrix.
AdjMat = networkx.adjacency_matrix(G)
AdjMat

  AdjMat = networkx.adjacency_matrix(G)


<12229x12229 sparse matrix of type '<class 'numpy.int64'>'
	with 347279 stored elements in Compressed Sparse Row format>

## Compute Distances