## Mathew: find overlapping genes on opposite strands

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
from venny4py.venny4py import *
import scvelo as scv
import os
import json
from pyroe import load_fry
import ensembl_rest
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from typing import Union
import seaborn as sns
import upsetplot as up

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

### read in data

In [2]:
# Load the velocyto-based and the alevin-based AnnData objects from disk.
velocyto = sc.read_h5ad("../data/mathew_velocyto_sto.h5ad")
alevin = sc.read_h5ad("../data/mathew_alevin_sto.h5ad")
# Restrict alevin to the cells (obs) and genes (var) named in velocyto so the
# two objects can be compared cell by cell and gene by gene below.
alevin = alevin[velocyto.obs_names, velocyto.var_names]

### find overlapping genes

In [3]:
# Genomic position of every gene, taken from the velocyto gene annotation.
# The column order (Chromosome, Start, End, Strand) matters: rows of this
# frame are unpacked positionally into overlapping() further down.
gene_pos_df = velocyto.var[["Chromosome", "Start", "End", "Strand"]]

In [4]:
# number of genes annotated on the forward (+) and on the reverse (-) strand,
# counted as the number of True entries of the strand comparison
print((gene_pos_df["Strand"] == "+").sum())
print((gene_pos_df["Strand"] == "-").sum())

7451
7443


In [5]:
def overlapping(
    chr_1: Union[int, str],
    start_1: int,
    end_1: int,
    strand_1: str,
    chr_2: Union[int, str],
    start_2: int,
    end_2: int,
    strand_2: str,
    opposing: bool = True
) -> bool:
    """
    Check if two genes overlap.

    Genes on different chromosomes never overlap. Depending on ``opposing``,
    either only pairs on different strands or only pairs on the same strand
    are considered. Coordinates are compared inclusively: two genes that share
    just a boundary position (e.g. ``end_1 == start_2``) count as overlapping.

    :param chr_1: chromosome of first gene
    :param start_1: start position of first gene
    :param end_1: end position of first gene
    :param strand_1: strand of first gene
    :param chr_2: chromosome of second gene
    :param start_2: start position of second gene
    :param end_2: end position of second gene
    :param strand_2: strand of second gene
    :param opposing: if True, only genes on different strands can overlap;
        if False, only genes on the same strand can overlap
    :return: True if both genes are on the same chromosome, fulfil the strand
        requirement and their [start, end] intervals intersect; False otherwise
    :raises AssertionError: if a start is not strictly smaller than its end.
        This is only checked for pairs that passed the chromosome and strand
        checks.
    """
    if chr_1 != chr_2:
        return False

    same_strand = strand_1 == strand_2
    if opposing and same_strand:
        return False
    if not opposing and not same_strand:
        return False

    # Raised explicitly (instead of using an `assert` statement) so the check
    # is not stripped when Python runs with -O; the exception type and message
    # are the same as before.
    if not (start_1 < end_1 and start_2 < end_2):
        raise AssertionError("Start must be less than end")

    # The intervals are disjoint only if one starts after the other has ended.
    return not (start_1 > end_2 or start_2 > end_1)

In [6]:
# Pairwise comparison of every gene with every gene: the outer apply walks
# over the genes (rows of gene_pos_df) and the inner apply compares that gene
# with all genes, giving a square boolean gene x gene table.
# Each row is unpacked positionally, so the column order of gene_pos_df
# (Chromosome, Start, End, Strand) has to match the parameter order of
# overlapping().
# NOTE: this makes one Python-level call per gene pair, so the runtime grows
# quadratically with the number of genes.
overlapping_matrix = gene_pos_df.apply(
    lambda x: gene_pos_df.apply(
        lambda y: overlapping(*x, *y, opposing=True), axis=1
    ), axis=1
)

In [7]:
# get indices of overlapping genes
# np.where returns the row and column positions of all True cells, i.e. one
# entry per ordered pair (gene_1, gene_2) of overlapping genes.
idx_1, idx_2 = np.where(overlapping_matrix)
# Read the labels straight from the axes: row positions from the row index and
# column positions from the columns. This avoids slicing whole rows out of the
# gene x gene matrix just to get at their labels.
gene_1 = overlapping_matrix.index[idx_1]
gene_2 = overlapping_matrix.columns[idx_2]

# One row per ordered pair, carrying the position of gene_1. Copy explicitly so
# that adding a column never touches gene_pos_df.
overlapping_genes = gene_pos_df.loc[gene_1].copy()
overlapping_genes["overlapping_gene"] = gene_2

In [8]:
# Every overlapping pair is listed twice (once as A/B and once as B/A) because
# the overlap test is symmetric, so half the row count is the number of pairs.
print("Number of overlaping genes:", len(overlapping_genes)/2)

Number of overlaping genes: 1555.0


### get additional gene information

In [9]:
# get additional information of genes
# Total counts per gene for every layer, computed as sums over axis 0 of the
# layer (one value per gene, in var_names order). The two "counts" columns use
# the same column-sum form as the spliced/unspliced columns instead of slicing
# the AnnData object once per gene.
gene_info = pd.DataFrame({"counts cellranger": np.array(velocyto.layers["counts"].sum(axis=0)).flatten(),
                        "counts salmon": np.array(alevin.layers["counts"].sum(axis=0)).flatten(),
                        "spliced counts velocyto": np.array(velocyto.layers["spliced"].sum(axis=0)).flatten(),
                        "unspliced counts velocyto": np.array(velocyto.layers["unspliced"].sum(axis=0)).flatten(),
                        "spliced counts alevin": np.array(alevin.layers["spliced"].sum(axis=0)).flatten(),
                        "unspliced counts alevin": np.array(alevin.layers["unspliced"].sum(axis=0)).flatten()},
                        index = velocyto.var_names)

In [10]:
# get mean counts
# Mean count per gene, taken as the mean over axis 0 of the "counts" layer
# instead of slicing the AnnData object once per gene. The values are assigned
# positionally, which is correct because gene_info is indexed by
# velocyto.var_names and alevin was subset to the same genes in the same order.
gene_info["mean counts cellranger"] = np.asarray(velocyto.layers["counts"].mean(axis=0), dtype=float).ravel()
gene_info["mean counts salmon"] = np.asarray(alevin.layers["counts"].mean(axis=0), dtype=float).ravel()

In [32]:
# Attach the per-gene count summaries to every overlapping-gene row, matching
# on the index of both frames.
overlapping_genes = pd.merge(overlapping_genes, gene_info, left_index=True, right_index=True, how='inner')
# "overlapping_gene" only sits in index level 1 when this cell is re-run after
# the multi-index has been set further down. On a fresh top-to-bottom run the
# index has a single level and reset_index(level=1) would raise an IndexError,
# so the level is only moved back into a column when it exists.
if overlapping_genes.index.nlevels > 1:
    overlapping_genes.reset_index(level=1, inplace=True)

### calculate correlations of cellranger counts with velocyto unspliced counts

In [13]:
# Pearson correlation (and its p-value) between the counts of one gene and the
# unspliced counts of another gene
def _counts_and_unspliced(adata, gene1, gene2):
    """Return the flattened dense "counts" layer of gene1 and "unspliced" layer of gene2."""
    counts_vec = adata[:, gene1].layers["counts"].toarray().ravel()
    unspliced_vec = adata[:, gene2].layers["unspliced"].toarray().ravel()
    return counts_vec, unspliced_vec


def gene_pearson(adata, gene1, gene2):
    """Return the Pearson correlation coefficient of gene1 "counts" vs. gene2 "unspliced"."""
    counts_vec, unspliced_vec = _counts_and_unspliced(adata, gene1, gene2)
    result = pearsonr(counts_vec, unspliced_vec)
    return result[0]


def gene_pval(adata, gene1, gene2):
    """Return the p-value of the Pearson correlation of gene1 "counts" vs. gene2 "unspliced"."""
    counts_vec, unspliced_vec = _counts_and_unspliced(adata, gene1, gene2)
    result = pearsonr(counts_vec, unspliced_vec)
    return result[1]

In [37]:
# Pearson r and its p-value for every row: "counts" of the row's own gene
# (the row label) against "unspliced" of its overlapping partner gene.
overlapping_genes["pearson"] = overlapping_genes.apply(
    lambda row: gene_pearson(velocyto, gene1=row.name, gene2=row["overlapping_gene"]),
    axis=1,
)
overlapping_genes["pearson_pval"] = overlapping_genes.apply(
    lambda row: gene_pval(velocyto, gene1=row.name, gene2=row["overlapping_gene"]),
    axis=1,
)

In [38]:
# Spearman correlation between the counts of one gene and the unspliced counts
# of another gene
def gene_spearman(adata, gene1, gene2):
    """Return the full spearmanr result for gene1 "counts" vs. gene2 "unspliced".

    Both layers are converted to dense arrays and flattened before the test;
    the caller picks the statistic and the p-value out of the returned result.
    """
    counts_layer = adata[:, gene1].layers["counts"]
    unspliced_layer = adata[:, gene2].layers["unspliced"]
    return spearmanr(counts_layer.toarray().ravel(), unspliced_layer.toarray().ravel())

In [39]:
# gene_spearman returns the whole spearmanr result, so the "spearman" column
# temporarily holds one result object per row. The p-value (element 1) is
# pulled out first because the last line overwrites "spearman" with the
# statistic (element 0) alone.
overlapping_genes["spearman"] = overlapping_genes.apply(lambda x: gene_spearman(velocyto, x.name, x["overlapping_gene"]), axis=1)
overlapping_genes["spearman_pval"] = overlapping_genes["spearman"].apply(lambda x: x[1])
overlapping_genes["spearman"] = overlapping_genes["spearman"].apply(lambda x: x[0])

### calculate overlap length

In [40]:
def overlap_len(row):
    """Return the length of the region shared by a gene and its overlapping gene.

    ``row`` provides "Start", "End" and "overlapping_gene"; the partner's
    coordinates are looked up in the module-level ``gene_pos_df``. The result
    is the smaller of the two ends minus the larger of the two starts.
    """
    partner = gene_pos_df.loc[row["overlapping_gene"]]
    shared_end = min(row["End"], partner["End"])
    shared_start = max(row["Start"], partner["Start"])
    return shared_end - shared_start

In [41]:
# get total length of overlap for every (gene, overlapping gene) row
overlapping_genes["overlap_length"] = overlapping_genes.apply(overlap_len, axis=1)

In [42]:
# set multi-index
# Append "overlapping_gene" as an additional index level (the existing index is
# kept), so each row is keyed by the gene pair; the frame is modified in place.
overlapping_genes.set_index("overlapping_gene", append=True, inplace=True)

In [43]:
# Pearson correlation between the per-pair Pearson coefficient and the overlap length
# (dropna() discards every row that has a missing value in any column)
pearsonr(overlapping_genes.dropna()["pearson"], overlapping_genes.dropna()["overlap_length"])

PearsonRResult(statistic=0.33979915057869636, pvalue=6.092947569602767e-82)

In [44]:
# Pearson correlation between the per-pair Pearson p-values and the overlap length
# (dropna() discards every row that has a missing value in any column)
pearsonr(overlapping_genes.dropna()["pearson_pval"], overlapping_genes.dropna()["overlap_length"])

PearsonRResult(statistic=-0.20446606862954536, pvalue=1.1413482688633664e-29)

In [45]:
# Pearson correlation between the per-pair Spearman coefficient and the overlap length
# (dropna() discards every row that has a missing value in any column)
pearsonr(overlapping_genes.dropna()["spearman"], overlapping_genes.dropna()["overlap_length"])

PearsonRResult(statistic=0.3258831014324236, pvalue=3.8291459033656237e-75)

In [46]:
# Pearson correlation between the per-pair Spearman p-values and the overlap length
# (dropna() discards every row that has a missing value in any column)
pearsonr(overlapping_genes.dropna()["spearman_pval"], overlapping_genes.dropna()["overlap_length"])

PearsonRResult(statistic=-0.12942861617836002, pvalue=1.1198966342413892e-12)

### calculate proportion of overlap 

In [47]:
# calculate relative overlap
# Overlap length as a fraction of the row's own gene span (End - Start),
# computed with column arithmetic instead of a row-by-row apply.
overlapping_genes["rel_overlap"] = overlapping_genes["overlap_length"] / (overlapping_genes["End"] - overlapping_genes["Start"])

In [48]:
# Pearson correlation between the per-pair Pearson coefficient and the relative overlap
# (dropna() discards every row that has a missing value in any column)
pearsonr(overlapping_genes.dropna()["pearson"], overlapping_genes.dropna()["rel_overlap"])

PearsonRResult(statistic=0.532684352093226, pvalue=1.7729710589427928e-219)

In [49]:
# Pearson correlation between the per-pair Pearson p-values and the relative overlap
# (dropna() discards every row that has a missing value in any column)
pearsonr(overlapping_genes.dropna()["pearson_pval"], overlapping_genes.dropna()["rel_overlap"])

PearsonRResult(statistic=-0.2871459492182551, pvalue=5.041298691657061e-58)

In [50]:
# Spearman correlation between the per-pair Spearman coefficient and the relative overlap
# (dropna() discards every row that has a missing value in any column)
spearmanr(overlapping_genes.dropna()["spearman"], overlapping_genes.dropna()["rel_overlap"])

SignificanceResult(statistic=0.5151973417264634, pvalue=5.0515428767056815e-203)

In [51]:
# Spearman correlation between the per-pair Spearman p-values and the relative overlap
# (dropna() discards every row that has a missing value in any column)
spearmanr(overlapping_genes.dropna()["spearman_pval"], overlapping_genes.dropna()["rel_overlap"])

SignificanceResult(statistic=-0.5041129431339368, pvalue=4.430429806228507e-193)

### save

In [52]:
# save
# Gene positions and the overlapping-gene table are written as gzip-compressed
# pickles; the per-gene count summary is written as CSV.
gene_pos_df.to_pickle("../data/mathew_gene_positions.pkl.gz", compression='gzip')
gene_info.to_csv("../data/mathew_gene_info.csv")
overlapping_genes.to_pickle("../data/mathew_overlapping_genes.pkl.gz", compression='gzip')