# Goal:
## Deconvolute bulk samples with single cell data
# Date:
## May 9th 2024
# Author:
## Carsten Knutsen


In [None]:
import os 
import pandas as pd
import numpy as np
import scanpy as sc
import seaborn as sns 
from rnasieve.preprocessing import model_from_raw_counts
from anndata import AnnData
import scipy
import random
from statannotations.Annotator import Annotator
import matplotlib.pyplot as plt
# Use a 10 pt font for every matplotlib figure drawn after this point.
plt.rcParams['font.size'] = 10
# Folder that receives the bulk-deconvolution figures; created if it is missing.
output = '/home/carsten/alvira_bioinformatics/uterus/data/figures/bulk_deconvolution/'
os.makedirs(output, exist_ok=True)


## Function that takes single-cell and bulk AnnData objects and returns the deconvolution result as a DataFrame together with the fitted model

In [None]:
def sc_ct_proportion_from_bulk(sc_adata, bulk_adata, obs):
    """Deconvolve bulk samples against single-cell reference profiles.

    The single-cell object is split by the labels in ``sc_adata.obs[obs]``;
    each group's expression matrix and the bulk matrix are transposed to
    genes x cells / genes x samples and handed to RNA-Sieve's
    ``model_from_raw_counts`` (with ``normalization=False``).

    Parameters
    ----------
    sc_adata : AnnData
        Single-cell data. ``.X`` may be sparse or dense.
    bulk_adata : AnnData
        Bulk data restricted to the same genes, in the same order, as
        ``sc_adata``. ``.X`` may be sparse or dense.
    obs : str
        Column of ``sc_adata.obs`` holding the grouping labels
        (e.g. cell type or lineage).

    Returns
    -------
    df
        Output of ``model.predict(psis)`` re-indexed by the bulk sample
        names -- presumably one row per bulk sample and one column per
        label; confirm against the RNA-Sieve documentation.
    model
        The fitted model returned by ``model_from_raw_counts``.
    """
    def _dense(matrix):
        # Sparse matrices expose ``toarray``; anything else is treated as
        # already dense, so the function works with either storage format.
        if hasattr(matrix, 'toarray'):
            return matrix.toarray()
        return np.asarray(matrix)

    labels = sc_adata.obs[obs]
    cell_type_array_dict = {}
    # Missing labels are skipped: ``labels == NaN`` never matches, which
    # would otherwise create an empty reference matrix for that key.
    for ct in labels.dropna().unique():
        ct_adata = sc_adata[labels == ct, :]
        # Transpose to genes x cells for this label.
        cell_type_array_dict[ct] = _dense(ct_adata.X).T
    # Transpose to genes x samples.
    bulk_array = _dense(bulk_adata.X).T
    model, psis = model_from_raw_counts(cell_type_array_dict, bulk_array, normalization=False)
    df = model.predict(psis)
    # Label the result rows with the bulk sample names.
    df.index = bulk_adata.obs_names
    return df, model

# Load in bulk RNAseq data

In [None]:
# First sequencing batch: tab-separated gene count table.
data1 = pd.read_csv(
    '/home/carsten/alvira_bioinformatics/uterus/data/bulk/Result_X202SC24043118-Z01-F001_Homo_sapiens/3.Quant/1.Count/gene_count.csv',
    sep='\t',
)
# Lookup from gene_id to gene_name, used later to relabel the bulk genes.
gene_dict = dict(zip(data1['gene_id'].tolist(), data1['gene_name'].tolist()))
# Index rows by gene_id and keep columns 1 through 8 (positional selection).
data1 = data1.set_index('gene_id', drop=False).iloc[:, 1:9]
gene_dict

In [None]:
# Second sequencing batch: tab-separated gene count table.
data2 = pd.read_csv(
    '/home/carsten/alvira_bioinformatics/uterus/data/bulk/Result_X202SC22123874-Z01-F001/3.Quant/1.Count/gene_count.csv',
    sep='\t',
)
# Index rows by gene_id and keep columns 1 through 12 (positional selection).
data2 = data2.set_index('gene_id', drop=False).iloc[:, 1:13]
data2

In [None]:
# Place the two batches side by side, aligning rows on the gene_id index.
bulk_data = pd.concat((data1, data2), axis='columns')
bulk_data

In [None]:
# Sample name -> group label, written one group at a time.
# Insertion order matches the original listing.
sample_group = {
    **dict.fromkeys(['pt11', 'pt13', 'pt21', 'pt25', 'pt38'], 'TL-BC'),
    **dict.fromkeys(['pt22', 'pt24', 'pt36', 'pt37', 'pt41', 'pt33', 'pt4'], 'TL-GC'),
    **dict.fromkeys(['PTL99', 'PTL108', 'PTL109', 'PTL133'], 'PTL'),
    **dict.fromkeys(['PTNL48', 'PTNL101', 'PTNL111', 'PTNL119'], 'PTNL'),
}

In [None]:
# Transpose so that samples are observations and genes are variables.
bulk_adata = AnnData(bulk_data.transpose())
# Scale every sample to a total of one million, then apply log10(x + 1).
sc.pp.normalize_total(bulk_adata, target_sum=1e6)
sc.pp.log1p(bulk_adata, base=10)
# Relabel genes from gene_id to gene_name, then de-duplicate repeated names.
bulk_adata.var_names = list(map(gene_dict.__getitem__, bulk_adata.var_names))
bulk_adata.var_names_make_unique()
# Annotate each sample with its group label.
bulk_adata.obs['Treatment'] = list(map(sample_group.__getitem__, bulk_adata.obs_names))

## Load in single-cell RNA-seq data
## Set the expression matrix to the stored log10 layer
## Adjust metadata to record the log base

In [None]:
# Single-cell object read from disk.
sc_adata = sc.read(
    '/home/carsten/alvira_bioinformatics/uterus/data/single_cell_files/scanpy_files/uterus_processed_celltyped.gz.h5ad'
)
# Replace X with a copy of the stored 'log10' layer.
sc_adata.X = sc_adata.layers['log10'].copy()
# Record base 10 in the existing log1p metadata entry.
sc_adata.uns['log1p'].update(base=10)

In [None]:
# Gene list taken from the first, header-less column of the planning file.
merscope_genes = pd.read_csv(
    '/home/carsten/alvira_bioinformatics/uterus/data/pilot/240325_merscope_planning/fpkm_ls_final.csv',
    header=None,
    index_col=0,
).index.tolist()
# Highly variable single-cell genes that are also present in the bulk data.
overlap_hv = [
    gene
    for gene in sc_adata.var.index[(sc_adata.var['highly_variable'] == True).to_numpy()]
    if gene in bulk_adata.var_names
]
# All single-cell genes that are also present in the bulk data.
overlap = sc_adata.var_names[sc_adata.var_names.isin(bulk_adata.var_names)].tolist()
# Genes from the planning list that are present in the single-cell data.
overlap_merscope = list(filter(sc_adata.var_names.__contains__, merscope_genes))

In [None]:
# Deconvolve on the shared highly variable genes, grouped by 'Cell Subtype'.
df, model = sc_ct_proportion_from_bulk(
    sc_adata[:, overlap_hv],
    bulk_adata[:, overlap_hv],
    'Cell Subtype',
)
# Rescale the result by 100 (fractions to percentages).
perc_df_ct = 100 * df
perc_df_ct

In [None]:
# Deconvolve on the shared highly variable genes, grouped by 'Lineage'.
df, model = sc_ct_proportion_from_bulk(
    sc_adata[:, overlap_hv],
    bulk_adata[:, overlap_hv],
    'Lineage',
)
# Rescale the result by 100 (fractions to percentages).
perc_df_lin = 100 * df
perc_df_lin