## Visualization of sample contribution to datasets and composition (ESCC, CRC, and LUAD)
In this notebook we visualize the relative sample sizes within each dataset and the per-sample malignant/non-malignant ratio.

In [None]:
import os
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math

from load_data import load_datasets
from constants import BASE_PATH_DATA

In [None]:
# Inject a CSS rule that sets the `.container` element to 95% width
# (presumably to widen the notebook page so large figures fit).
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Global matplotlib defaults: fonttype 42 (TrueType) for PDF output,
# Arial as the sans-serif family, 14 pt base font size.
# NOTE(review): the figures below are saved as SVG, which is governed by
# 'svg.fonttype' rather than 'pdf.fonttype' -- confirm this is intended.
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

In [None]:
def get_proportions(adata):
    """Summarise cell counts and malignancy composition per sample.

    Parameters
    ----------
    adata
        Any object exposing an ``obs`` DataFrame with the columns
        ``sample_id`` and ``malignant_key``. Cells whose ``malignant_key`` is
        neither ``'malignant'`` nor ``'non-malignant'`` count towards
        ``nr_cells`` only.

    Returns
    -------
    pandas.DataFrame
        One row per sample (index ``sample_id``), sorted ascending by
        ``nr_cells``, with the columns ``nr_cells``, ``nr_mal_cells``,
        ``nr_non-mal_cells``, ``pct_cells`` (share of all cells),
        ``pct_mal_cells`` and ``pct_non-mal_cells`` (shares within the sample).
    """
    obs = adata.obs
    malignancy = obs['malignant_key']

    # One indicator column per quantity; summing them per sample yields the
    # counts without calling a Python lambda for every group.
    per_cell = pd.DataFrame({
        'sample_id': obs['sample_id'],
        'nr_cells': 1,
        'nr_mal_cells': (malignancy == 'malignant').astype('int64'),
        'nr_non-mal_cells': (malignancy == 'non-malignant').astype('int64'),
    })

    # observed=True: a categorical sample_id may still carry categories that
    # no longer occur (e.g. after subsetting); those must not become rows.
    sample_counts = per_cell.groupby('sample_id', observed=True).sum()

    total_cells = sample_counts['nr_cells'].sum()
    sample_counts['pct_cells'] = sample_counts['nr_cells'] / total_cells
    sample_counts['pct_mal_cells'] = sample_counts['nr_mal_cells'] / sample_counts['nr_cells']
    sample_counts['pct_non-mal_cells'] = sample_counts['nr_non-mal_cells'] / sample_counts['nr_cells']
    return sample_counts.sort_values(by='nr_cells')
    

def plot_pie(dataset, adata, storing_path, figsize=(10,10)):
    """Plot, save and show a pie chart of each sample's share of all cells.

    Parameters
    ----------
    dataset : str
        Dataset name; used as the prefix of the output file name.
    adata
        Object whose ``obs.sample_id`` column assigns every cell to a sample.
    storing_path : str
        Directory the SVG file is written to.
    figsize : tuple
        Figure size passed on to the pandas pie plot.
    """
    cells_per_sample = adata.obs.sample_id.value_counts()
    cells_per_sample.plot.pie(autopct='%1.1f%%', figsize=figsize, cmap='gist_rainbow')
    plt.tight_layout()
    # Write to the directory given by the caller; previously this argument was
    # ignored in favour of the notebook-level `base_storing_path`.
    # The file name (including its spelling) is kept unchanged so existing
    # outputs and downstream references still match.
    plt.savefig(os.path.join(storing_path, f'{dataset}_overall_proprotion.svg'))
    plt.show()
    
    
def plot_sid_pies(sample_counts, factor=2, figsize=(10,10), dataset=None, storing_path=None):
    """Plot, save and show one malignant/non-malignant pie chart per sample.

    Parameters
    ----------
    sample_counts : pandas.DataFrame
        One row per sample with the columns ``nr_mal_cells`` and
        ``nr_non-mal_cells``.
    factor : number
        If greater than 0, the figure size is ``(12*factor, 4*factor)``;
        otherwise ``figsize`` is used.
    figsize : tuple
        Figure size used when ``factor`` is not positive.
    dataset : str, optional
        Dataset name for the figure title and output file name. Defaults to
        the notebook-level variable ``dataset``.
    storing_path : str, optional
        Output directory. Defaults to the notebook-level variable
        ``base_storing_path``.
    """
    # Fall back to the notebook-level variables so existing calls that rely on
    # them keep working, while explicit arguments make the function usable
    # independently of cell execution order.
    notebook_scope = globals()
    if dataset is None:
        dataset = notebook_scope['dataset']
    if storing_path is None:
        storing_path = notebook_scope['base_storing_path']

    nr_samples = len(sample_counts)
    interesting_cols = sample_counts[['nr_mal_cells', 'nr_non-mal_cells']].copy()
    interesting_cols.columns = ['malignant', 'non-malignant']

    # Subplot grid: 6 rows and as many columns as needed to fit all samples.
    nr_cols = int(math.ceil(nr_samples / 6))
    layout = (6, nr_cols)

    fs = (12 * factor, 4 * factor) if factor > 0 else figsize

    # After transposing, every column is one sample, so `subplots=True` draws
    # one pie per sample with the two wedges malignant / non-malignant.
    interesting_cols.T.plot.pie(
        subplots=True,
        layout=layout,
        figsize=fs,
        legend=False,
        title=f'{dataset.upper()} malignancy composition per sample',
    )

    plt.tight_layout()
    plt.savefig(os.path.join(storing_path, f'{dataset}_persid_mal_proprotion.svg'))

    # Report the final figure size to help with choosing `factor` / `figsize`.
    fig = plt.gcf()
    print(fig.get_size_inches())
    plt.show()

In [None]:
# Same matplotlib defaults as set near the top of the notebook, applied again
# (identical values, so re-running this cell changes nothing).
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

In [None]:
# Normalization variant passed to `load_datasets` for every dataset below.
norm_method = 'mean'

# Suffix associated with the chosen normalization; any method other than
# 'median' or 'CP10k' maps to an empty suffix.
appendix = {'median': '_med', 'CP10k': '_cp10k'}.get(norm_method, '')

In [None]:
# Directory that receives every CSV table and SVG figure written below.
base_storing_path = os.path.join(BASE_PATH_DATA, 'data_proportions')
# Create it up front: neither DataFrame.to_csv nor plt.savefig creates missing
# parent directories, so a fresh checkout would otherwise fail on first write.
os.makedirs(base_storing_path, exist_ok=True)

## ESCC

In [None]:
# Dataset handled by the following cells; `dataset` also prefixes the output file names.
dataset='escc'

# Load the dataset through the project helper with the normalization chosen above
# (presumably returns an AnnData-like object -- confirm in load_data).
adata = load_datasets(dataset, preprocessed=True, norm_method=norm_method)

In [None]:
# Per-sample cell counts and malignant/non-malignant fractions; display the first rows.
sample_counts = get_proportions(adata)
sample_counts.head()

In [None]:
# Save the per-sample table as CSV in the output directory.
sample_counts.to_csv(os.path.join(base_storing_path, f'{dataset}_sample_cell_proportions.csv'))

In [None]:
# Pie chart of each sample's share of all cells in the dataset.
plot_pie(dataset, adata, base_storing_path)

In [None]:
# One malignant/non-malignant pie per sample, using the default figure scaling (factor=2).
plot_sid_pies(sample_counts)

## CRC

In [None]:
# Dataset handled by the following cells; `dataset` also prefixes the output file names.
dataset='crc'

# Load the dataset through the project helper with the normalization chosen above
# (presumably returns an AnnData-like object -- confirm in load_data).
adata = load_datasets(dataset, preprocessed=True, norm_method=norm_method)

In [None]:
# Per-sample cell counts and malignant/non-malignant fractions; display the first rows.
sample_counts = get_proportions(adata)
sample_counts.head()

In [None]:
# Save the per-sample table as CSV in the output directory.
sample_counts.to_csv(os.path.join(base_storing_path, f'{dataset}_sample_cell_proportions.csv'))

In [None]:
# Pie chart of each sample's share of all cells in the dataset.
plot_pie(dataset, adata, base_storing_path)

In [None]:
# One malignant/non-malignant pie per sample, using the default figure scaling (factor=2).
plot_sid_pies(sample_counts)

## LUAD

In [None]:
# Dataset handled by the following cells; `dataset` also prefixes the output file names.
dataset='luad'

# Load the dataset through the project helper with the normalization chosen above
# (presumably returns an AnnData-like object -- confirm in load_data).
adata = load_datasets(dataset, preprocessed=True, norm_method=norm_method)

In [None]:
# Per-sample cell counts and malignant/non-malignant fractions; display the first rows.
sample_counts = get_proportions(adata)
sample_counts.head()

In [None]:
# Save the per-sample table as CSV in the output directory.
sample_counts.to_csv(os.path.join(base_storing_path, f'{dataset}_sample_cell_proportions.csv'))

In [None]:
# Pie chart of each sample's share of all cells in the dataset.
plot_pie(dataset, adata, base_storing_path)

In [None]:
# factor=0 disables the factor-based sizing, so the function's default figsize (10, 10) is used.
plot_sid_pies(sample_counts, factor=0)