In [None]:
import glob
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import os

import scanpy as sc
import seaborn as sns

from scroutines import basicu

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tools.sm_exceptions import ValueWarning
from tqdm import tqdm

In [None]:
# Inputs: RNA/multiome AnnData (only its .obs table is used) and the per-cell sex calls.
f1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome.h5ad'
f2 = '/u/home/f/f7xiesnm/v1_multiome/multiome_cell_sex_assignment_saumya.csv'

# Sex assignment table; its 'cell' column is used as the join key below.
df_sex = pd.read_csv(f2)
sex_by_cell = df_sex.set_index('cell')

# Left-join the sex calls onto the cell metadata by index (cell id).
# Cells without an entry in the CSV end up with missing values in the joined columns.
meta = sc.read(f1, backed='r').obs.join(sex_by_cell)
meta

In [None]:
# Conditions and subclasses for which a per-(subclass, condition) ATAC file is read below.
sample_conditions = ['P6', 'P8', 'P10', 'P12', 'P17', 'P12DR', 'P14DR', 'P17DR', 'P21DR'] # no P14 P21
subclasses = ['Astro',
              'L2/3', 'L4', 'L5IT', 'L6IT',
              'L5PT', 'L5NP', 'L6CT', 'L6b',
              'Lamp5', 'Pvalb', 'Sst', 'Vip',
              'OD', 'OPC', 'Micro',
             ] # no others

ddir = '/u/home/f/f7xiesnm/v1_multiome/atac_fragments/pmat_snap_v2/organized'

# For every (subclass, condition) pair, keep the cell ids found in BOTH the
# ATAC file and the RNA metadata, and report the fraction of RNA cells retained.
cells_both_all = []
for subclass in subclasses:
    # File names cannot contain '/', so 'L2/3' is stored as 'L23'.
    subclass_cure = subclass.replace('/', '')
    for exp_cond in sample_conditions:
        # atac cells
        f = f'{ddir}/pmat_{subclass_cure}_consensus_{exp_cond}.h5ad'
        adata_atac = sc.read(f, backed='r')
        try:
            cells_atac = adata_atac.obs.index
        finally:
            # Only the cell ids are needed; release the backed file handle
            # instead of leaving one open per (subclass, condition) file.
            adata_atac.file.close()

        # rna cells
        metasub = meta[((meta['Age']==exp_cond) & (meta['Subclass']==subclass))]
        cells_rna = metasub.index.values
        cells_both = np.intersect1d(cells_atac, cells_rna)

        # Guard against pairs with no RNA cells (would otherwise raise ZeroDivisionError).
        n_rna = len(cells_rna)
        frac_both = len(cells_both)/n_rna if n_rna > 0 else np.nan
        print(subclass, exp_cond, frac_both)
        cells_both_all.append(cells_both)

# Flatten the per-pair arrays into one array of cell ids.
cells_both_all = np.hstack(cells_both_all)

In [None]:
# Restrict the metadata to the cells found in both modalities; print the shape before and after.
shape_before = meta.shape
print(shape_before)
meta = meta.loc[cells_both_all, :]
print(meta.shape)

In [None]:
# Cast the grouping columns to plain strings before grouping/filtering on them.
for col in ['Age', 'Sample', 'Subclass']:
    meta[col] = meta[col].astype(str)
print(meta.shape)

# filter sex assignment (remove undetermined)
# `sex` is not cast to str, so undetermined cells are usually real missing values
# (no match in the left join, or 'nan'/empty CSV entries that pd.read_csv parses as NaN).
# A plain `!= 'nan'` comparison keeps those rows, so test for missing values explicitly
# and also drop the literal string 'nan' in case the column was stringified upstream.
sex_known = meta['sex'].notna() & (meta['sex'].astype(str) != 'nan')
meta = meta[sex_known]
print(meta.shape)

# filter condition - at least 2 samples having both sex
meta = meta[~meta['Age'].isin(['P14', 'P21'])]
print(meta.shape)

# filter subclass - keep subclasses with more than 10 cells in every sample
# (a subclass missing from a sample counts as 0 and is therefore removed)
subclass_abundance = meta.groupby(['Subclass', 'Sample']).size().unstack().fillna(0)
subclass_abundance_pass = subclass_abundance[subclass_abundance.min(axis=1) > 10]
subclasses = subclass_abundance_pass.index.values
meta = meta[meta['Subclass'].isin(subclasses)]
print(meta.shape)

In [None]:
# Sorted unique subclass and age labels that survived the filters.
uniq_subclasses = np.unique(meta['Subclass'].values)
uniq_conditions = np.unique(meta['Age'].values)
for labels in (uniq_subclasses, uniq_conditions):
    print(labels)

In [None]:
# Cell counts per (Sample, sex) row and Subclass column; unobserved combinations become 0.
cells_per_group = meta.groupby(['Sample', 'sex', 'Subclass']).size()
meta_counts = cells_per_group.unstack().fillna(0)
meta_counts

In [None]:
# Summary statistics of the per-(Sample, sex) cell counts, one column per subclass.
meta_counts.describe()

In [None]:
# Cells per (Subclass, Age, Sample) split by sex, one column per sex label.
# A sex that is absent from a sample is filled with 0; taking the minimum over
# only the observed sexes would instead report the count of the single sex present.
# NOTE(review): if only one sex label occurs in the whole table there is just one
# column and the minimum degenerates to that sex's count - confirm both are present.
cells_by_sex = (meta.groupby(['Subclass', 'Age', 'Sample', 'sex']).size()
                    .unstack('sex', fill_value=0)
               )

meta_counts2 = (cells_by_sex.min(axis=1)                   # smaller among M & F (0 if one is missing)
                    .groupby(['Subclass', 'Age']).max()    # largest sample
                )

# A (Subclass, Age) case passes when its best sample has > 100 cells of the rarer sex.
case_thresholds = meta_counts2 > 100
case_thresholds_map = case_thresholds.unstack().T

# Pass/fail map: rows = Age, columns = Subclass.
sns.heatmap(case_thresholds_map, cmap='rocket_r')

In [None]:
# Show the per-(Subclass, Age) counts that were thresholded above: rows = Age, columns = Subclass.
meta_counts2.unstack().T

In [None]:
# Directory holding the per-(condition, subclass) result CSVs read in the next cell.
# NOTE: this overwrites the `ddir` that pointed at the ATAC pmat files earlier in the notebook.
ddir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_sexual_dimorphism/'

In [None]:
# Significance cut-offs, shared by every (subclass, condition) pair.
x_th = np.log2(2)  # |log2fc| must be greater than this
y_th = 0.05        # qval must be smaller than this
z_th = 0.1         # max_cp10k must be greater than this

kept_columns = ['gene', 'log2fc', 'qval', 'converged', 'max_cp10k']

# Read each result table, select the significant rows, and collect them
# for the pairs flagged as usable in `case_thresholds`.
df_merge = []
for (subclass, condition), isok in case_thresholds.items():

    subclass_cure = subclass.replace('/', '')
    filename = os.path.join(ddir, f'ATAC_{condition}_{subclass_cure}.csv')

    df = pd.read_csv(filename, index_col=0)
    # Row-wise maximum over all columns from the 6th onwards.
    df['max_cp10k'] = df.iloc[:,5:].max(axis=1)

    # A row is significant only if all four criteria hold.
    criteria = [
        np.abs(df['log2fc']) > x_th,
        df['qval'] < y_th,
        df['max_cp10k'] > z_th,
        df['converged'].values,
    ]
    is_sig = np.all(criteria, axis=0)

    print(subclass, condition, np.sum(is_sig))

    df_sig = df[is_sig][kept_columns].copy()
    df_sig['condition'] = condition
    df_sig['subclass'] = subclass
    # Text before the first ':' of the region id.
    df_sig['chr'] = df_sig['gene'].apply(lambda region: region.split(':')[0])

    if isok:
        df_merge.append(df_sig)
    else:
        print('skip:', subclass, condition)

df_merge = pd.concat(df_merge)
df_merge

In [None]:
# Subclasses that have at least one significant region, in order of first appearance.
subclasses = pd.unique(df_merge['subclass'])
(len(subclasses), subclasses)

In [None]:
# Per subclass, list the regions that are significant in more than one condition.
df_merge_add_all = []

for subclass in subclasses:
    in_subclass = df_merge['subclass'] == subclass

    # One row per region: the space-separated conditions in which it is significant.
    conditions_per_region = (
        df_merge[in_subclass]
        .groupby('gene')['condition']
        .agg(lambda conds: ' '.join(conds))
        .sort_values()
        .to_frame()
    )
    conditions_per_region['n_condition'] = (
        conditions_per_region['condition'].apply(lambda joined: len(joined.split(' ')))
    )
    # df_merge_add['n_condition_nr'] = df_merge_add['condition'].apply(lambda x: len([_x for _x in x.split(' ') if not _x.endswith('DR')]))
    ranked = conditions_per_region.sort_values('n_condition', ascending=False)

    # Keep regions seen in at least two conditions and annotate them.
    df_merge_add = (
        ranked[ranked['n_condition'] > 1]
        .assign(subclass=subclass)
        .assign(chr=lambda d: [region.split(':')[0] for region in d.index.values])
    )

    df_merge_add_all.append(df_merge_add)

df_merge_add_all = pd.concat(df_merge_add_all)
df_merge_add_all

In [None]:
# Share of (region, condition) instances per chromosome, largest first.
n_instances = df_merge_add_all.groupby('chr')['n_condition'].sum()
instance_share = n_instances/np.sum(n_instances)
instance_share.sort_values(ascending=False)

In [None]:
# Fraction of the recurrent regions (rows) that fall on each chromosome.
n_rows = len(df_merge_add_all)
df_merge_add_all['chr'].value_counts()/n_rows

In [None]:
# Number of recurrent regions (rows) per subclass.
df_merge_add_all['subclass'].value_counts()

In [None]:
# Save the table (index included) to a relative path, i.e. the current working directory.
df_merge_add_all.to_csv('check_ATAC.csv')