In [42]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
import pickle
import random
from multiprocessing import Pool

In [2]:

data_root = '/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/'
os.listdir(data_root)

['Astrocytes_sALSvsCTRL',
 'IN-SST_C9ALSvsCTRL',
 'L4_C9ALSvsCTRL',
 'Inhibitory_sALSvsCTRL',
 'L5-6-CC_C9ALSvsCTRL',
 'Excitatory_sALSvsCTRL',
 'L5-6_C9ALSvsCTRL',
 'Excitatory_C9ALSvsCTRL',
 'Astrocytes_C9ALSvsCTRL',
 'L2-3_sALSvsCTRL',
 'AST-FB_sALSvsCTRL',
 'Oligodendrocytes_C9ALSvsCTRL',
 'get_fasta_for_switches.sh',
 'sALS_ALL_training_test_data.pkl',
 'train_data.npy',
 'OPC_C9ALSvsCTRL',
 'all_seqs_celltypes_input',
 'valid_data.npy',
 'REDU_plots',
 'tst_train_data.npy',
 'IN-VIP_C9ALSvsCTRL',
 'L4_sALSvsCTRL',
 'AST-PP_C9ALSvsCTRL',
 'IN-VIP_sALSvsCTRL',
 'AST-PP_sALSvsCTRL',
 'L5-6-CC_sALSvsCTRL',
 'all_seqs_celltypes_input.pkl',
 'IN-PV_C9ALSvsCTRL',
 'c9als_all_seqs_celltypes_input.pkl',
 'Oligodendrocytes_sALSvsCTRL',
 'IN-SST_sALSvsCTRL',
 'OPC_sALSvsCTRL',
 'L2-3_C9ALSvsCTRL',
 'test_data.npy',
 'Microglia_sALSvsCTRL',
 'Inhibitory_C9ALSvsCTRL',
 'C9ALS_ALL_training_test_data.pkl',
 'Microglia_C9ALSvsCTRL',
 'Endothelial_ALSvsCTRL',
 'TF_modisco',
 'IN-PV_sALSvsCTRL',
 

In [4]:
# C9ALS first # keep main celltyeps only
ct = ['Excitatory_C9ALSvsCTRL', 'Astrocytes_C9ALSvsCTRL','Oligodendrocytes_C9ALSvsCTRL',
'OPC_C9ALSvsCTRL', 'Inhibitory_C9ALSvsCTRL', 'Microglia_C9ALSvsCTRL']

In [6]:
sequences_dict = {}
for ct_cn in ct:
    inp_fa = data_root + "/{}/switch_DNA_sequence.fa".format(ct_cn)
    inp_fa = SeqIO.parse(inp_fa, "fasta")
    for rec in inp_fa:
        if rec.id not in sequences_dict:
            sequences_dict[rec.id] = str(rec.seq)
        else:
            continue 
print(len(sequences_dict))
celltypes = [e.split('_')[0] for e in ct]
celltypes = sorted(celltypes)

103275


In [36]:
def transcribe_positive_strand(seq):
    """ input is the 5' to 3' coding squence
        so the RNA will be exact sequence except
        U instead of T
    """
    return(seq.replace('T','U'))

def transcribe_negative_strand(seq):
    """ input is the 5' to 3' template squence
        so the function complement and returns
        the reverse of sequence
    """
    complement = {'A': 'U', 'C': 'G', 'G': 'C', 'T': 'A'}
    return "".join(complement.get(base, base) for base in reversed(seq))

In [37]:
transcribed_sequences = {}
for key,value in sequences_dict.items():
    strand = key.split(':')[-1]
    if strand == '+':
        transcribed_sequences[key] = transcribe_positive_strand(value)
    else:
        transcribed_sequences[key] = transcribe_negative_strand(value)

In [38]:
len(transcribed_sequences)

103275

In [40]:
def get_1h_seq(seq):
    """
    This is a simple code to get one-hot representaton of the
    senquences.
    --------------
    Arguments:
    seq: RNA sequence in fasta format
    """
    nt_code = {
        "N": [0, 0, 0, 0],
        "A": [1, 0, 0, 0],
        "C": [0, 1, 0, 0],
        "G": [0, 0, 1, 0],
        "U": [0, 0, 0, 1],
    }

    seq_1h = [nt_code[nt] for nt in seq]
    return np.array(seq_1h)

def makes_seqs_ready(seq,max_len,left_pad_max):
    """
    takes in the sequence, max len and max left pad
    and returns the one hot representation of sequence with maximum pad
    """
    if len(seq) < max_len:
        diff = max_len - len(seq)
        lp = random.randint(0, left_pad_max)
        lp = min(lp, diff)
        seq = get_1h_seq(
            "N" * (lp) + str(seq) + "N" * (max_len - len(seq) - lp)
        )
    else:
        seq = get_1h_seq(str(seq))
    seq = np.array(seq, dtype=np.int8).swapaxes(0, 1)
    
    return seq


In [43]:
one_hot_values = { key : makes_seqs_ready(seq, 16000, 10) for key, seq in transcribed_sequences.items()}

In [47]:
keys = list(one_hot_values.keys())
seqs = [transcribed_sequences[k] for k in keys]
indexes = list(range(len(keys)))
onehots = [one_hot_values[k] for k in keys]

seqs_data = pd.DataFrame([indexes, keys, seqs, onehots]).T
seqs_data.columns = ['switch_index', 'switch_id', 'sequence', 'one_hot']
seqs_data

Unnamed: 0,switch_index,switch_id,sequence,one_hot
0,0,chr12:AACS:125140928:125143316:+,GUGAGGCGGGACAAACUUGUCUUCCUCACACCCAUCUUACUUCCUC...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
1,1,chr15:AAGAB:67201028:67201672:-,GGAGGUUAAGGAGAAAUCUUUUUUUUCCUCAGUAUAUUGUAAGAGA...,"[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,..."
2,2,chr15:AAGAB:67201028:67202710:-,ACUACAUCAUAAACAUGUCUUUGAAACCCGUCUCCCAUCUUCUAGU...,"[[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,..."
3,3,chr15:AAGAB:67201672:67202710:-,ACUACAUCAUAAACAUGUCUUUGAAACCCGUCUCCCAUCUUCUAGU...,"[[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,..."
4,4,chr15:AAGAB:67202710:67217079:-,AUCUAUUAUAGUUCAAUCCCCAGUAAUGCAGAUGGAGGAGGAUUAG...,"[[1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,..."
...,...,...,...,...
103270,103270,chr1:ZZZ3:77626869:77627900:-,GAAACGUUCUCCAGGUAAUUGCUUUUUGAAACCGAGGUUGAGCAUU...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,..."
103271,103271,chr1:ZZZ3:77626869:77631145:-,CAAUUUGAGGAGAAUAAUCUUAGUCCUAAUGAAACAAAUGCAACUG...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,..."
103272,103272,chr1:ZZZ3:77627900:77629840:-,AGAUCAAAGUAUCAAAUCCUGGUUGUGGGACAUAAUAUUUUUUUUG...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,..."
103273,103273,chr1:ZZZ3:77627900:77631145:-,CAAUUUGAGGAGAAUAAUCUUAGUCCUAAUGAAACAAAUGCAACUG...,"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,..."


In [48]:
seqs_data.to_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/All_sequences.csv')

In [35]:
import scipy.stats as ss

def rank_to_normal(rank, c, n):
    # Standard quantile function
    x = (rank - c) / (n - 2*c + 1)
    return ss.norm.ppf(x)

def rank_INT(series, c=3.0/8, stochastic=True):
    """ Perform rank-based inverse normal transformation on pandas series.
        If stochastic is True ties are given rank randomly, otherwise ties will
        share the same value. NaN values are ignored.
        Args:
            param1 (pandas.Series):   Series of values to transform
            param2 (Optional[float]): Constand parameter (Bloms constant)
            param3 (Optional[bool]):  Whether to randomise rank of ties
        
        Returns:
            pandas.Series
    """

    # Check input
    #assert(isinstance(series, pd.Series))
    #assert(isinstance(c, float))
    #assert(isinstance(stochastic, bool))

    # Set seed
    np.random.seed(123)

    # Take original series indexes
    orig_idx = series.index

    # Drop NaNs
    series = series.loc[~pd.isnull(series)]

    # Get ranks
    if stochastic == True:
        # Shuffle by index
        series = series.loc[np.random.permutation(series.index)]
        # Get rank, ties are determined by their position in the series (hence
        # why we randomised the series)
        rank = ss.rankdata(series, method="ordinal")
    else:
        # Get rank, ties are averaged
        rank = ss.rankdata(series, method="average")

    # Convert numpy array back to series
    rank = pd.Series(rank, index=series.index)

    # Convert rank to normal distribution
    transformed = rank.apply(rank_to_normal, c=c, n=len(rank))
    
    return transformed[orig_idx]

def get_sig_lfc(df, name):
    tmp_df =  df.loc[df['switch_name'] == name]
    lfc = round(float(tmp_df['LFC_rand_INT']),4)
    return(lfc)
    
def get_sig_mult(val):
    if val > 1.3:
        return 1
    else:
        return 0
## this will take couple of minutes ~ 10-15 min 
labels = {}
for key in sequences_dict.keys():
    labels[key] = {}
for ct in celltypes:
    df_name = data_root + ct + "_C9ALSvsCTRL/APAlog_res_metadata_added.tsv"
    inp_df = pd.read_csv(df_name, sep='\t')
    inp_df['sig_multiplyer'] = inp_df['negative_logFDR'].apply(get_sig_mult) 
    inp_df['sig_LFC_PA_Usage'] = inp_df['sig_multiplyer'] * inp_df['LFC_PA_Usage']
    inp_df['LFC_rand_INT'] = rank_INT(inp_df['sig_LFC_PA_Usage'])
    for key in list(labels.keys()):
        if key in inp_df['switch_name'].values:
            res = get_sig_lfc(inp_df, key)
            labels[key][ct] = res
        else:
            continue

KeyError: '[28289, 32420] not in index'

In [34]:
inp_df

Unnamed: 0,transcript,p_devtest,fdr_p_devtest,ref_site,alt_site,b_intercept,p_intercept,b_ConditionALS_pathology,p_ConditionALS_pathology,strand,multiplyer,LFC_PA_Usage,negative_logFDR,switch_width,bed,correction_multiplyer,switch_name,sig_multiplyer,sig_LFC_PA_Usage,LFC_rand_INT
0,A2ML1,0.010296,0.05251,chr12:8875813:+,chr12:8876783:+,1.180695,2.457383e-24,-0.561181,0.001745,+,1,-0.561181,2.758217,970.0,"chr12,8875813,8876783",1,chr12:A2ML1:8875813:8876783:+,1,-0.561181,-1.297083
1,A2ML1,0.010296,0.05251,chr12:8875813:+,chr12:8877345:+,1.317617,8.862662e-31,-0.369732,0.032501,+,1,-0.369732,1.488106,1532.0,"chr12,8875813,8877345",1,chr12:A2ML1:8875813:8877345:+,1,-0.369732,-0.887375
2,A2ML1,0.010296,0.05251,chr12:8875813:+,chr12:8879773:+,-0.152677,3.064243e-01,-0.155796,0.490108,+,1,-0.155796,0.309708,3960.0,"chr12,8875813,8879773",1,chr12:A2ML1:8875813:8879773:+,0,-0.000000,0.341955
3,A2ML1,0.010296,0.05251,chr12:8876783:+,chr12:8877345:+,0.136922,7.507363e-02,0.191449,0.143880,+,1,0.191449,0.841998,562.0,"chr12,8876783,8877345",1,chr12:A2ML1:8876783:8877345:+,0,0.000000,0.207048
4,A2ML1,0.010296,0.05251,chr12:8876783:+,chr12:8879773:+,-1.333372,2.422306e-27,0.405385,0.038119,+,1,0.405385,1.418855,2990.0,"chr12,8876783,8879773",1,chr12:A2ML1:8876783:8879773:+,1,0.405385,1.020036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69733,ZZZ3,1.000000,1.00000,chr1:77580998:-,chr1:77618234:-,1.360542,3.884981e-47,0.245579,0.077971,-,-1,-0.245579,1.108069,37236.0,"chr1,77580998,77618234",1,chr1:ZZZ3:77580998:77618234:-,0,-0.000000,0.506905
69734,ZZZ3,1.000000,1.00000,chr1:77580998:-,chr1:77629840:-,0.910625,6.463584e-20,-0.159584,0.290767,-,-1,0.159584,0.536454,48842.0,"chr1,77580998,77629840",1,chr1:ZZZ3:77580998:77629840:-,0,0.000000,0.358886
69735,ZZZ3,1.000000,1.00000,chr1:77593449:-,chr1:77618234:-,1.389197,5.343132e-48,0.659493,0.000025,-,-1,-0.659493,4.600880,24785.0,"chr1,77593449,77618234",1,chr1:ZZZ3:77593449:77618234:-,1,-0.659493,-1.495519
69736,ZZZ3,1.000000,1.00000,chr1:77593449:-,chr1:77629840:-,0.939280,1.086136e-20,0.254330,0.127841,-,-1,-0.254330,0.893330,36391.0,"chr1,77593449,77629840",1,chr1:ZZZ3:77593449:77629840:-,0,-0.000000,0.239367


In [12]:
tst = pd.read_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/Excitatory_C9ALSvsCTRL/APAlog_res_metadata_added.tsv', sep='\t')
tst[tst['correction_multiplyer'] != 1]

Unnamed: 0,transcript,p_devtest,fdr_p_devtest,ref_site,alt_site,b_intercept,p_intercept,b_ConditionALS_pathology,p_ConditionALS_pathology,strand,multiplyer,LFC_PA_Usage,negative_logFDR,switch_width,bed,correction_multiplyer,switch_name,sig_LFC_PA_Usage
785,AFF3,1.0,1.0,chr2:100005422:-,chr2:99547286:-,-0.08562685,0.5237758,0.816864,2.447591e-06,-,-1,0.816864,5.611261,458136,"chr2,100005422,99547286",-1,chr2:AFF3:99547286:100005422:-,0.816864
786,AFF3,1.0,1.0,chr2:100005422:-,chr2:99547618:-,0.4441747,0.0001905063,0.0802,0.625978,-,-1,0.0802,0.203441,457804,"chr2,100005422,99547618",-1,chr2:AFF3:99547618:100005422:-,0.0
787,AFF3,1.0,1.0,chr2:100005422:-,chr2:99600938:-,-0.3909173,0.007529864,-0.669605,0.003602089,-,-1,-0.669605,2.443446,404484,"chr2,100005422,99600938",-1,chr2:AFF3:99600938:100005422:-,-0.669605
788,AFF3,1.0,1.0,chr2:100005422:-,chr2:99633785:-,0.2633557,0.03306239,-0.129038,0.4597127,-,-1,-0.129038,0.337514,371637,"chr2,100005422,99633785",-1,chr2:AFF3:99633785:100005422:-,0.0
789,AFF3,1.0,1.0,chr2:100005422:-,chr2:99642923:-,0.2987577,0.01483785,0.049487,0.7708457,-,-1,0.049487,0.113033,362499,"chr2,100005422,99642923",-1,chr2:AFF3:99642923:100005422:-,0.0
790,AFF3,1.0,1.0,chr2:100005422:-,chr2:99644397:-,0.6837243,1.988849e-09,-0.735404,1.937619e-05,-,-1,-0.735404,4.712732,361025,"chr2,100005422,99644397",-1,chr2:AFF3:99644397:100005422:-,-0.735404
791,AFF3,1.0,1.0,chr2:100005422:-,chr2:99815054:-,0.5907781,3.386167e-07,-0.578265,0.000764942,-,-1,-0.578265,3.116371,190368,"chr2,100005422,99815054",-1,chr2:AFF3:99815054:100005422:-,-0.578265
792,AFF3,1.0,1.0,chr2:100005422:-,chr2:99868018:-,0.1202549,0.3460733,-0.276682,0.1327645,-,-1,-0.276682,0.876918,137404,"chr2,100005422,99868018",-1,chr2:AFF3:99868018:100005422:-,0.0
1842,AP2A2,0.061881,0.149725,chr11:1010921:+,chr11:927935:+,-2.181857,1.368889e-63,0.294951,0.06904877,+,1,-0.294951,1.160844,82986,"chr11,1010921,927935",-1,chr11:AP2A2:927935:1010921:+,0.0
1843,AP2A2,0.061881,0.149725,chr11:1012240:+,chr11:927935:+,-2.276764,1.039013e-69,0.491348,0.00244563,+,1,-0.491348,2.611609,84305,"chr11,1012240,927935",-1,chr11:AP2A2:927935:1012240:+,-0.491348


In [32]:
- np.log10(0.05)

1.3010299956639813

In [31]:
- np.log10(4.597127e-01)

0.3375134982863807