In [6]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
import pickle
import random
from multiprocessing import Pool

In [14]:

# Directory whose entries are listed below; later cells build every input path from it.
data_root = '/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/'
os.listdir(data_root)

['Astrocytes_sALSvsCTRL',
 'IN-SST_C9ALSvsCTRL',
 'L4_C9ALSvsCTRL',
 'Inhibitory_sALSvsCTRL',
 'L5-6-CC_C9ALSvsCTRL',
 'Excitatory_sALSvsCTRL',
 'L5-6_C9ALSvsCTRL',
 'Excitatory_C9ALSvsCTRL',
 'Astrocytes_C9ALSvsCTRL',
 'L2-3_sALSvsCTRL',
 'AST-FB_sALSvsCTRL',
 'Oligodendrocytes_C9ALSvsCTRL',
 'get_fasta_for_switches.sh',
 'sALS_ALL_training_test_data.pkl',
 'train_data.npy',
 'OPC_C9ALSvsCTRL',
 'all_seqs_celltypes_input',
 'valid_data.npy',
 'data_for_DL',
 'REDU_plots',
 'tst_train_data.npy',
 'IN-VIP_C9ALSvsCTRL',
 'L4_sALSvsCTRL',
 'AST-PP_C9ALSvsCTRL',
 'IN-VIP_sALSvsCTRL',
 'AST-PP_sALSvsCTRL',
 'L5-6-CC_sALSvsCTRL',
 'all_seqs_celltypes_input.pkl',
 'IN-PV_C9ALSvsCTRL',
 'c9als_all_seqs_celltypes_input.pkl',
 'Oligodendrocytes_sALSvsCTRL',
 'IN-SST_sALSvsCTRL',
 'OPC_sALSvsCTRL',
 'L2-3_C9ALSvsCTRL',
 'test_data.npy',
 'Microglia_sALSvsCTRL',
 'Inhibitory_C9ALSvsCTRL',
 'C9ALS_ALL_training_test_data.pkl',
 'Microglia_C9ALSvsCTRL',
 'Endothelial_ALSvsCTRL',
 'TF_modisco',
 'IN-P

In [3]:
# Keep only the directory entries whose name contains '_C9ALS'.
ct = [entry for entry in os.listdir(data_root) if '_C9ALS' in entry]
ct

['IN-SST_C9ALSvsCTRL',
 'L4_C9ALSvsCTRL',
 'L5-6-CC_C9ALSvsCTRL',
 'L5-6_C9ALSvsCTRL',
 'Excitatory_C9ALSvsCTRL',
 'Astrocytes_C9ALSvsCTRL',
 'Oligodendrocytes_C9ALSvsCTRL',
 'OPC_C9ALSvsCTRL',
 'IN-VIP_C9ALSvsCTRL',
 'AST-PP_C9ALSvsCTRL',
 'IN-PV_C9ALSvsCTRL',
 'L2-3_C9ALSvsCTRL',
 'Inhibitory_C9ALSvsCTRL',
 'Microglia_C9ALSvsCTRL',
 'AST-FB_C9ALSvsCTRL']

In [4]:
# C9ALS first; keep main cell types only
len(ct)

15

In [5]:
# Gather one DNA sequence per switch id over all selected comparisons.
# The first record seen for an id is kept; later records with the same id
# are ignored.
sequences_dict = {}
for ct_cn in ct:
    inp_fa = SeqIO.parse(
        data_root + "/{}/switch_DNA_sequence.fa".format(ct_cn), "fasta"
    )
    for rec in inp_fa:
        sequences_dict.setdefault(rec.id, str(rec.seq))
print(len(sequences_dict))


109095


In [6]:
# Celltype name = text before the first '_' of each directory name, sorted.
celltypes = sorted(e.split('_')[0] for e in ct)

In [8]:
def transcribe_positive_strand(seq):
    """Transcribe a plus-strand sequence given 5' to 3'.

    The RNA is the input itself with every upper-case 'T' swapped for 'U';
    all other characters are returned unchanged.
    """
    rna = seq.replace('T', 'U')
    return rna

def transcribe_negative_strand(seq):
    """Transcribe a minus-strand sequence given 5' to 3'.

    The input is read back to front and each base is replaced by its RNA
    complement (A->U, C->G, G->C, T->A). Characters outside that mapping
    are copied through unchanged.
    """
    to_rna_complement = str.maketrans('ACGT', 'UGCA')
    return seq[::-1].translate(to_rna_complement)

In [8]:
transcribed_sequences = {}
for key,value in sequences_dict.items():
    strand = key.split(':')[-1]
    if strand == '+':
        transcribed_sequences[key] = transcribe_positive_strand(value)
    else:
        transcribed_sequences[key] = transcribe_negative_strand(value)

In [9]:
len(transcribed_sequences)

109095

In [1]:
def get_1h_seq(seq):
    """
    One-hot encode an RNA sequence.
    --------------
    Arguments:
    seq: iterable of nucleotide characters drawn from N, A, C, G, U

    Returns a numpy array with one row per nucleotide and four columns in
    the order A, C, G, U; 'N' becomes an all-zero row. Any other character
    raises KeyError.
    """
    channels = "ACGU"
    # 'N' matches none of the channels, so its row is all zeros.
    nt_code = {
        nt: [int(nt == channel) for channel in channels]
        for nt in "N" + channels
    }

    encoded_rows = []
    for nt in seq:
        encoded_rows.append(nt_code[nt])
    return np.array(encoded_rows)

def makes_seqs_ready(seq,max_len,left_pad_max):
    """
    One-hot encode ``seq``, padding it with 'N' up to ``max_len``.

    When the sequence is shorter than ``max_len`` a left pad is drawn with
    ``random.randint(0, left_pad_max)``, capped at the number of missing
    positions, and the remainder is padded on the right. Sequences of
    length ``max_len`` or more are encoded as they are (no truncation).

    Returns an int8 array with the two axes of the encoding swapped
    (channels first).
    """
    text = str(seq)
    shortfall = max_len - len(seq)
    if shortfall > 0:
        left = min(random.randint(0, left_pad_max), shortfall)
        text = "N" * left + text + "N" * (shortfall - left)

    one_hot = get_1h_seq(text)
    return np.array(one_hot, dtype=np.int8).swapaxes(0, 1)


In [10]:
# Table with one row per switch: running index, switch id, transcribed sequence.
keys = list(transcribed_sequences)
seqs = list(transcribed_sequences.values())
indexes = list(range(len(keys)))

seqs_data = pd.DataFrame(
    [indexes, keys, seqs],
    index=['switch_index', 'switch_id', 'sequence'],
).T
seqs_data

Unnamed: 0,switch_index,switch_id,sequence
0,0,chr12:AACS:125140928:125143316:+,GUGAGGCGGGACAAACUUGUCUUCCUCACACCCAUCUUACUUCCUC...
1,1,chr4:AADAT:170060271:170060673:-,UGCAUGAAUAAUGUAUCACAACCUACUGAGCUGCGUAAAUUGUCAU...
2,2,chr2:AAK1:69457997:69461526:-,UUGUAUUCUUAGUAGAGAUGGGGUUUCACCAUGUUGGCCAGGUUGG...
3,3,chr2:AAK1:69457997:69464011:-,CGAAGAAUCAAGAUUAGUCCCUCCACUAAAUCAAGAUUUAGUACUC...
4,4,chr2:AAK1:69457997:69465314:-,CUCGGCCUCUCCUUAUGGAGUCUGAAGAAGAAGAUGAGAGCUGCAG...
...,...,...,...
109090,109090,chr19:ZNF460:57298458:57305152:+,AGAAUAGCUAGGACUACUGGUAUGCGCUAUCAUGACCAGCUAAUCU...
109091,109091,chr19:ZNF460:57304422:57305152:+,ACCAGAGUUUUUGUUAUUACUUUGACAUAUUCUGGUGCUUUCCUUU...
109092,109092,chr19:ZNF460:57305152:57306833:+,GUGAUCCGCCCACCUCAGCCUCCCAAAGUGCUGGGAUUACAGGCGU...
109093,109093,chr19:ZNF460:57305152:57308529:+,GUGAUCCGCCCACCUCAGCCUCCCAAAGUGCUGGGAUUACAGGCGU...


In [11]:
# Persist the C9ALS sequence table.
seqs_data.to_csv(
    path_or_buf='/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/C9ALS/All_sequences.csv'
)

In [11]:
import scipy.stats as ss

def rank_to_normal(rank, c, n):
    """Map a rank to a standard-normal quantile.

    Args:
        rank: rank of the observation (as produced by ``ss.rankdata``).
        c:    offset constant subtracted from the rank.
        n:    number of ranked observations.

    Returns:
        ``ss.norm.ppf((rank - c) / (n - 2*c + 1))``.
    """
    # Standard quantile function
    x = (rank - c) / (n - 2*c + 1)
    return ss.norm.ppf(x)

def rank_INT(series, c=3.0/8, stochastic=True):
    """ Perform rank-based inverse normal transformation on pandas series.
        If stochastic is True ties are given rank randomly, otherwise ties will
        share the same value. NaN values are ignored: they are left out of the
        ranking and come back as NaN at their original positions.

        The random tie-breaking uses a private generator seeded with 123, so
        results are reproducible and the global NumPy random state is left
        untouched.

        Args:
            param1 (pandas.Series):   Series of values to transform; its index
                                      labels must be unique
            param2 (Optional[float]): Constant parameter (Blom's constant)
            param3 (Optional[bool]):  Whether to randomise rank of ties

        Returns:
            pandas.Series aligned to the index of the input series
    """
    # Take original series indexes
    orig_idx = series.index

    # Drop NaNs
    series = series.loc[~pd.isnull(series)]

    # Get ranks
    if stochastic:
        # Local generator: same permutation as seeding the global RNG with
        # 123, without reseeding it for the rest of the session.
        rng = np.random.RandomState(123)
        # Shuffle by index
        series = series.loc[rng.permutation(series.index)]
        # Get rank, ties are determined by their position in the series (hence
        # why we randomised the series)
        rank = ss.rankdata(series, method="ordinal")
    else:
        # Get rank, ties are averaged
        rank = ss.rankdata(series, method="average")

    # Convert numpy array back to series
    rank = pd.Series(rank, index=series.index)

    # Convert rank to normal distribution
    transformed = rank.apply(rank_to_normal, c=c, n=len(rank))

    # reindex (rather than label lookup) restores the original order and
    # yields NaN for the labels that were dropped above instead of raising
    # KeyError.
    return transformed.reindex(orig_idx)

def get_sig_lfc(df, name):
    """Return the 'LFC_rand_INT' value of the switch called ``name``.

    Args:
        df:   DataFrame with 'switch_name' and 'LFC_rand_INT' columns.
        name: switch name to look up.

    Returns:
        The matching value as a Python float rounded to 4 decimals.

    Raises:
        TypeError: if ``name`` matches zero rows or more than one row. This
            is the exception ``float()`` raised for such a selection before.
    """
    matches = df.loc[df['switch_name'] == name, 'LFC_rand_INT']
    if len(matches) != 1:
        raise TypeError(
            "expected exactly one row for switch {!r}, found {}".format(
                name, len(matches)
            )
        )
    # Pull the scalar out explicitly: float() on a one-element Series is
    # deprecated in recent pandas releases.
    return round(float(matches.iloc[0]), 4)
    
def get_sig_mult(val):
    """Return 1 when ``val`` is strictly greater than 1.3, otherwise 0."""
    return 1 if val > 1.3 else 0
## Build labels[switch_id][celltype] = rank-normalised LFC for C9ALS.
## Each results table is walked once; the previous per-row scan of the whole
## table is what made this cell slow.
labels = {key: {} for key in sequences_dict}
# Dedicated loop variable so the `ct` list of directories is not overwritten.
for celltype in celltypes:
    print(celltype)
    df_name = data_root + celltype + "_C9ALSvsCTRL/APAlog_res_metadata_added.tsv"
    inp_df = pd.read_csv(df_name, sep='\t')
    ## get significant APA switches (LFC of non-significant ones becomes 0)
    inp_df['sig_multiplyer'] = inp_df['negative_logFDR'].apply(get_sig_mult)
    inp_df['sig_LFC_PA_Usage'] = inp_df['sig_multiplyer'] * inp_df['LFC_PA_Usage']
    ## remove NaNs; copy so the column added next lands on an independent frame
    inp_df = inp_df[~pd.isnull(inp_df['sig_LFC_PA_Usage'])].copy()
    inp_df['LFC_rand_INT'] = rank_INT(inp_df['sig_LFC_PA_Usage'])
    name_counts = inp_df['switch_name'].value_counts().to_dict()
    for seq_id, lfc in zip(inp_df['switch_name'], inp_df['LFC_rand_INT']):
        # Explicit checks replace the old bare `except: continue`, which also
        # swallowed KeyboardInterrupt. The same rows are skipped as before:
        # switches without a sequence and switch names occurring more than once.
        if seq_id not in labels or name_counts.get(seq_id) != 1:
            continue
        labels[seq_id][celltype] = round(float(lfc), 4)

In [13]:
# Flatten the nested labels dict into three parallel lists, one entry per
# (switch, celltype) pair, ready to be turned into a DataFrame.
seq_ids = list(labels.keys())
seq_ids_all = []
celltypes_ids = []
PA_vals = []
for seq_id in seq_ids:
    for ct, lfc in labels[seq_id].items():
        seq_ids_all.append(seq_id)
        celltypes_ids.append(ct)
        PA_vals.append(lfc)
print(len(seq_ids), len(celltypes_ids), len(PA_vals))


109095 317572 317572


In [14]:
# The three parallel lists must have the same length.
print(*(len(column) for column in (seq_ids_all, celltypes_ids, PA_vals)))

317572 317572 317572


In [15]:
# Lookup from switch id to its row index in seqs_data.
seq_id_idx = {
    switch_id: switch_index
    for switch_id, switch_index in zip(seqs_data['switch_id'], seqs_data['switch_index'])
}
seq_id_idx

{'chr12:AACS:125140928:125143316:+': 0,
 'chr4:AADAT:170060271:170060673:-': 1,
 'chr2:AAK1:69457997:69461526:-': 2,
 'chr2:AAK1:69457997:69464011:-': 3,
 'chr2:AAK1:69457997:69465314:-': 4,
 'chr2:AAK1:69457997:69467864:-': 5,
 'chr2:AAK1:69457997:69472042:-': 6,
 'chr2:AAK1:69461526:69464011:-': 7,
 'chr2:AAK1:69461526:69465314:-': 8,
 'chr2:AAK1:69461526:69467864:-': 9,
 'chr2:AAK1:69461526:69472042:-': 10,
 'chr2:AAK1:69461526:69474509:-': 11,
 'chr2:AAK1:69464011:69465314:-': 12,
 'chr2:AAK1:69464011:69467864:-': 13,
 'chr2:AAK1:69464011:69472042:-': 14,
 'chr2:AAK1:69464011:69474509:-': 15,
 'chr2:AAK1:69465314:69467864:-': 16,
 'chr2:AAK1:69465314:69472042:-': 17,
 'chr2:AAK1:69465314:69474509:-': 18,
 'chr2:AAK1:69467864:69472042:-': 19,
 'chr2:AAK1:69467864:69474509:-': 20,
 'chr2:AAK1:69472042:69474509:-': 21,
 'chr6:AARS2:44299560:44300312:-': 22,
 'chr11:AASDHPPT:106097048:106098594:+': 23,
 'chr11:AASDHPPT:106097048:106098695:+': 24,
 'chr17:AATK:81110484:81117295:-': 25,


In [16]:
# Sequence index for every (switch, celltype) label row.
seq_idx = list(map(seq_id_idx.__getitem__, seq_ids_all))

In [17]:
# Long-format label table: one row per (switch, celltype) pair.
label_data = pd.DataFrame(
    [seq_ids_all, seq_idx, celltypes_ids, PA_vals],
    index=['switch_id', 'switch_inx', 'celltype', 'APA_lfc'],
).T
label_data

Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc
0,chr12:AACS:125140928:125143316:+,0,AST-PP,0.3208
1,chr12:AACS:125140928:125143316:+,0,Astrocytes,-0.2384
2,chr12:AACS:125140928:125143316:+,0,Excitatory,-1.2717
3,chr12:AACS:125140928:125143316:+,0,IN-SST,0.4777
4,chr12:AACS:125140928:125143316:+,0,Inhibitory,0.3739
...,...,...,...,...
317567,chr19:ZNF460:57298458:57305152:+,109090,AST-FB,-0.3944
317568,chr19:ZNF460:57304422:57305152:+,109091,AST-FB,0.742
317569,chr19:ZNF460:57305152:57306833:+,109092,AST-FB,-1.1709
317570,chr19:ZNF460:57305152:57308529:+,109093,AST-FB,-1.069


In [18]:
# Persist the C9ALS label table.
label_data.to_csv(
    path_or_buf='/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/C9ALS/All_labels.csv'
)

In [8]:
import pandas as pd
import numpy as np
import random
# Reload the C9ALS labels written above; the first CSV column is the saved row index.
label_data = pd.read_csv(
    filepath_or_buffer='/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/C9ALS/All_labels.csv',
    index_col=0,
)

In [5]:
# Let's add an APA-switch column by thresholding the APA LFC: above 0.25, below -0.25, or neither

def get_sig_lfcs(val):
    """Encode the direction of an LFC value as a 2-element float32 array.

    [1, 0] when the LFC is greater than 0.25,
    [0, 1] when the LFC is less than -0.25,
    [0, 0] otherwise (including NaN).
    """
    # The two conditions are mutually exclusive, so at most one slot is 1.
    is_up = float(val > 0.25)
    is_down = float(val < -0.25)
    return np.array([is_up, is_down], dtype=np.float32)

# One direction vector per label row, derived element-wise from APA_lfc.
label_data['sig_lfcs'] = label_data['APA_lfc'].map(get_sig_lfcs)


In [6]:
# Show the label table with the new 'sig_lfcs' column.
label_data

Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc,sig_lfcs
0,chr12:AACS:125140928:125143316:+,0,AST-PP,0.3208,"[1.0, 0.0]"
1,chr12:AACS:125140928:125143316:+,0,Astrocytes,-0.2384,"[0.0, 0.0]"
2,chr12:AACS:125140928:125143316:+,0,Excitatory,-1.2717,"[0.0, 1.0]"
3,chr12:AACS:125140928:125143316:+,0,IN-SST,0.4777,"[1.0, 0.0]"
4,chr12:AACS:125140928:125143316:+,0,Inhibitory,0.3739,"[1.0, 0.0]"
...,...,...,...,...,...
317567,chr19:ZNF460:57298458:57305152:+,109090,AST-FB,-0.3944,"[0.0, 1.0]"
317568,chr19:ZNF460:57304422:57305152:+,109091,AST-FB,0.7420,"[1.0, 0.0]"
317569,chr19:ZNF460:57305152:57306833:+,109092,AST-FB,-1.1709,"[0.0, 1.0]"
317570,chr19:ZNF460:57305152:57308529:+,109093,AST-FB,-1.0690,"[0.0, 1.0]"


In [12]:
# Add a celltype index column: each distinct celltype gets its position in
# the sorted list of celltype names.
celltype_dict = {
    name: position
    for position, name in enumerate(sorted(label_data['celltype'].unique()))
}
label_data['celltype_idx'] = label_data['celltype'].map(celltype_dict)
label_data

Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc,sig_lfcs,celltype_idx
0,chr12:AACS:125140928:125143316:+,0,AST-PP,0.3208,"[1.0, 0.0]",1
1,chr12:AACS:125140928:125143316:+,0,Astrocytes,-0.2384,"[0.0, 0.0]",2
2,chr12:AACS:125140928:125143316:+,0,Excitatory,-1.2717,"[0.0, 1.0]",3
3,chr12:AACS:125140928:125143316:+,0,IN-SST,0.4777,"[1.0, 0.0]",5
4,chr12:AACS:125140928:125143316:+,0,Inhibitory,0.3739,"[1.0, 0.0]",7
...,...,...,...,...,...,...
317567,chr19:ZNF460:57298458:57305152:+,109090,AST-FB,-0.3944,"[0.0, 1.0]",0
317568,chr19:ZNF460:57304422:57305152:+,109091,AST-FB,0.7420,"[1.0, 0.0]",0
317569,chr19:ZNF460:57305152:57306833:+,109092,AST-FB,-1.1709,"[0.0, 1.0]",0
317570,chr19:ZNF460:57305152:57308529:+,109093,AST-FB,-1.0690,"[0.0, 1.0]",0


In [13]:
# Overwrite the C9ALS label file with the extended table.
label_data.to_csv(
    path_or_buf='/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/C9ALS/All_labels.csv'
)

# Let's repeat everything for the sALS data and call it done

In [13]:
# Keep only the directory entries whose name contains '_sALS'.
ct = [entry for entry in os.listdir(data_root) if '_sALS' in entry]
ct

['Astrocytes_sALSvsCTRL',
 'Inhibitory_sALSvsCTRL',
 'Excitatory_sALSvsCTRL',
 'L2-3_sALSvsCTRL',
 'AST-FB_sALSvsCTRL',
 'L4_sALSvsCTRL',
 'IN-VIP_sALSvsCTRL',
 'AST-PP_sALSvsCTRL',
 'L5-6-CC_sALSvsCTRL',
 'Oligodendrocytes_sALSvsCTRL',
 'IN-SST_sALSvsCTRL',
 'OPC_sALSvsCTRL',
 'Microglia_sALSvsCTRL',
 'IN-PV_sALSvsCTRL']

In [15]:
# Same preprocessing as for C9ALS, applied to the sALS comparisons.

# Celltype name = text before the first '_' of each directory name.
celltypes = sorted(e.split('_')[0] for e in ct)

# One DNA sequence per switch id; the first FASTA record seen for an id wins.
sequences_dict = {}
for ct_cn in ct:
    inp_fa = data_root + "/{}/switch_DNA_sequence.fa".format(ct_cn)
    inp_fa = SeqIO.parse(inp_fa, "fasta")
    for rec in inp_fa:
        if rec.id not in sequences_dict:
            sequences_dict[rec.id] = str(rec.seq)
print(len(sequences_dict))

# Transcribe according to the strand stored as the last ':'-separated field
# of the switch id.
transcribed_sequences = {}
for key, value in sequences_dict.items():
    strand = key.split(':')[-1]
    if strand == '+':
        transcribed_sequences[key] = transcribe_positive_strand(value)
    else:
        transcribed_sequences[key] = transcribe_negative_strand(value)

# Sequence table: running index, switch id, transcribed sequence.
keys = list(transcribed_sequences.keys())
seqs = [transcribed_sequences[k] for k in keys]
indexes = list(range(len(keys)))

seqs_data = pd.DataFrame([indexes, keys, seqs]).T
seqs_data.columns = ['switch_index', 'switch_id', 'sequence']
print(seqs_data.shape)
seqs_data.to_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/sALS/All_sequences.csv')


## Build labels[switch_id][celltype] = rank-normalised LFC for sALS.
## Each results table is walked once; the previous per-row scan of the whole
## table is what made this step slow.
labels = {key: {} for key in sequences_dict}
# Dedicated loop variable so the `ct` list of directories is not overwritten.
for celltype in celltypes:
    print(celltype)
    df_name = data_root + celltype + "_sALSvsCTRL/APAlog_res_metadata_added.tsv"
    inp_df = pd.read_csv(df_name, sep='\t')
    ## get significant APA switches (LFC of non-significant ones becomes 0)
    inp_df['sig_multiplyer'] = inp_df['negative_logFDR'].apply(get_sig_mult)
    inp_df['sig_LFC_PA_Usage'] = inp_df['sig_multiplyer'] * inp_df['LFC_PA_Usage']
    ## remove NaNs; copy so the column added next lands on an independent frame
    inp_df = inp_df[~pd.isnull(inp_df['sig_LFC_PA_Usage'])].copy()
    inp_df['LFC_rand_INT'] = rank_INT(inp_df['sig_LFC_PA_Usage'])
    name_counts = inp_df['switch_name'].value_counts().to_dict()
    for seq_id, lfc in zip(inp_df['switch_name'], inp_df['LFC_rand_INT']):
        # Explicit checks replace the old bare `except: continue`, which also
        # swallowed KeyboardInterrupt. The same rows are skipped as before:
        # switches without a sequence and switch names occurring more than once.
        if seq_id not in labels or name_counts.get(seq_id) != 1:
            continue
        labels[seq_id][celltype] = round(float(lfc), 4)

# get the pd dataframe from labels: flatten into three parallel lists
seq_ids = list(labels.keys())
seq_ids_all = []
celltypes_ids = []
PA_vals = []
for seq_id in seq_ids:
    for label_celltype, lfc in labels[seq_id].items():
        celltypes_ids.append(label_celltype)
        PA_vals.append(lfc)
        seq_ids_all.append(seq_id)
print(len(seq_ids), len(celltypes_ids), len(PA_vals))
print(len(seq_ids_all), len(celltypes_ids), len(PA_vals))

seq_id_idx = dict(zip(seqs_data['switch_id'], seqs_data['switch_index']))
seq_idx = [seq_id_idx[seq_id] for seq_id in seq_ids_all]
label_data = pd.DataFrame([seq_ids_all,seq_idx,celltypes_ids, PA_vals]).T
label_data.columns = ['switch_id','switch_inx','celltype','APA_lfc']
label_data.to_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/sALS/All_labels.csv')

108197
(108197, 3)
AST-FB
AST-PP
Astrocytes
Excitatory
IN-PV
IN-SST
IN-VIP
Inhibitory
L2-3
L4
L5-6-CC
Microglia
OPC
Oligodendrocytes
108197 280970 280970
280970 280970 280970


In [14]:
# Reload the sALS labels, add the direction vector and the celltype index,
# then write the extended table back to the same file.
label_data = pd.read_csv(
    filepath_or_buffer='/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/sALS/All_labels.csv',
    index_col=0,
)
label_data['sig_lfcs'] = label_data['APA_lfc'].map(get_sig_lfcs)
# Each distinct celltype gets its position in the sorted list of names.
celltype_dict = {
    name: position
    for position, name in enumerate(sorted(label_data['celltype'].unique()))
}
label_data['celltype_idx'] = label_data['celltype'].map(celltype_dict)
label_data.to_csv(
    path_or_buf='/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/data_for_DL/sALS/All_labels.csv'
)


In [15]:
# Show the final sALS label table.
label_data

Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc,sig_lfcs,celltype_idx
0,chr12:A2ML1:8875813:8876783:+,0,AST-PP,0.2438,"[0.0, 0.0]",1
1,chr12:A2ML1:8875813:8876783:+,0,Astrocytes,0.1231,"[0.0, 0.0]",2
2,chr12:A2ML1:8875813:8877345:+,1,AST-PP,0.7859,"[1.0, 0.0]",1
3,chr12:A2ML1:8875813:8877345:+,1,Astrocytes,1.0521,"[1.0, 0.0]",2
4,chr12:A2ML1:8875813:8879773:+,2,AST-PP,1.8421,"[1.0, 0.0]",1
...,...,...,...,...,...,...
280965,chr19:ZNF587:57865117:57865809:+,108192,IN-PV,0.2524,"[1.0, 0.0]",4
280966,chr19:ZNF708:21282804:21291160:-,108193,IN-PV,-0.5841,"[0.0, 1.0]",4
280967,chr19:ZNF708:21282804:21292760:-,108194,IN-PV,0.8849,"[1.0, 0.0]",4
280968,chr10:ZRANB1:124985288:124986464:+,108195,IN-PV,-0.9328,"[0.0, 1.0]",4
