In [3]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
import pickle
import random
from multiprocessing import Pool
import scipy.stats as ss

In [4]:

# Root folder of the Mapper outputs: it holds one "<celltype>_<condition>vsCTRL"
# sub-folder per comparison plus intermediate files. Listing it shows what is available.
data_root = '/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/'
os.listdir(data_root)

['get_fasta_for_PAs_neighbour_region_ref.sh',
 'apa_type_dist_allCTs.csv',
 'Astrocytes_sALSvsCTRL',
 'IN-SST_C9ALSvsCTRL',
 'L4_C9ALSvsCTRL',
 'Inhibitory_sALSvsCTRL',
 'L5-6-CC_C9ALSvsCTRL',
 'Excitatory_sALSvsCTRL',
 'L5-6_C9ALSvsCTRL',
 'Excitatory_C9ALSvsCTRL',
 'Astrocytes_C9ALSvsCTRL',
 'L2-3_sALSvsCTRL',
 'AST-FB_sALSvsCTRL',
 'Oligodendrocytes_C9ALSvsCTRL',
 'get_fasta_for_switches.sh',
 'sALS_ALL_training_test_data.pkl',
 'results',
 'train_data.npy',
 'V2',
 'OPC_C9ALSvsCTRL',
 'all_seqs_celltypes_input',
 'valid_data.npy',
 'data_for_DL',
 'REDU_plots',
 'tst_train_data.npy',
 'IN-VIP_C9ALSvsCTRL',
 'L4_sALSvsCTRL',
 'AST-PP_C9ALSvsCTRL',
 'IN-VIP_sALSvsCTRL',
 'AST-PP_sALSvsCTRL',
 'L5-6-CC_sALSvsCTRL',
 'all_seqs_celltypes_input.pkl',
 'IN-PV_C9ALSvsCTRL',
 'c9als_all_seqs_celltypes_input.pkl',
 'Oligodendrocytes_sALSvsCTRL',
 'get_fasta_for_PAs_neighbour_region_alt.sh',
 'IN-SST_sALSvsCTRL',
 'OPC_sALSvsCTRL',
 'L2-3_C9ALSvsCTRL',
 'test_data.npy',
 'APA_usage_volcano_pl

In [5]:
# functions block

def transcribe_positive_strand(seq):
    """Transcribe a plus-strand sequence into RNA.

    The input is already the 5' to 3' coding sequence, so the RNA is the
    same sequence with every upper-case 'T' replaced by 'U'. All other
    characters are returned unchanged.
    """
    rna = seq.replace('T', 'U')
    return rna

def transcribe_negative_strand(seq):
    """Transcribe a minus-strand gene from its 5' to 3' template sequence.

    Walks the input from its last base to its first and emits the RNA
    complement of each base (A->U, C->G, G->C, T->A). Any other character
    (e.g. 'N' or lower-case letters) is copied through unchanged.
    """
    pairing = dict(zip('ACGT', 'UGCA'))
    rna = []
    for base in reversed(seq):
        rna.append(pairing.get(base, base))
    return "".join(rna)

def get_1h_seq(seq):
    """
    One-hot encode an RNA sequence.
    --------------
    Arguments:
    seq: RNA sequence string over the alphabet A, C, G, U, N

    Returns a numpy array with one row per nucleotide and four columns in
    the order A, C, G, U; 'N' is encoded as an all-zero row. Any other
    character raises KeyError.
    """
    channels = "ACGU"
    # 'N' has no channel of its own: it maps to the all-zero vector.
    nt_code = {"N": [0, 0, 0, 0]}
    for position, base in enumerate(channels):
        nt_code[base] = [int(position == k) for k in range(len(channels))]

    encoded = []
    for nt in seq:
        encoded.append(nt_code[nt])
    return np.array(encoded)

def makes_seqs_ready(seq,max_len,left_pad_max):
    """
    One-hot encode a sequence, padding it with 'N' up to max_len.

    Sequences shorter than max_len get a random left pad of between 0 and
    left_pad_max 'N's (never more than the missing length); the rest of the
    missing length is padded on the right. Sequences of length max_len or
    longer are encoded as they are -- NOTE: they are not truncated.

    Returns an int8 array with the two axes of the encoding swapped, i.e.
    channels first, positions second.
    """
    seq_len = len(seq)
    if seq_len >= max_len:
        encoded = get_1h_seq(str(seq))
    else:
        missing = max_len - seq_len
        # the random draw happens only when padding is actually needed
        left = min(random.randint(0, left_pad_max), missing)
        right = missing - left
        encoded = get_1h_seq("N" * left + str(seq) + "N" * right)

    return np.array(encoded, dtype=np.int8).swapaxes(0, 1)


def rank_to_normal(rank, c, n):
    """Map a rank to a standard-normal quantile.

    The rank is first turned into the fraction (rank - c) / (n - 2*c + 1),
    which is then passed through the inverse normal CDF.
    """
    denominator = n - 2*c + 1
    quantile = (rank - c) / denominator
    return ss.norm.ppf(quantile)

def rank_INT(series, c=3.0/8, stochastic=True):
    """ Perform rank-based inverse normal transformation on a pandas Series.

        If stochastic is True, ties are broken randomly (the Series is shuffled
        with a fixed seed and then ranked ordinally); otherwise tied values
        share their average rank. NaN values are left out of the ranking and
        come back as NaN in the result.

        Args:
            series (pandas.Series):      Values to transform. Index labels are
                                         assumed to be unique.
            c (Optional[float]):         Constant parameter (Blom's constant,
                                         3/8 by default).
            stochastic (Optional[bool]): Whether to randomise the rank of ties.

        Returns:
            pandas.Series: transformed values, aligned to the index of the
            input Series.
    """

    # Fixed seed so the random tie-breaking is repeatable.
    # NOTE: this reseeds NumPy's *global* generator on every call.
    np.random.seed(123)

    # Remember the original index so the result can be aligned to it
    orig_idx = series.index

    # Drop NaNs
    series = series.loc[~pd.isnull(series)]

    # Get ranks
    if stochastic:
        # Shuffle by index: ordinal ranking breaks ties by position, so a
        # random order gives randomly ranked ties
        series = series.loc[np.random.permutation(series.index)]
        rank = ss.rankdata(series, method="ordinal")
    else:
        # Ties share their average rank
        rank = ss.rankdata(series, method="average")

    # Convert all ranks to normal quantiles in one vectorised call
    transformed = pd.Series(
        rank_to_normal(rank, c=c, n=len(rank)), index=series.index
    )

    # reindex (rather than label indexing) so that labels dropped as NaN come
    # back as NaN instead of raising KeyError
    return transformed.reindex(orig_idx)

def get_sig_lfc(df, name):
    """Return the rank-INT transformed LFC of one switch, rounded to 4 decimals.

    Args:
        df:   DataFrame with 'switch_name' and 'LFC_rand_INT' columns.
        name: switch name to look up.

    Raises:
        TypeError: if `name` does not match exactly one row. TypeError is kept
            because that is what converting a non-single-element Series with
            float() raised in the previous implementation.
    """
    matches = df.loc[df['switch_name'] == name, 'LFC_rand_INT']
    if len(matches) != 1:
        raise TypeError(
            "expected exactly one row for switch {!r}, found {}".format(name, len(matches))
        )
    # Pull the scalar out explicitly: calling float() on a Series is deprecated.
    lfc = round(float(matches.iloc[0]), 4)
    return lfc
    
def get_sig_mult(val):
    """Binary flag: 1 when val is strictly greater than 1.3, otherwise 0."""
    return 1 if val > 1.3 else 0

In [6]:
# Split the entries of data_root by condition: names containing '_C9ALS' vs '_sALS'.
c9ALS_ct = list(filter(lambda entry: '_C9ALS' in entry, os.listdir(data_root)))
sALS_ct = list(filter(lambda entry: '_sALS' in entry, os.listdir(data_root)))
print(len(c9ALS_ct), len(sALS_ct))
# peek at a couple of entries from each list
for comparison_dirs in (c9ALS_ct, sALS_ct):
    print(comparison_dirs[1:3])

15 14
['L4_C9ALSvsCTRL', 'L5-6-CC_C9ALSvsCTRL']
['Inhibitory_sALSvsCTRL', 'Excitatory_sALSvsCTRL']


In [31]:
# Collect the C9ALS sequences: for every comparison folder, read the "alt" and
# "ref" neighbour-region fasta files in parallel and store alt + ref concatenated.
c9_sequences_dict = {}
fasta_templates = (
    "/{}/PAs_neighbour_region_alt_sequence.fa",
    "/{}/PAs_neighbour_region_ref_sequence.fa",
)
for ct_cn in c9ALS_ct:
    inp_fa_1, inp_fa_2 = [
        SeqIO.parse(data_root + template.format(ct_cn), "fasta")
        for template in fasta_templates
    ]
    # records are paired by position; the first sequence seen for an id is kept
    for rec1, rec2 in zip(inp_fa_1, inp_fa_2):
        c9_sequences_dict.setdefault(rec1.id, str(rec1.seq) + str(rec2.seq))

print(len(c9_sequences_dict))

182739


In [29]:
# Collect the sALS sequences: for every comparison folder, read the "alt" and
# "ref" neighbour-region fasta files in parallel and store alt + ref concatenated.
sALS_sequences_dict = {}
fasta_templates = (
    "/{}/PAs_neighbour_region_alt_sequence.fa",
    "/{}/PAs_neighbour_region_ref_sequence.fa",
)
for ct_cn in sALS_ct:
    inp_fa_1, inp_fa_2 = [
        SeqIO.parse(data_root + template.format(ct_cn), "fasta")
        for template in fasta_templates
    ]
    # records are paired by position; the first sequence seen for an id is kept
    for rec1, rec2 in zip(inp_fa_1, inp_fa_2):
        sALS_sequences_dict.setdefault(rec1.id, str(rec1.seq) + str(rec2.seq))

print(len(sALS_sequences_dict))

183581


In [33]:
# Cell-type name = comparison folder name up to its first underscore, sorted alphabetically.
c9_celltypes = sorted(e.partition('_')[0] for e in c9ALS_ct)
# (the spelling `sALS_celltyes` is kept: the sALS label cell refers to this name)
sALS_celltyes = sorted(e.partition('_')[0] for e in sALS_ct)

In [34]:
# Transcribe every sequence into RNA. The strand is the last ':'-separated
# field of the switch id: '+' uses the plus-strand rule, anything else the
# minus-strand rule.
# C9ALS
c9ALS_transcribed_sequences = {
    key: (
        transcribe_positive_strand(value)
        if key.split(':')[-1] == '+'
        else transcribe_negative_strand(value)
    )
    for key, value in c9_sequences_dict.items()
}
# sALS
sALS_transcribed_sequences = {
    key: (
        transcribe_positive_strand(value)
        if key.split(':')[-1] == '+'
        else transcribe_negative_strand(value)
    )
    for key, value in sALS_sequences_dict.items()
}

In [8]:
def _sequence_table(transcribed):
    """Build the (switch_index, switch_id, sequence) table for one id -> sequence dict."""
    switch_ids = list(transcribed)
    rows = [
        list(range(len(switch_ids))),
        switch_ids,
        [transcribed[switch_id] for switch_id in switch_ids],
    ]
    # one list per field, transposed so that each field becomes a column
    return pd.DataFrame(rows, index=['switch_index', 'switch_id', 'sequence']).T

# c9ALS
c9ALS_seqs_data = _sequence_table(c9ALS_transcribed_sequences)
c9ALS_seqs_data.to_csv(data_root + 'V2/C9ALS_All_sequences.csv')

# sALS
sALS_seqs_data = _sequence_table(sALS_transcribed_sequences)
sALS_seqs_data.to_csv(data_root + 'V2/sALS_All_sequences.csv')

In [None]:
# Display the C9ALS sequence table (switch_index, switch_id, sequence).
c9ALS_seqs_data

In [17]:
## Build the C9ALS labels: for every switch that has a sequence, the rank-INT
## transformed LFC in each cell type where the switch has a non-NaN LFC.
c9ALS_labels = {key: {} for key in c9_sequences_dict.keys()}
for ct in c9_celltypes:
    print(ct)
    df_name = data_root + ct + "_C9ALSvsCTRL/APAlog_res_metadata_added_adj_pval.tsv"
    inp_df = pd.read_csv(df_name, sep='\t')
    ###
    ## get significant APA switches
    # .copy() so the new column is added to an independent frame, not to a slice
    inp_df = inp_df[~pd.isnull(inp_df['sig_LFC_PA_Usage_2'])].copy() ## remove NaNs
    inp_df['LFC_rand_INT'] = rank_INT(inp_df['sig_LFC_PA_Usage_2'])
    # Iterate over the two columns directly instead of row-by-row iloc lookups.
    # Switches without a sequence are skipped explicitly; any other problem
    # (e.g. a missing column) now raises instead of being silently ignored.
    for seq_id, lfc in zip(inp_df['switch_name'], inp_df['LFC_rand_INT']):
        if seq_id in c9ALS_labels:
            c9ALS_labels[seq_id][ct] = lfc
# flatten the nested labels into three parallel lists (one entry per switch / cell-type pair)
c9_seq_ids = list(c9ALS_labels.keys())
seq_ids_all = []
celltypes_ids = []
PA_vals = []
for seq_id in c9_seq_ids:
    for ct, lfc in c9ALS_labels[seq_id].items():
        celltypes_ids.append(ct)
        PA_vals.append(lfc)
        seq_ids_all.append(seq_id)

AST-FB
AST-PP
Astrocytes
Excitatory
IN-PV
IN-SST
IN-VIP
Inhibitory
L2-3
L4
L5-6
L5-6-CC
Microglia
OPC
Oligodendrocytes


In [19]:
print(len(c9_seq_ids), len(celltypes_ids), len(PA_vals))
# switch_id -> switch_index, taken from the C9ALS sequence table
seq_id_idx = {
    switch_id: switch_index
    for switch_id, switch_index in zip(
        c9ALS_seqs_data['switch_id'], c9ALS_seqs_data['switch_index']
    )
}
seq_idx = list(map(seq_id_idx.__getitem__, seq_ids_all))
# one list per field, transposed so that each field becomes a column
label_fields = [seq_ids_all, seq_idx, celltypes_ids, PA_vals]
c9_label_data = pd.DataFrame(label_fields).T
c9_label_data.columns = ['switch_id','switch_inx','celltype','APA_lfc']
c9_label_data.to_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_All_labels.csv')
c9_label_data

109095 317572 317572


Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc
0,chr12:AACS:125140928:125143316:+,0,AST-PP,0.358851
1,chr12:AACS:125140928:125143316:+,0,Astrocytes,-0.282396
2,chr12:AACS:125140928:125143316:+,0,Excitatory,-1.273854
3,chr12:AACS:125140928:125143316:+,0,IN-SST,0.478505
4,chr12:AACS:125140928:125143316:+,0,Inhibitory,0.382285
...,...,...,...,...
317567,chr19:ZNF460:57298458:57305152:+,109090,AST-FB,-0.392558
317568,chr19:ZNF460:57304422:57305152:+,109091,AST-FB,0.758601
317569,chr19:ZNF460:57305152:57306833:+,109092,AST-FB,-1.196236
317570,chr19:ZNF460:57305152:57308529:+,109093,AST-FB,-1.108408


In [20]:
## Build the sALS labels: for every switch that has a sequence, the rank-INT
## transformed LFC in each cell type where the switch has a non-NaN LFC.
sALS_labels = {key: {} for key in sALS_sequences_dict.keys()}
for ct in sALS_celltyes:
    print(ct)
    df_name = data_root + ct + "_sALSvsCTRL/APAlog_res_metadata_added_adj_pval.tsv"
    inp_df = pd.read_csv(df_name, sep='\t')
    ###
    ## get significant APA switches
    # .copy() so the new column is added to an independent frame, not to a slice
    inp_df = inp_df[~pd.isnull(inp_df['sig_LFC_PA_Usage_2'])].copy() ## remove NaNs
    inp_df['LFC_rand_INT'] = rank_INT(inp_df['sig_LFC_PA_Usage_2'])
    # Iterate over the two columns directly instead of row-by-row iloc lookups.
    # Switches without a sequence are skipped explicitly; any other problem
    # (e.g. a missing column) now raises instead of being silently ignored.
    for seq_id, lfc in zip(inp_df['switch_name'], inp_df['LFC_rand_INT']):
        if seq_id in sALS_labels:
            sALS_labels[seq_id][ct] = lfc
# flatten the nested labels into three parallel lists (one entry per switch / cell-type pair)
sALS_seq_ids = list(sALS_labels.keys())
seq_ids_all = []
celltypes_ids = []
PA_vals = []
for seq_id in sALS_seq_ids:
    for ct, lfc in sALS_labels[seq_id].items():
        celltypes_ids.append(ct)
        PA_vals.append(lfc)
        seq_ids_all.append(seq_id)
print(len(sALS_seq_ids), len(celltypes_ids), len(PA_vals))
# switch_id -> switch_index, taken from the sALS sequence table
seq_id_idx = dict(zip(sALS_seqs_data['switch_id'], sALS_seqs_data['switch_index']))
seq_idx = [seq_id_idx[seq_id] for seq_id in seq_ids_all]
# one list per field, transposed so that each field becomes a column
sALS_label_data = pd.DataFrame([seq_ids_all,seq_idx,celltypes_ids, PA_vals]).T
sALS_label_data.columns = ['switch_id','switch_inx','celltype','APA_lfc']
sALS_label_data.to_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_All_labels.csv')
sALS_label_data

AST-FB
AST-PP
Astrocytes
Excitatory
IN-PV
IN-SST
IN-VIP
Inhibitory
L2-3
L4
L5-6-CC
Microglia
OPC
Oligodendrocytes
108197 280970 280970


Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc
0,chr12:A2ML1:8875813:8876783:+,0,AST-PP,0.32252
1,chr12:A2ML1:8875813:8876783:+,0,Astrocytes,0.183424
2,chr12:A2ML1:8875813:8877345:+,1,AST-PP,0.791097
3,chr12:A2ML1:8875813:8877345:+,1,Astrocytes,1.053118
4,chr12:A2ML1:8875813:8879773:+,2,AST-PP,1.842229
...,...,...,...,...
280965,chr19:ZNF587:57865117:57865809:+,108192,IN-PV,0.053053
280966,chr19:ZNF708:21282804:21291160:-,108193,IN-PV,-0.592886
280967,chr19:ZNF708:21282804:21292760:-,108194,IN-PV,0.887524
280968,chr10:ZRANB1:124985288:124986464:+,108195,IN-PV,-1.005287


In [None]:
# check if it's the same as the previous version

In [7]:
# Same path as the data_root set in the first code cell; re-declared here,
# presumably so the cells below can run without re-running the earlier ones.
data_root = '/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/'

In [4]:
# Reload the saved C9ALS label table; index_col=0 reads the first CSV column
# back in as the row index.
c9_als = pd.read_csv('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_All_labels.csv', index_col=0)
c9_als

Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc
0,chr12:AACS:125140928:125143316:+,0,AST-PP,0.358851
1,chr12:AACS:125140928:125143316:+,0,Astrocytes,-0.282396
2,chr12:AACS:125140928:125143316:+,0,Excitatory,-1.273854
3,chr12:AACS:125140928:125143316:+,0,IN-SST,0.478505
4,chr12:AACS:125140928:125143316:+,0,Inhibitory,0.382285
...,...,...,...,...
317567,chr19:ZNF460:57298458:57305152:+,109090,AST-FB,-0.392558
317568,chr19:ZNF460:57304422:57305152:+,109091,AST-FB,0.758601
317569,chr19:ZNF460:57305152:57306833:+,109092,AST-FB,-1.196236
317570,chr19:ZNF460:57305152:57308529:+,109093,AST-FB,-1.108408


In [5]:
# Spot-check: show every cell-type label recorded for one switch.
c9_als[c9_als['switch_id']=='chr2:AAK1:69457997:69461526:-']

Unnamed: 0,switch_id,switch_inx,celltype,APA_lfc
16,chr2:AAK1:69457997:69461526:-,2,AST-PP,1.725703
17,chr2:AAK1:69457997:69461526:-,2,Astrocytes,1.635238
18,chr2:AAK1:69457997:69461526:-,2,Excitatory,-0.11255
19,chr2:AAK1:69457997:69461526:-,2,IN-PV,0.91605
20,chr2:AAK1:69457997:69461526:-,2,IN-SST,0.677908
21,chr2:AAK1:69457997:69461526:-,2,IN-VIP,1.427146
22,chr2:AAK1:69457997:69461526:-,2,Inhibitory,1.070113
23,chr2:AAK1:69457997:69461526:-,2,L2-3,-0.097318
24,chr2:AAK1:69457997:69461526:-,2,L4,0.418467
25,chr2:AAK1:69457997:69461526:-,2,Microglia,1.426603


In [8]:
# Read the C9ALS sequence table (switch_index, switch_id, sequence) back from
# the V2 folder; index_col=0 reads the first CSV column back in as the row index.
c9ALS_seqs_data = pd.read_csv(data_root + 'V2/C9ALS_All_sequences.csv', index_col=0)
c9ALS_seqs_data

Unnamed: 0,switch_index,switch_id,sequence
0,0,chr12:AACS:125140928:125143316:+,GUGAGGCGGGACAAACUUGUCUUCCUCACACCCAUCUUACUUCCUC...
1,1,chr4:AADAT:170060271:170060673:-,UGCAUGAAUAAUGUAUCACAACCUACUGAGCUGCGUAAAUUGUCAU...
2,2,chr2:AAK1:69457997:69461526:-,UUGUAUUCUUAGUAGAGAUGGGGUUUCACCAUGUUGGCCAGGUUGG...
3,3,chr2:AAK1:69457997:69464011:-,CGAAGAAUCAAGAUUAGUCCCUCCACUAAAUCAAGAUUUAGUACUC...
4,4,chr2:AAK1:69457997:69465314:-,CUCGGCCUCUCCUUAUGGAGUCUGAAGAAGAAGAUGAGAGCUGCAG...
...,...,...,...
109090,109090,chr19:ZNF460:57298458:57305152:+,AGAAUAGCUAGGACUACUGGUAUGCGCUAUCAUGACCAGCUAAUCU...
109091,109091,chr19:ZNF460:57304422:57305152:+,ACCAGAGUUUUUGUUAUUACUUUGACAUAUUCUGGUGCUUUCCUUU...
109092,109092,chr19:ZNF460:57305152:57306833:+,GUGAUCCGCCCACCUCAGCCUCCCAAAGUGCUGGGAUUACAGGCGU...
109093,109093,chr19:ZNF460:57305152:57308529:+,GUGAUCCGCCCACCUCAGCCUCCCAAAGUGCUGGGAUUACAGGCGU...


In [10]:
# One-hot encode every sequence. The extra positional arguments handed to
# makes_seqs_ready are max_len=16000 and left_pad_max=10. The left pad is drawn
# with random.randint, so the result is only repeatable if the `random` module
# is seeded beforehand.
c9ALS_seqs_data['one_hot'] = c9ALS_seqs_data['sequence'].apply(makes_seqs_ready, args=(16000,10))

In [14]:
def _sample_mask(n_rows, frac, rng):
    """Boolean mask of length n_rows with int(n_rows * frac) randomly chosen entries set."""
    # dtype=int keeps an empty selection usable as an index array
    picked = np.array(rng.sample(range(n_rows), int(n_rows * frac)), dtype=int)
    mask = np.zeros(n_rows, dtype=bool)
    mask[picked] = True
    return mask


def split_data(df, test_frac=0.15, valid_frac=0.05, seed=None):
    """
    Split a pandas DataFrame into train, validation and test frames.

    Rows whose 'sequence' is missing are dropped first. A fraction `test_frac`
    of the remaining rows is sampled as the test set; a fraction `valid_frac`
    of what is left after that is sampled as the validation set; everything
    else is the training set. Set sizes are rounded down.

    Arguments:
    df: DataFrame with a 'sequence' column
    test_frac: fraction of rows used for the test set (default 0.15)
    valid_frac: fraction of the non-test rows used for validation (default 0.05)
    seed: optional seed for a private random generator, for a repeatable
          split; with the default None the global `random` module is used

    Returns the tuple (df_train, df_valid, df_test).
    """
    rng = random if seed is None else random.Random(seed)

    df = df[df['sequence'].notna()]

    # test split first, then the validation split out of the remainder
    test_mask = _sample_mask(df.shape[0], test_frac, rng)
    df_test = df[test_mask]
    remaining = df[~test_mask]

    valid_mask = _sample_mask(remaining.shape[0], valid_frac, rng)
    df_valid = remaining[valid_mask]
    df_train = remaining[~valid_mask]

    return (df_train, df_valid, df_test)


In [15]:
# Split the C9ALS sequence table into train / validation / test frames and
# print the shape of each.
df_train, df_valid, df_test = split_data(c9ALS_seqs_data)
print(df_train.shape, df_valid.shape, df_test.shape)

(88095, 4) (4636, 4) (16364, 4)


In [16]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,switch_index,switch_id,sequence,one_hot
0,0,chr12:AACS:125140928:125143316:+,GUGAGGCGGGACAAACUUGUCUUCCUCACACCCAUCUUACUUCCUC...,"[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,..."
1,1,chr4:AADAT:170060271:170060673:-,UGCAUGAAUAAUGUAUCACAACCUACUGAGCUGCGUAAAUUGUCAU...,"[[0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,..."
4,4,chr2:AAK1:69457997:69465314:-,CUCGGCCUCUCCUUAUGGAGUCUGAAGAAGAAGAUGAGAGCUGCAG...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
5,5,chr2:AAK1:69457997:69467864:-,AAAAACCCACUUUUUAACAAAUAAUUUUGAUGGGUUAGACAUGUUC...,"[[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,..."
6,6,chr2:AAK1:69457997:69472042:-,UAUAUUGAGAAGGGCUAUGAAGGAUUUAUAUAAUCCUGAUGUUCUU...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,..."


In [17]:
# Keep, for each split, only the label rows whose switch_id belongs to that split.
c9_switch_ids = c9_als['switch_id']

df_train_labels = c9_als[c9_switch_ids.isin(df_train['switch_id'])]
df_test_labels = c9_als[c9_switch_ids.isin(df_test['switch_id'])]
df_valid_labels = c9_als[c9_switch_ids.isin(df_valid['switch_id'])]

# shapes are reported in the order train, test, valid
for split_labels in (df_train_labels, df_test_labels, df_valid_labels):
    print(split_labels.shape)

(256225, 4)
(47790, 4)
(13557, 4)


In [18]:
# Save the three C9ALS sequence splits as .npy files.
for out_path, split_df in (
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_train_seqs.npy', df_train),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_valid_seqs.npy', df_valid),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_test_seqs.npy', df_test),
):
    np.save(out_path, split_df)

In [25]:
# Save the matching C9ALS label splits as .npy files too.
for out_path, split_labels in (
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_train_labels.npy', df_train_labels),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_valid_labels.npy', df_valid_labels),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/C9ALS_test_labels.npy', df_test_labels),
):
    np.save(out_path, split_labels)

In [29]:
## Same pipeline for sALS: reload labels and sequences, one-hot encode, split,
## filter the labels per split and save everything as .npy files.
sALS = pd.read_csv(data_root+'/V2/sALS_All_labels.csv', index_col=0)
sALS_seqs_data = pd.read_csv(data_root + 'V2/sALS_All_sequences.csv', index_col=0)
sALS_seqs_data['one_hot'] = sALS_seqs_data['sequence'].apply(makes_seqs_ready, args=(16000,10))

df_train, df_valid, df_test = split_data(sALS_seqs_data)
print(df_train.shape, df_valid.shape, df_test.shape)

# keep, for each split, only the label rows whose switch_id belongs to that split
sALS_switch_ids = sALS['switch_id']
df_train_labels = sALS[sALS_switch_ids.isin(df_train['switch_id'])]
print(df_train_labels.shape)
df_valid_labels = sALS[sALS_switch_ids.isin(df_valid['switch_id'])]
print(df_valid_labels.shape)
df_test_labels = sALS[sALS_switch_ids.isin(df_test['switch_id'])]
print(df_test_labels.shape)

# sequence splits first, then the label splits
for out_path, split_table in (
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_train_seqs.npy', df_train),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_valid_seqs.npy', df_valid),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_test_seqs.npy', df_test),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_train_labels.npy', df_train_labels),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_valid_labels.npy', df_valid_labels),
    ('/data1/APA/Paul_ALS_Data/bams_in/subscelltype_bamfiles/Mapper_outs/V2/sALS_test_labels.npy', df_test_labels),
):
    np.save(out_path, split_table)


(87370, 4) (4598, 4) (16229, 4)
(226977, 4)
(12028, 4)
(41965, 4)
