In [10]:
#Boilerplate imports
import pandas as pd
import numpy as np
import random
import scipy
import itertools
from collections import Counter
from collections import defaultdict
import functools
import math
from sklearn.model_selection import train_test_split

In [2]:
##Import data

#file paths
#low expression population
raws1path = 'D:/Projects/iSynPro/SynPro/Complete Raw data for sorted cells/S1.fa.uniq.tcga.all.csv'
raws3path = 'D:/Projects/iSynPro/SynPro/Complete Raw data for sorted cells/S3.fa.uniq.tcga.all.csv'
raws5path = 'D:/Projects/iSynPro/SynPro/Complete Raw data for sorted cells/S5.fa.uniq.tcga.all.csv'
#high expression population
raws2path = 'D:/Projects/iSynPro/SynPro/Complete Raw data for sorted cells/S2.fa.uniq.tcga.all.csv'
raws4path = 'D:/Projects/iSynPro/SynPro/Complete Raw data for sorted cells/S4.fa.uniq.tcga.all.csv'
raws6path = 'D:/Projects/iSynPro/SynPro/Complete Raw data for sorted cells/S6.fa.uniq.tcga.all.csv'

#note order of list; low expressors then high espressor datasets
path_list = [raws1path, raws3path, raws5path, raws2path, raws4path, raws6path]

#read in a csv with a single column, split to divide '# of hits' from nucleotide sequence, filter out sequences with < 5 hits
def read_in(filepath):
    dfs1 = pd.read_csv(filepath, engine='python', sep=None, header=None, names=['count','sequence'])
    dfs2 = dfs1.loc[dfs1['count'] > 5]
    return dfs2

#clean up read in sequences, remove sequencing artifact
def clean_read(raws1path):
    p1_raw_df = read_in(raws1path)
    #p1_raw_df['sequence'].replace(regex=True,inplace=True,to_replace=r'TCGAGTAGAGTCTAGACTCTACATTTTGACACCCCCA',value=r'')
    il2mpseq = 'ACATTTTGACACCCCCATAATATTTTTCCAGAATTAACAGTATAAATTGCATCTCTTGTTCAAGAGTTCCCTATCACTCTCTTTAATCACTACTCACAGTAACCTCAACTCCTG'
    p1_raw_df['sequence'] = ['{}{}'.format(s, il2mpseq) for s in p1_raw_df['sequence'].values]
    p1_raw_df = p1_raw_df.reset_index()
    del p1_raw_df['index']
    del p1_raw_df['count']
    return p1_raw_df

sense_cleanlist = [clean_read(path) for path in path_list]
sense_cleanlist[0].head()

Unnamed: 0,sequence
0,TCGATCTCCGCCCCCTCTTCGAAGATCAAAGGGTCGAGGGGACTTT...
1,TCGAATGAGTCACATCGAAGATCAAAGGGTCGACCCTTTGATCTTC...
2,TCGAGGGGACTTTCCTCGAGGGGGCGGGGTCGATTTGGCGCTCGAT...
3,TCGATTTCCAAGAAATCGAGGAAAGTCCCCTCGACCCTTTGATCTT...
4,TCGAATGACATCATCTTTCGAAGATCAAAGGGTCGAGGGGACTTTC...


In [3]:
#extract shared sequences between high and low expressors
crosscat_sequences = pd.DataFrame(columns=['sequence'])
for i in range(3):
    common_tstlst = [sense_cleanlist[i], sense_cleanlist[3], sense_cleanlist[4], sense_cleanlist[5]]
    common = functools.reduce(lambda left,right: pd.merge(left,right,on='sequence'), common_tstlst)
    crosscat_sequences = pd.concat([crosscat_sequences, common])

#remove cross class sequences from each dataset
for i in range(len(sense_cleanlist)):
    sense_cleanlist[i] = sense_cleanlist[i][(~sense_cleanlist[i].sequence.isin(crosscat_sequences.sequence))]

for i in [0,1,3,4]:
    linked_common = functools.reduce(lambda left,right: pd.merge(left,right,on='sequence'), 
                                     [sense_cleanlist[i], sense_cleanlist[i+1]])
    sense_cleanlist[i] = sense_cleanlist[i][(~sense_cleanlist[i].sequence.isin(linked_common.sequence))]

In [4]:
#This function encodes each base into a 4 digit binary code
def one_hot_coder(df): 
    df['sequence'] = [list(seq) for seq in df['sequence'].values]
    ltrdict = {'A':[1,0,0,0],'G':[0,1,0,0],'C':[0,0,1,0],'T':[0,0,0,1], 'N':[0,0,0,0]}
    for seq in df['sequence']:
        for i, e in enumerate(seq):
            if e in ltrdict:
                seq[i] = ltrdict[e]
    return list(df['sequence'].values)

#list of datasets with sequences converted into one-hot format
hot_list = [one_hot_coder(df) for df in sense_cleanlist]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
#Find the max seq length between all datasets (in terms of # of bases)
seqlen_list = [[len(seq) for seq in hot] for hot in hot_list]
max_seq = max([max(hot) for hot in seqlen_list]) + 1
backpad = 20

#create a df that randomly splits the difference between current seq length and max seq length (to add to front and back of seq)
def create_add_df(seqlen_lst):
    add_df = pd.DataFrame()
    add_df['lentoadd'] = [(max_seq - seql) for seql in seqlen_lst]
    add_df['frontadd'] = [random.randrange(0, n) for n in add_df['lentoadd']]
    add_df['backadd'] = add_df['lentoadd'] - add_df['frontadd']
    return add_df

#create a df that adds variable padding to only the front of sequences. same format to fit into pipeline
#we add a static pad to the end of the sequences for easy retrieval of saliency at the end of the pipeline
def front_add_df(seqlen_lst):
    add_df = pd.DataFrame()
    add_df['lentoadd'] = [(max_seq - seql) for seql in seqlen_lst]
    add_df['frontadd'] = add_df['lentoadd']
    add_df['backadd'] = backpad
    return add_df

add_dfs = [front_add_df(seqlen) for seqlen in seqlen_list]
add_dfs[0].head()

Unnamed: 0,lentoadd,frontadd,backadd
0,151,151,20
1,305,305,20
2,277,277,20
3,285,285,20
4,320,320,20


In [6]:
#this pads the sequences with empty data (e.g. [0,0,0,0]) such that all sequences are the same length now
def one_hot_equalizer(flat_lst, add_df):
    equalized = []
    for i, l in enumerate(flat_lst):
        equalized.append(int(add_df['frontadd'][i])*[[0,0,0,0]] + l + int(add_df['backadd'][i])*[[0,0,0,0]])
    return equalized

#list of datasets with padded sequences
hotpad_list = [one_hot_equalizer(hot, add_dfs[i]) for i, hot in enumerate(hot_list)]
print(len(hotpad_list[0][0]))

672


In [7]:
#transpose each sequence such that a sequence is a 1D list with 4 channels (not a list of 4 channels @ each position)
channeled_hotlists = [[np.transpose(np.array(seq)).tolist() for seq in pad] for pad in hotpad_list]

In [12]:
#final features and labels
y = list([0]*(len(channeled_hotlists[0]) + len(channeled_hotlists[1]) + len(channeled_hotlists[2])) 
             + [1]*(len(channeled_hotlists[3]) + len(channeled_hotlists[4]) + len(channeled_hotlists[5])))

#use this instead to transform labels into multi-class
#label_dict ={0:[0,1],1:[1,0]}
#y_hot = []
#for i, n in enumerate(y):
#    if n in label_dict:
#        y[i] = label_dict[n]
y = np.array(y)

x = np.array([item for sublist in channeled_hotlists for item in sublist])

#randomly split and shuffle into training and test sets. 
#Note our test set is for auc/roc calculations, not for validation during training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=16)

x_train = np.transpose(x_train, axes=(0, 2, 1))
x_test = np.transpose(x_test, axes=(0, 2, 1))

#y_train = np.transpose(y_train, axes=(1,0))
#y_test = np.transpose(y_train, axes=(1,0))

root_savepath = 'D:/Projects/iSynPro/iSynPro/DanQCNNLSTM'
save_files = [x_train, y_train, x_test, y_test]
save_names = ['x_train', 'y_train', 'x_test', 'y_test']
for i in range(len(save_files)):
    np.save('{}/{}.npy'.format(root_savepath, save_names[i]), save_files[i])
