# Preprocessing

1. Extract sequence features
2. Encode each sequence as a one-hot numerical feature vector
3. Save to `csv` and `hdf` files

In [1]:
import os
import re
import import_ipynb
import numpy as np
import pandas as pd
from utils import load_sequences
from tqdm import tqdm
from functools import reduce

importing Jupyter notebook from utils.ipynb


In [2]:
# set your configurations here

# Flip this switch to pick the dataset: True -> training set, False -> testing set.
use_training_set = True

if use_training_set:   ## for training set
    sequences_dir, ftrs_and_labels_csv, ftrs_and_labels_h5 = (
        "Data_files/Sequences_data/Training Set/",
        "Data_files/features&labels/training_set.csv",
        "Data_files/features&labels/training_set.h5",
    )
else:   ## for testing set
    sequences_dir, ftrs_and_labels_csv, ftrs_and_labels_h5 = (
        "Data_files/Sequences_data/Testing Set/",
        "Data_files/features&labels/testing_set.csv",
        "Data_files/features&labels/testing_set.h5",
    )

# Both datasets use the same chunk size.
chunk_size = 21

# Display options for the DataFrame previews further down:
# truncate long cell contents and never hide columns.
pd.set_option('max_colwidth', 20)
pd.set_option('display.max_columns', None)

## 特征提取

In [3]:
# `load_sequences` comes from the local `utils` notebook; the preview below shows
# it yields at least the columns Name, CDSjoin and Sequence.
# `df` is a second name for the same frame, so new columns land on `sequences_df` too.
sequences_df = load_sequences(sequences_dir)
df = sequences_df


# Number of characters sliced on each side of a site position.
hlfchs = int((chunk_size - 1) / 2)
tqdm.pandas(desc="processing")


def _interval_bounds(cds_join, which):
    """Return the set of integers found at index `which` of every 'start..end' item in `cds_join`."""
    return {int(interval.split('..')[which]) for interval in cds_join.split(',')}


def _non_splice_sites(row):
    """List the positions in the inner range of the sequence that are neither acceptor nor donor sites."""
    candidates = set(range(hlfchs + 1, len(row['Sequence']) - hlfchs))
    return list(candidates - row['Accpt_sites'] - row['Donor_sites'])


def _window_extractor(sites_col):
    """Build a row function that slices `Sequence` around every position stored in `sites_col`."""
    def extract(row):
        sequence = row['Sequence']
        # Each window holds up to 2 * hlfchs characters: [site - hlfchs, site + hlfchs).
        return np.array([sequence[site - hlfchs:site + hlfchs] for site in row[sites_col]])
    return extract


# Donor sites are the interval ends, acceptor sites the interval starts.
df['Donor_sites'] = df['CDSjoin'].apply(lambda cds: _interval_bounds(cds, 1))
df['Accpt_sites'] = df['CDSjoin'].apply(lambda cds: _interval_bounds(cds, 0))
df['Other_sites'] = df.progress_apply(_non_splice_sites, axis=1)

for sites_col, seqs_col in (('Donor_sites', 'Donor_seqs'),
                            ('Accpt_sites', 'Accpt_seqs'),
                            ('Other_sites', 'Other_seqs')):
    df[seqs_col] = df.progress_apply(_window_extractor(sites_col), axis=1)

df[0:10]

462it [00:04, 95.85it/s] 
processing: 100%|██████████| 462/462 [00:01<00:00, 325.08it/s]
processing: 100%|██████████| 462/462 [00:00<00:00, 3705.94it/s]
processing: 100%|██████████| 462/462 [00:00<00:00, 5264.22it/s]
processing: 100%|██████████| 462/462 [01:00<00:00,  7.61it/s]


Unnamed: 0,Name,CDSjoin,Sequence,Donor_sites,Accpt_sites,Other_sites,Donor_seqs,Accpt_seqs,Other_seqs
0,LOCUS AB00...,"28199..28271,288...",gcggccggaattaacc...,"{34586, 28988, 2...","{28881, 34291, 2...","[11, 12, 13, 14,...",[attgccatgaggacc...,[ccctctcagggactt...,[cggccggaattaacc...
1,LOCUS AB00...,"9106..9239,9843....",ggtgaaacctcatctc...,"{17315, 16934, 2...","{17408, 20323, 1...","[11, 12, 13, 14,...",[ggcgttgctggtggg...,[tatgtgcagggtggc...,[gtgaaacctcatctc...
2,LOCUS AB00...,"2301..2483,5205....",catctgaggccactct...,"{11458, 14148, 5...","{6208, 13312, 78...","[11, 12, 13, 14,...",[cgaccctcaggtggg...,[ttcttacaggtgatc...,[atctgaggccactct...
3,LOCUS AB00...,"1..195,845..1035...",atgacccagaccctca...,"{2944, 195, 3431...","{1, 3234, 2116, ...","[11, 12, 13, 14,...",[ccccaaaaatgtgag...,"[, ttgcaccagacgc...",[tgacccagaccctca...
4,LOCUS AB00...,"8540..9479,10624...",ccaatcagtttaaatt...,"{10949, 9479}","{10624, 8540}","[11, 12, 13, 14,...",[tcaagaatagcaaga...,[ttctttcagagatga...,[caatcagtttaaatt...
5,LOCUS AC00...,"6673..6718,6877....",gatcacttgaagccag...,"{11114, 7245, 24...","{10818, 24554, 2...","[11, 12, 13, 14,...",[gacgtcaagtgtgag...,[ctcgtatagacaccc...,[atcacttgaagccag...
6,LOCUS AC00...,"69873..69926,718...",caactccagtttgacc...,"{71910, 69926, 7...","{73632, 73320, 7...","[11, 12, 13, 14,...",[cagctatgaggtaat...,[tctcaccagtaccct...,[aactccagtttgacc...
7,LOCUS AC00...,"9005..9080,9953....",aagcttgccctgggag...,"{9080, 10609, 10...","{10504, 9953, 9005}","[11, 12, 13, 14,...",[gctcagccaggtaag...,[cctccacagcttcag...,[agcttgccctgggag...
8,LOCUS AC00...,"47311..47384,533...",aagcttttgtgcttca...,"{72842, 55052, 6...","{56296, 65420, 6...","[11, 12, 13, 14,...",[tgagctctaacagac...,[gtttataagggatta...,[agcttttgtgcttca...
9,LOCUS AC00...,"15522..15648,473...",aagcttcttagtttat...,"{15648, 65600, 6...","{15522, 65442, 5...","[11, 12, 13, 14,...",[ggccaaggtggtaag...,[ttttgcattatgtgc...,[agcttcttagtttat...


## 编码特征向量

In [4]:
def onehot_enc(onehot_matrix, dna_seq):
    """One-hot encode a lowercase DNA string.

    Every base selects one row of ``onehot_matrix`` ('a' -> row 0, 'c' -> row 1,
    'g' -> row 2, 't' -> row 3); the selected rows are concatenated in
    sequence order.

    Parameters
    ----------
    onehot_matrix : numpy.ndarray
        Lookup table indexed by base code along its first axis.
    dna_seq : str
        Sequence consisting only of the characters 'a', 'c', 'g' and 't'.

    Returns
    -------
    numpy.ndarray
        Concatenation of the looked-up rows. An empty ``dna_seq`` yields an
        empty array with the dtype of ``onehot_matrix``.

    Raises
    ------
    ValueError
        If ``dna_seq`` contains any character other than 'a', 'c', 'g', 't'.
        This includes digits, which a replace-then-``int`` approach would
        silently treat as already-encoded bases.
    """
    base_to_row = {'a': 0, 'c': 1, 'g': 2, 't': 3}
    try:
        rows = [onehot_matrix[base_to_row[base]] for base in dna_seq]
    except KeyError as err:
        raise ValueError(
            "cannot one-hot encode base {!r} in sequence {!r}".format(err.args[0], str(dna_seq))
        ) from None
    if not rows:
        # np.concatenate rejects an empty list, so build the empty result directly.
        return np.empty((0,) + onehot_matrix.shape[2:], dtype=onehot_matrix.dtype)
    return np.concatenate(rows)

# Matches any lowercase letter. It is applied AFTER a/c/g/t have been replaced by
# digits, so a hit means the sequence contains a letter other than a, c, g or t.
extra_base = re.compile(pattern='[a-z]')


def enc_by_replc(x):
    """Return `x` with the bases a, c, g, t replaced by the digits 0, 1, 2, 3."""
    return x.replace('a', '0').replace('c', '1').replace('g', '2').replace('t', '3')


def sequence_filter(sequences, regex_compiled, seq_len=20):
    """Keep only the sequences that have the expected length and no unwanted characters.

    Parameters
    ----------
    sequences : iterable of str
        Candidate sequence windows.
    regex_compiled : compiled regular expression
        Pattern searched in the digit-encoded form of each sequence (see
        ``enc_by_replc``); a sequence is dropped when the pattern matches.
    seq_len : int, optional
        Exact length a sequence must have to be kept. Defaults to 20.

    Returns
    -------
    list of str
        The retained sequences, in their original order.
    """
    return [seq for seq in sequences
            if len(seq) == seq_len and regex_compiled.search(enc_by_replc(seq)) is None]

In [5]:
# Replace each window column by its filtered version, record by record.
for col in ('Donor_seqs', 'Accpt_seqs', 'Other_seqs'):
    kept_per_record = []
    for record_seqs in tqdm(df[col]):
        kept_per_record.append(sequence_filter(record_seqs, regex_compiled=extra_base))
    df[col] = kept_per_record

100%|██████████| 462/462 [00:00<00:00, 30881.76it/s]
100%|██████████| 462/462 [00:00<00:00, 24383.03it/s]
100%|██████████| 462/462 [00:15<00:00, 41.12it/s]


In [6]:
# 4x4 identity matrix: row k is the one-hot code selected by base code k.
# The built-in `int` replaces `np.int`, a plain alias of it that was deprecated
# in NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
oh_matrix = np.eye(4, dtype=int)

# Encode every window of every record; each cell of a *_ftrs column becomes an
# array holding one encoded vector per window of the matching *_seqs cell.
for seqs_col, ftrs_col in (('Donor_seqs', 'Donor_ftrs'),
                           ('Accpt_seqs', 'Accpt_ftrs'),
                           ('Other_seqs', 'Other_ftrs')):
    df[ftrs_col] = df.progress_apply(
        lambda x, seqs_col=seqs_col: np.array(
            [onehot_enc(onehot_matrix=oh_matrix, dna_seq=i) for i in x[seqs_col]]
        ),
        axis=1,
    )

df[0:10]

processing: 100%|██████████| 462/462 [00:00<00:00, 2237.30it/s]
processing: 100%|██████████| 462/462 [00:00<00:00, 3172.08it/s]
processing: 100%|██████████| 462/462 [02:26<00:00,  5.73it/s]


Unnamed: 0,Name,CDSjoin,Sequence,Donor_sites,Accpt_sites,Other_sites,Donor_seqs,Accpt_seqs,Other_seqs,Donor_ftrs,Accpt_ftrs,Other_ftrs
0,LOCUS AB00...,"28199..28271,288...",gcggccggaattaacc...,"{34586, 28988, 2...","{28881, 34291, 2...","[11, 12, 13, 14,...",[attgccatgaggacc...,[ccctctcagggactt...,[cggccggaattaacc...,"[[1, 0, 0, 0, 0,...","[[0, 1, 0, 0, 0,...","[[0, 1, 0, 0, 0,..."
1,LOCUS AB00...,"9106..9239,9843....",ggtgaaacctcatctc...,"{17315, 16934, 2...","{17408, 20323, 1...","[11, 12, 13, 14,...",[ggcgttgctggtggg...,[tatgtgcagggtggc...,[gtgaaacctcatctc...,"[[0, 0, 1, 0, 0,...","[[0, 0, 0, 1, 1,...","[[0, 0, 1, 0, 0,..."
2,LOCUS AB00...,"2301..2483,5205....",catctgaggccactct...,"{11458, 14148, 5...","{6208, 13312, 78...","[11, 12, 13, 14,...",[cgaccctcaggtggg...,[ttcttacaggtgatc...,[atctgaggccactct...,"[[0, 1, 0, 0, 0,...","[[0, 0, 0, 1, 0,...","[[1, 0, 0, 0, 0,..."
3,LOCUS AB00...,"1..195,845..1035...",atgacccagaccctca...,"{2944, 195, 3431...","{1, 3234, 2116, ...","[11, 12, 13, 14,...",[ccccaaaaatgtgag...,[ttgcaccagacgctg...,[tgacccagaccctca...,"[[0, 1, 0, 0, 0,...","[[0, 0, 0, 1, 0,...","[[0, 0, 0, 1, 0,..."
4,LOCUS AB00...,"8540..9479,10624...",ccaatcagtttaaatt...,"{10949, 9479}","{10624, 8540}","[11, 12, 13, 14,...",[tcaagaatagcaaga...,[ttctttcagagatga...,[caatcagtttaaatt...,"[[0, 0, 0, 1, 0,...","[[0, 0, 0, 1, 0,...","[[0, 1, 0, 0, 1,..."
5,LOCUS AC00...,"6673..6718,6877....",gatcacttgaagccag...,"{11114, 7245, 24...","{10818, 24554, 2...","[11, 12, 13, 14,...",[gacgtcaagtgtgag...,[ctcgtatagacaccc...,[atcacttgaagccag...,"[[0, 0, 1, 0, 1,...","[[0, 1, 0, 0, 0,...","[[1, 0, 0, 0, 0,..."
6,LOCUS AC00...,"69873..69926,718...",caactccagtttgacc...,"{71910, 69926, 7...","{73632, 73320, 7...","[11, 12, 13, 14,...",[cagctatgaggtaat...,[tctcaccagtaccct...,[aactccagtttgacc...,"[[0, 1, 0, 0, 1,...","[[0, 0, 0, 1, 0,...","[[1, 0, 0, 0, 1,..."
7,LOCUS AC00...,"9005..9080,9953....",aagcttgccctgggag...,"{9080, 10609, 10...","{10504, 9953, 9005}","[11, 12, 13, 14,...",[gctcagccaggtaag...,[cctccacagcttcag...,[agcttgccctgggag...,"[[0, 0, 1, 0, 0,...","[[0, 1, 0, 0, 0,...","[[1, 0, 0, 0, 0,..."
8,LOCUS AC00...,"47311..47384,533...",aagcttttgtgcttca...,"{72842, 55052, 6...","{56296, 65420, 6...","[11, 12, 13, 14,...",[tgagctctaacagac...,[gtttataagggatta...,[agcttttgtgcttca...,"[[0, 0, 0, 1, 0,...","[[0, 0, 1, 0, 0,...","[[1, 0, 0, 0, 0,..."
9,LOCUS AC00...,"15522..15648,473...",aagcttcttagtttat...,"{15648, 65600, 6...","{15522, 65442, 5...","[11, 12, 13, 14,...",[ggccaaggtggtaag...,[ttttgcattatgtgc...,[agcttcttagtttat...,"[[0, 0, 1, 0, 0,...","[[0, 0, 0, 1, 0,...","[[1, 0, 0, 0, 0,..."


## 调整格式

In [7]:
samples = pd.DataFrame()
concat = np.concatenate

# Flatten the per-record window collections into one array per class.
concatd_doseqs = concat(df['Donor_seqs'])
concatd_acseqs = concat(df['Accpt_seqs'])
concatd_otseqs = concat(df['Other_seqs'])

# Randomly keep at most 1e5 of the "other" windows. Capping at the number
# available avoids asking np.zeros for a negative length on small datasets.
n_other_kept = min(int(1e5), len(concatd_otseqs))
choose_ix = np.zeros(len(concatd_otseqs), dtype=bool)
choose_ix[:n_other_kept] = True
# A fixed seed makes the subsample (and therefore the saved files) reproducible
# across kernel restarts.
subsample_seed = 0
np.random.default_rng(subsample_seed).shuffle(choose_ix)

# Row order of `samples`: donor windows, acceptor windows, selected other windows.
samples['Seq. features'] = concat((concatd_doseqs, concatd_acseqs, concatd_otseqs[choose_ix]))

In [8]:
# Stack the encoded vectors in the same donor / acceptor / other order, applying
# the same `choose_ix` mask to the "other" class.
concatd_donums = concat(df['Donor_ftrs'])
concatd_acnums = concat(df['Accpt_ftrs'])
concatd_otnums = concat(df['Other_ftrs'])
stacked_nums = concat((concatd_donums, concatd_acnums, concatd_otnums[choose_ix]), axis=0)
samples['Num. features'] = stacked_nums.tolist()

nsamples = samples.shape[0]
ndoseqs = concatd_doseqs.shape[0]
nacseqs = concatd_acseqs.shape[0]

# Labels follow the row layout: the first `ndoseqs` rows are donors and the
# next `nacseqs` rows are acceptors; every remaining row gets 0 in both columns.
is_donor = np.zeros(nsamples)
is_donor[:ndoseqs] = 1
is_acceptor = np.zeros(nsamples)
is_acceptor[ndoseqs:ndoseqs + nacseqs] = 1

samples['IsDonor'] = is_donor
samples['IsAcceptor'] = is_acceptor

In [1]:
# Raise the column-width limit so the sample rows are readable, then preview
# the first ten rows.
pd.set_option('max_colwidth', 150)
samples.iloc[:10]

NameError: name 'pd' is not defined

## 保存到硬盘

In [10]:
# Make sure the output directories exist before writing.
folders = set([os.path.split(ftrs_and_labels_h5)[0], os.path.split(ftrs_and_labels_csv)[0]])
for folder in folders:
    # An empty string means the file goes to the current directory: nothing to create.
    if folder:
        # makedirs also creates missing parent directories and is a no-op when
        # the directory already exists.
        os.makedirs(folder, exist_ok=True)

# Write the same table twice: HDF5 (under the key 'data') and CSV.
samples.to_hdf(ftrs_and_labels_h5, key='data')
samples.to_csv(ftrs_and_labels_csv)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['Seq. features', 'Num. features']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


## 生成报告

In [12]:
# Shell escape: convert preprocessing.ipynb to an HTML report with nbconvert.
! jupyter nbconvert --to html preprocessing.ipynb

[NbConvertApp] Converting notebook preprocessing.ipynb to html
[NbConvertApp] Writing 321054 bytes to preprocessing.html
