# Data Preparation notebook for exRNA_baseline Dataset

# Import necessary packages

In [1]:
import os
os.chdir("../../")

In [2]:
import exoNet
import scanpy as sc
import numpy as np
import pandas as pd

Using TensorFlow backend.


# Data Loading

In [3]:
data_df = pd.read_csv("./Data/oldDataRefined/DesignMatrices/4_DesignMat_SS_Kmer_Label.csv", index_col='id')
data_df.shape

(11425, 270)

In [4]:
data_columns = data_df.columns.tolist()

In [5]:
seq_columns = ['seq', 'DB']
fcn_columns = [column for column in data_columns if not column in seq_columns]

In [6]:
seq_data_df = data_df[seq_columns]
seq_data_df.head()

Unnamed: 0_level_0,seq,DB
id,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1_100038134_100038156,CUCGCCGAGCCGGGCCGUCAUCA,...(((......)))........
chr1_100088027_100088063,ACUUUACAGAGUCCAUUUGUCCACCCGUAGUGUCGAG,.....(((((.....))))).(((.....))).....
chr1_102096379_102096418,GUUGGGGCACAAAAUCGAGUCAUCGAUUUCGAGACGUGGA,.(((.....)))...((.(((.(((....)))))).))..
chr1_102151616_102151639,GAAAUCUCUGCCCCAGAACGAGAU,...(((((...........)))))
chr1_10298983_10299071,UGUAUAUGAUUUUAACCUUGCUAUGUCUCUUCUAAUCGUACCGGGG...,..............(((((((...((((....(((.(((((.((((...


In [7]:
fcn_data_df = data_df[fcn_columns]
fcn_data_df.head()

Unnamed: 0_level_0,chr,length,a,c,g,u,ic,ev,label,FreeEnergy,...,UUCG,UUCU,UUGA,UUGC,UUGG,UUGU,UUUA,UUUC,UUUG,UUUU
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_100038134_100038156,chr1,23,3,10,7,3,47,0,NO,-4.8,...,0,0,0,0,0,0,0,0,0,0
chr1_100088027_100088063,chr1,37,8,10,8,11,3836,0,NO,-2.3,...,0,0,0,0,0,1,1,0,1,0
chr1_102096379_102096418,chr1,40,11,7,13,9,39,0,NO,-6.8,...,1,0,0,0,1,0,0,1,0,0
chr1_102151616_102151639,chr1,24,8,7,5,4,83,0,NO,-2.6,...,0,0,0,0,0,0,0,0,0,0
chr1_10298983_10299071,chr1,89,20,19,18,32,618,9,YES,-20.6,...,1,1,0,1,0,0,2,0,0,1


In [8]:
labels = fcn_data_df['label'].values

In [9]:
np.save(file="./Data/exRNA_baseline/labels.npy", arr=labels)

In [10]:
fcn_data_df.drop(['label', 'chr'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [11]:
np.save(file="./Data/exRNA_baseline/features.npy", arr=fcn_data_df.values)

# Data Pre-processing

In [12]:
seq_data = seq_data_df['seq'].values
seq_data

array(['CUCGCCGAGCCGGGCCGUCAUCA', 'ACUUUACAGAGUCCAUUUGUCCACCCGUAGUGUCGAG',
       'GUUGGGGCACAAAAUCGAGUCAUCGAUUUCGAGACGUGGA', ...,
       'CUCCGACCACAAUCGUUUUUU',
       'CCAGGGUCAGGAUCGACGACCGUAGUGAUAUGAUGAUUGUCUGGCGUUGGAGUUGUGGUGGAAGAA',
       'AAAAAAAAAAACAGAGGGUUGG'], dtype=object)

In [13]:
char_encoder = {
    'N': -1,
    'A': 0,
    'C': 1,
    'G': 2,
    'U': 3
}

In [14]:
seq_encoded = exoNet.pp.seq_encoder(seq_data, char_encoder, 200)
seq_encoded.shape

(11425, 200, 4)

In [15]:
db_encoder = {
    '.': 0,
    ')': 1,
    '(': 2,
}

In [16]:
db_encoded = seq_data_df['DB'].values
db_encoded = exoNet.pp.seq_encoder(db_encoded, db_encoder, 200, unknown_char=False)
db_encoded.shape

(11425, 200, 3)

In [17]:
seq_encoded = np.concatenate([seq_encoded, db_encoded], axis=2)
seq_encoded.shape

(11425, 200, 7)

In [18]:
np.save(file="./Data/exRNA_baseline/sequences.npy", arr=seq_encoded)