-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_prepared.py
141 lines (114 loc) · 3.65 KB
/
data_prepared.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 29 11:04:04 2018
@author: noch
"""
import os
from itertools import product

import numpy as np
import pandas as pd
# Default location of the dataset; it must contain the "training_set" and
# "testing_set" sub-directories.  Pass ``root=...`` to read_data to use a
# checkout that lives somewhere else.
_DEFAULT_DATASET_ROOT = "/Users/noch/Documents/workspace/data_challenge/dataset/"


def read_data(st, isTr, root=_DEFAULT_DATASET_ROOT):
    """Load one CSV file of the dataset into a DataFrame.

    Args:
        st: File name without the ".csv" extension.
        isTr: Truthy to read from the "training_set" sub-directory, falsy to
            read from "testing_set".
        root: Directory that contains both sub-directories.  Defaults to the
            path that used to be hard-coded here, so existing two-argument
            calls behave as before.

    Returns:
        The DataFrame produced by ``pd.read_csv(path, index_col=False)``.
    """
    subdir = "training_set" if isTr else "testing_set"
    path = os.path.join(root, subdir, st + ".csv")
    # index_col=False: do not let pandas use the first CSV column as index.
    return pd.read_csv(path, index_col=False)
def creat_col_name(r):
    """Return the feature column names for k-mers of length ``r``.

    The list starts with the boundary k-mers: for every (r-1)-letter word
    ``w`` over "ATGC" (in ``itertools.product`` order) it holds ``"<" + w``
    immediately followed by ``w + ">"``.  After those come all ``r``-letter
    words over "ATGC", again in ``itertools.product`` order.

    Args:
        r: k-mer length; must be at least 1.

    Returns:
        A list of ``2 * 4**(r-1) + 4**r`` strings, each ``r`` characters long.

    Raises:
        ValueError: if ``r`` is smaller than 1.
    """
    if r < 1:
        raise ValueError("r must be at least 1")
    names = []
    for word in product("ATGC", repeat=r - 1):
        stem = "".join(word)
        names.append("<" + stem)
        names.append(stem + ">")
    names.extend("".join(word) for word in product("ATGC", repeat=r))
    return names


def _count_kmers(X, num_char):
    """Count the ``num_char``-long windows of every padded DNA sequence.

    Each value of ``X['DNA']`` is converted with ``str`` and wrapped as
    ``"<" + sequence + ">"``.  Every window of the padded string is looked up
    among the names from ``creat_col_name(num_char)``; windows that match no
    column (e.g. containing a letter outside "ATGC") are ignored.

    Returns:
        A tuple ``(columns, counts, n_windows)``: the column names, an int64
        matrix of shape (rows of X, number of columns) and an int64 vector
        with the number of windows scanned per row (0 when the padded
        sequence is shorter than ``num_char``).
    """
    columns = creat_col_name(num_char)
    position = {name: j for j, name in enumerate(columns)}
    # An empty frame needs no 'DNA' column, as with the former iterrows loop.
    sequences = X['DNA'].tolist() if len(X) else []
    counts = np.zeros((len(sequences), len(columns)), dtype=np.int64)
    n_windows = np.zeros(len(sequences), dtype=np.int64)
    for i, seq in enumerate(sequences):
        padded = "<" + str(seq) + ">"
        # The bound uses the padded length so the windows touching ">" are
        # scanned too; bounding by the raw length skipped the last two.
        total = max(len(padded) - num_char + 1, 0)
        n_windows[i] = total
        for start in range(total):
            j = position.get(padded[start:start + num_char])
            if j is not None:
                counts[i, j] += 1
    return columns, counts, n_windows


def prepare_data_div(X, num_char):
    """Return k-mer frequencies: occurrence counts divided by window count.

    Args:
        X: DataFrame with a 'DNA' column.
        num_char: k-mer length (at least 1).

    Returns:
        A float DataFrame indexed like ``X`` with one column per name from
        ``creat_col_name(num_char)``.  Each count is divided by the number of
        windows of the padded sequence; rows without any window are all 0.
    """
    columns, counts, n_windows = _count_kmers(X, num_char)
    freq = np.zeros(counts.shape, dtype=float)
    has_windows = n_windows > 0  # guards against division by zero
    freq[has_windows] = (
        counts[has_windows] / n_windows[has_windows][:, np.newaxis]
    )
    return pd.DataFrame(freq, index=X.index, columns=columns)


def prepare_data_no_div(X, num_char):
    """Return raw k-mer occurrence counts.

    Args:
        X: DataFrame with a 'DNA' column.
        num_char: k-mer length (at least 1).

    Returns:
        An integer DataFrame indexed like ``X`` with one column per name from
        ``creat_col_name(num_char)``.
    """
    columns, counts, _ = _count_kmers(X, num_char)
    return pd.DataFrame(counts, index=X.index, columns=columns)


def prepare_data_bi(X, num_char):
    """Return k-mer presence flags: 1 if the k-mer occurs, else 0.

    Args:
        X: DataFrame with a 'DNA' column.
        num_char: k-mer length (at least 1).

    Returns:
        An integer DataFrame of 0/1 values indexed like ``X`` with one column
        per name from ``creat_col_name(num_char)``.
    """
    columns, counts, _ = _count_kmers(X, num_char)
    flags = (counts > 0).astype(np.int64)
    return pd.DataFrame(flags, index=X.index, columns=columns)
def split_data(df, tr_num):
    """Randomly partition the rows of ``df`` into two frames.

    One uniform number is drawn per row with ``np.random.rand``; a row goes
    into the first frame when its draw is below ``tr_num / 100`` and into the
    second frame otherwise.  The split sizes are therefore only approximately
    ``tr_num`` percent.

    Args:
        df: Object supporting ``len`` and boolean-mask indexing (a DataFrame).
        tr_num: Percentage (0-100) of rows targeted for the first frame.

    Returns:
        A tuple ``(selected_rows, remaining_rows)``.
    """
    draws = np.random.rand(len(df))
    threshold = tr_num / 100
    selected = draws < threshold
    first_part = df[selected]
    second_part = df[np.logical_not(selected)]
    return (first_part, second_part)