In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [64]:
# TODO: bin: length, fraction of each base
features_df = pd.read_csv("features.csv")
features_df = features_df.drop(columns=['Unnamed: 0', 'score'])

In [4]:
features_df.columns

Index(['+ve strand', 'length', 'longest A', 'longest C', 'longest G',
       'longest T', 'As', 'Cs', 'Gs', 'Ts',
       ...
       'pos_23_GC', 'pos_23_GG', 'pos_23_GT', 'pos_23_TA', 'pos_23_TC',
       'pos_23_TG', 'dist prim 3'', 'dist prim 5'', 'dist snd 3'',
       'dist snd 5''],
      dtype='object', length=501)

In [14]:
longest.max()

longest A    8
longest C    8
longest G    9
longest T    9
dtype: int64

In [65]:
# Bin longest run of each nucleotide
longest = features_df[['longest A', 'longest C', 'longest G', 'longest T']]

for index, nucl in enumerate(['A', 'C', 'T', 'G']):
    for run_length in range(longest.max()[index]):
        colname = 'longest_' + nucl + "_" + str(run_length)
        features_df[colname] = longest['longest ' + nucl] == run_length

features_df.head()  

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,As,Cs,Gs,Ts,...,longest_T_8,longest_G_0,longest_G_1,longest_G_2,longest_G_3,longest_G_4,longest_G_5,longest_G_6,longest_G_7,longest_G_8
0,1,24,1,1,2,2,0.142857,0.238095,0.428571,0.190476,...,False,False,False,True,False,False,False,False,False,False
1,1,24,1,4,2,1,0.142857,0.47619,0.333333,0.047619,...,False,False,False,True,False,False,False,False,False,False
2,1,24,1,3,2,1,0.095238,0.380952,0.380952,0.142857,...,False,False,False,True,False,False,False,False,False,False
3,1,24,1,3,2,1,0.238095,0.238095,0.428571,0.095238,...,False,False,False,True,False,False,False,False,False,False
4,1,24,1,3,3,2,0.047619,0.285714,0.52381,0.142857,...,False,False,False,False,True,False,False,False,False,False


In [66]:
np.arange(0, 1, 0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [67]:
# Bin nucleotide composition
nucl_comp = features_df[['As', 'Cs', 'Gs', 'Ts', 'GC', 'CA', 'AG']]
crit = np.arange(0.1, 1, 0.1)
for index, nucl in enumerate(['A', 'C', 'T', 'G']):
    for index in crit:
        colname = nucl + '_comp_' + str(round(index, 2))
        upper = nucl_comp[nucl + "s"] < index
        lower = nucl_comp[nucl + "s"] > index - 0.1
        features_df[colname] = upper & lower

features_df.head()  

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,As,Cs,Gs,Ts,...,T_comp_0.9,G_comp_0.1,G_comp_0.2,G_comp_0.3,G_comp_0.4,G_comp_0.5,G_comp_0.6,G_comp_0.7,G_comp_0.8,G_comp_0.9
0,1,24,1,1,2,2,0.142857,0.238095,0.428571,0.190476,...,False,False,False,False,False,True,False,False,False,False
1,1,24,1,4,2,1,0.142857,0.47619,0.333333,0.047619,...,False,False,False,False,True,False,False,False,False,False
2,1,24,1,3,2,1,0.095238,0.380952,0.380952,0.142857,...,False,False,False,False,True,False,False,False,False,False
3,1,24,1,3,2,1,0.238095,0.238095,0.428571,0.095238,...,False,False,False,False,False,True,False,False,False,False
4,1,24,1,3,3,2,0.047619,0.285714,0.52381,0.142857,...,False,False,False,False,False,False,True,False,False,False


In [68]:
# Length Bins
length = features_df[['length']]
max_length = int(length.max())
min_length = int(length.min())

for leng in range(min_length, max_length + 1):
        colname = 'length_' + str(leng)
        features_df[colname] = length == leng

features_df.head()  

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,As,Cs,Gs,Ts,...,G_comp_0.8,G_comp_0.9,length_21,length_22,length_23,length_24,length_25,length_26,length_27,length_28
0,1,24,1,1,2,2,0.142857,0.238095,0.428571,0.190476,...,False,False,False,False,False,True,False,False,False,False
1,1,24,1,4,2,1,0.142857,0.47619,0.333333,0.047619,...,False,False,False,False,False,True,False,False,False,False
2,1,24,1,3,2,1,0.095238,0.380952,0.380952,0.142857,...,False,False,False,False,False,True,False,False,False,False
3,1,24,1,3,2,1,0.238095,0.238095,0.428571,0.095238,...,False,False,False,False,False,True,False,False,False,False
4,1,24,1,3,3,2,0.047619,0.285714,0.52381,0.142857,...,False,False,False,False,False,True,False,False,False,False


In [69]:
features_df.drop(columns=['length', 
                          'longest A', 'longest C', 'longest G', 'longest T', 
                          'As', 'Cs', 'Gs', 'Ts',
                          'GC', 'CA', 'AG'])

Unnamed: 0,+ve strand,length.1,pos_0_A,pos_0_C,pos_0_G,pos_0_T,pos_1_A,pos_1_C,pos_1_G,pos_1_T,...,G_comp_0.8,G_comp_0.9,length_21,length_22,length_23,length_24,length_25,length_26,length_27,length_28
0,1,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
1,1,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
2,1,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
3,1,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
4,1,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17245,1,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
17246,1,20,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
17247,0,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
17248,0,21,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False


In [72]:
features_df.to_csv('features_binned.csv')

In [41]:
from sklearn.preprocessing import KBinsDiscretizer

In [47]:
est = KBinsDiscretizer(n_bins=10, encode='onehot', strategy='uniform')
length_bins = est.fit_transform(length)
pd.DataFrame(length_bins)

Unnamed: 0,0
0,"(0, 4)\t1.0"
1,"(0, 4)\t1.0"
2,"(0, 4)\t1.0"
3,"(0, 4)\t1.0"
4,"(0, 4)\t1.0"
...,...
17245,"(0, 4)\t1.0"
17246,"(0, 2)\t1.0"
17247,"(0, 4)\t1.0"
17248,"(0, 4)\t1.0"
