# Binning-hCRISPRi-v2.1

This notebook bins the features processed in Featurization-hCRISPRi-v2.1.ipynb.

The data used is that from the following paper:

[Compact and highly active next-generation libraries for CRISPR-mediated gene repression and activation](https://elifesciences.org/articles/19760)

__Primary Author__: Anthony Hein
    
__Contributors__: Derived from a notebook written by Briana Macedo (Binning.ipynb).

---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# TODO: bin: length, fraction of each base
features_df = pd.read_csv("features_hCRISPRi_v2.1.csv")
features_df['length'] = features_df['length'] - 3
features_df = features_df.drop(columns=['Unnamed: 0', 'score'])
features_df.head()

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,As,Cs,Gs,Ts,...,pos_-4_GG,pos_-4_GT,pos_-4_TA,pos_-4_TC,pos_-4_TG,pos_-4_TT,dist prim 3',dist prim 5',dist snd 3',dist snd 5'
0,1,20,3,1,4,2,0.25,0.2,0.25,0.3,...,0,0,0,1,0,0,0.524664,0.382736,0.456344,0.421644
1,0,20,1,4,2,1,0.35,0.35,0.25,0.05,...,0,0,0,0,0,0,0.566183,0.511548,0.479786,0.470973
2,1,20,2,2,3,2,0.3,0.2,0.4,0.1,...,0,0,0,0,0,0,0.28077,0.188223,0.296115,0.271189
3,1,20,3,2,2,4,0.35,0.25,0.15,0.25,...,0,0,0,0,0,0,0.565405,0.478117,0.478031,0.460333
4,1,20,2,2,1,1,0.35,0.3,0.2,0.15,...,0,0,0,0,0,0,0.175457,0.12244,0.242645,0.209068


In [3]:
# Bin longest run of each nucleotide
longest = features_df[['longest A', 'longest C', 'longest G', 'longest T']]
longest_max = [8, 8, 9, 9]

for index, nucl in enumerate(['A', 'C', 'G', 'T']):
    for run_length in range(longest_max[index] + 1):
        colname = 'longest_' + nucl + "_" + str(run_length)
        features_df[colname] = longest['longest ' + nucl] == run_length

features_df.head()  

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,As,Cs,Gs,Ts,...,longest_T_0,longest_T_1,longest_T_2,longest_T_3,longest_T_4,longest_T_5,longest_T_6,longest_T_7,longest_T_8,longest_T_9
0,1,20,3,1,4,2,0.25,0.2,0.25,0.3,...,False,False,True,False,False,False,False,False,False,False
1,0,20,1,4,2,1,0.35,0.35,0.25,0.05,...,False,True,False,False,False,False,False,False,False,False
2,1,20,2,2,3,2,0.3,0.2,0.4,0.1,...,False,False,True,False,False,False,False,False,False,False
3,1,20,3,2,2,4,0.35,0.25,0.15,0.25,...,False,False,False,False,True,False,False,False,False,False
4,1,20,2,2,1,1,0.35,0.3,0.2,0.15,...,False,True,False,False,False,False,False,False,False,False


In [4]:
# Bin nucleotide composition
features_df[['A', 'C', 'G', 'T']] = features_df[['As', 'Cs', 'Gs', 'Ts']]
features_df = features_df.drop(columns=['As', 'Cs', 'Gs', 'Ts'])
nucl_comp = features_df[['A', 'C', 'G', 'T', 'GC', 'CA', 'AG']]
crit = np.arange(0.1, 1.1, 0.1)
for index, nucl in enumerate(['A', 'C', 'G', 'T', 'GC', 'CA', 'AG']):
    for index in crit:
        colname = nucl + '_comp_' + str(round(index, 2))
        upper = nucl_comp[nucl] < index
        lower = nucl_comp[nucl] > index - 0.1
        features_df[colname] = upper & lower

features_df.head()  

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,GC,CA,AG,pos_-27_A,...,AG_comp_0.1,AG_comp_0.2,AG_comp_0.3,AG_comp_0.4,AG_comp_0.5,AG_comp_0.6,AG_comp_0.7,AG_comp_0.8,AG_comp_0.9,AG_comp_1.0
0,1,20,3,1,4,2,0.0,0.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
1,0,20,1,4,2,1,0.0,0.052632,0.105263,0,...,False,True,False,False,False,False,False,False,False,False
2,1,20,2,2,3,2,0.0,0.105263,0.157895,0,...,False,True,False,False,False,False,False,False,False,False
3,1,20,3,2,2,4,0.0,0.052632,0.052632,0,...,True,False,False,False,False,False,False,False,False,False
4,1,20,2,2,1,1,0.105263,0.157895,0.157895,0,...,False,True,False,False,False,False,False,False,False,False


In [5]:
# Length Bins
length = features_df[['length']]
max_length = 25
min_length = 18

for leng in range(min_length, max_length + 1):
        colname = 'length_' + str(leng)
        features_df[colname] = length == leng

features_df.head()  

Unnamed: 0,+ve strand,length,longest A,longest C,longest G,longest T,GC,CA,AG,pos_-27_A,...,AG_comp_0.9,AG_comp_1.0,length_18,length_19,length_20,length_21,length_22,length_23,length_24,length_25
0,1,20,3,1,4,2,0.0,0.0,0.0,0,...,False,False,False,False,True,False,False,False,False,False
1,0,20,1,4,2,1,0.0,0.052632,0.105263,0,...,False,False,False,False,True,False,False,False,False,False
2,1,20,2,2,3,2,0.0,0.105263,0.157895,0,...,False,False,False,False,True,False,False,False,False,False
3,1,20,3,2,2,4,0.0,0.052632,0.052632,0,...,False,False,False,False,True,False,False,False,False,False
4,1,20,2,2,1,1,0.105263,0.157895,0.157895,0,...,False,False,False,False,True,False,False,False,False,False


In [6]:
features_df = features_df.drop(columns=['length', 
                          'longest A', 'longest C', 'longest G', 'longest T', 
                          'A', 'C', 'G', 'T',
                          'GC', 'CA', 'AG'])

In [7]:
features_df.head()

Unnamed: 0,+ve strand,pos_-27_A,pos_-27_C,pos_-27_G,pos_-27_T,pos_-26_A,pos_-26_C,pos_-26_G,pos_-26_T,pos_-25_A,...,AG_comp_0.9,AG_comp_1.0,length_18,length_19,length_20,length_21,length_22,length_23,length_24,length_25
0,1,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
1,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
2,1,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
3,1,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
4,1,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False


In [8]:
features_df.to_csv('features_hCRISPRi_v2.1_binned.csv')