In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
import math
%matplotlib inline
import matplotlib as plt
from sklearn.pipeline import Pipeline


In [3]:
# Read the two labelled example tables; later cells use the "seq_new" column of each frame.
positive_with_seq = pd.read_csv("./data/positive_examples.csv")
negative_with_seq = pd.read_csv("./data/negative_examples.csv")


In [4]:
# Flag the positive sequences that contain the character "N", then show the row positions of the flagged entries.
n = positive_with_seq["seq_new"].apply(lambda seq: "N" in seq)
np.where(n)


(array([ 15218,  15223,  15224, ..., 153676, 153677, 153678]),)

In [7]:
# One-hot lookup table: the i-th symbol of "ACGTN" maps to a length-5 list with a 1 at index i and 0 elsewhere.
bases = {
    symbol: [int(slot == position) for slot in range(5)]
    for position, symbol in enumerate("ACGTN")
}
bases




{'A': [1, 0, 0, 0, 0],
 'C': [0, 1, 0, 0, 0],
 'G': [0, 0, 1, 0, 0],
 'T': [0, 0, 0, 1, 0],
 'N': [0, 0, 0, 0, 1]}

In [9]:
def transform(df, column="seq_new", encoding=None):
    """One-hot encode every sequence stored in a DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one sequence string per row in ``column``.
    column : str, optional
        Name of the column to encode. Defaults to ``"seq_new"``.
    encoding : mapping, optional
        Maps each sequence symbol to its encoded vector. When omitted, the
        notebook-level ``bases`` table is used.

    Returns
    -------
    list
        One entry per row, in row order; each entry is a list holding the
        encoded vector of every symbol in that row's sequence. The vectors
        are the objects stored in the mapping, not copies.

    Raises
    ------
    KeyError
        If a sequence contains a symbol that is missing from the mapping.
    """
    table = bases if encoding is None else encoding
    # Build the nested list directly instead of calling Series.apply only for
    # the side effect of appending to an outer list.
    return [[table[symbol] for symbol in sequence] for sequence in df[column]]

In [10]:
# Encode both example sets, then stack them along the first axis with the positive rows first.
pos_examples = transform(positive_with_seq)
neg_examples = transform(negative_with_seq)
features = np.concatenate([pos_examples, neg_examples])
features[0][0]

array([1, 0, 0, 0, 0])

In [None]:
# Labels follow the same order as the feature rows: a [1] for every positive example, then a [0] for every negative one.
Y_pos = len(pos_examples) * [[1]]
Y_neg = len(neg_examples) * [[0]]
labels = np.concatenate([Y_pos, Y_neg])


In [14]:
# Show the dimensions of the combined feature array.
features.shape


In [15]:
# Flatten each example into a single feature row. The row count is taken from
# the array itself and -1 lets NumPy infer the row width, so this cell keeps
# working when the number of examples or the sequence length changes.
features_reshaped = features.reshape(len(features), -1)



In [16]:
from sklearn import model_selection

# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features_reshaped,
    labels,
    test_size=0.33,
    random_state=42,
)


In [17]:
# Baseline: fit an ordinary least-squares model to the 0/1 labels.
# NOTE(review): the cross-validated RMSE recorded further down reaches ~1e9 on
# most folds even though the targets are only 0 or 1, so this fit looks
# numerically unstable -- possibly because the one-hot columns are collinear.
# TODO confirm, and consider a regularised or classification model instead.

from sklearn.linear_model import LinearRegression

lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
# display all scores in one go

def display_scores(scores):
    """Print a three-line summary of an array of scores.

    Prints the raw scores, then their mean rounded to the nearest integer,
    then their standard deviation (unrounded). ``scores`` must provide
    ``mean()`` and ``std()`` methods, as a NumPy array does. Returns ``None``.
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", round(average))
    spread = scores.std()
    print("Standard deviation:", spread)

In [19]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split. The scorer returns negated
# mean squared errors, so flip the sign before taking the square root to get RMSE.
lr_scores = cross_val_score(
    lr_clf,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
lr_rmse_scores = np.sqrt(-lr_scores)
display_scores(lr_rmse_scores)

Scores: [  2.58736921e-01   4.31096977e+09   3.37093357e+09   1.11212652e+09
   4.42473422e+08   1.83545108e+09   2.63485935e+09   1.76787665e+07
   1.34292322e+09   5.26210346e+09]
Mean: 2032951916.0
Standard deviation: 1727894073.6
