In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
import math
%matplotlib inline
import matplotlib as plt
from sklearn.pipeline import Pipeline


In [3]:
negative_with_seq = pd.read_csv("./data/negative_examples.csv")
positive_with_seq = pd.read_csv("./data/positive_examples.csv")


In [4]:
n = positive_with_seq["seq_new"].apply(lambda x: True if "N" in x else False)
np.where(n == True)


(array([ 15218,  15223,  15224, ..., 153676, 153677, 153678]),)

In [5]:
bases = dict(zip("ACGTN", [[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0],[0,0,0,1,0],[0,0,0,0,1]]))
bases 




{'A': [1, 0, 0, 0, 0],
 'C': [0, 1, 0, 0, 0],
 'G': [0, 0, 1, 0, 0],
 'T': [0, 0, 0, 1, 0],
 'N': [0, 0, 0, 0, 1]}

In [6]:
def transform(df):
    values = []
    
    def encode(row):
        result = []
        for base in row:
            result.append(bases[base])
            
        values.append(result)
        
    df["seq_new"].apply(encode)
    
    return values

In [9]:
neg_examples = transform(negative_with_seq)
pos_examples = transform(positive_with_seq)
features = np.concatenate((pos_examples, neg_examples), axis=0)
features[0][0]

array([1, 0, 0, 0, 0])

In [11]:
Y_pos = [[1]] * len(pos_examples)
Y_neg = [[0]] * len(neg_examples)
labels = np.concatenate((Y_pos, Y_neg), axis=0)


In [12]:
features.shape


(166348, 400, 5)

In [13]:
features_reshaped = features.reshape([166348, 2000])



In [14]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(features_reshaped,
                                                    labels,
                                                    test_size=0.33,
                                                    random_state=42)


In [14]:
# test it with linear regression

from sklearn.linear_model import LinearRegression

lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
# display all scores in one go

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", round(scores.mean()))
    print("Standard deviation:", scores.std())

In [19]:
from sklearn.model_selection import cross_val_score

lr_scores = cross_val_score(lr_clf, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
lr_rmse_scores = np.sqrt(-lr_scores)
display_scores(lr_rmse_scores)

Scores: [  2.58736921e-01   4.31096977e+09   3.37093357e+09   1.11212652e+09
   4.42473422e+08   1.83545108e+09   2.63485935e+09   1.76787665e+07
   1.34292322e+09   5.26210346e+09]
Mean: 2032951916.0
Standard deviation: 1727894073.6


In [None]:
from sklearn.metrics import mean_squared_error

lr_predictions = lr_clf.predict(X_train)
lr_mse = mean_squared_error(y_train, lr_predictions)
lr_smre = np.sqrt(lr_mse)
lr_smre


In [40]:
# predict for test with linear regression
# not working because it is a classification problem

y_predictions = lr_clf.predict(X_test)
y_test_unidimensional = np.reshape(y_test, -1)
y_predictions_unidimensional = np.reshape(y_predictions, -1)
linear_results = pd.DataFrame(
    {
        "predictions": y_predictions_unidimensional,
        "truth": y_test_unidimensional
    }
)
linear_results.to_csv("./predictions/linear_results.csv", index=False)

In [17]:
# test it with logistic regression

from sklearn.linear_model import LogisticRegression

lg_clf = LogisticRegression()
lg_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
from sklearn.model_selection import cross_val_score

lg_scores = cross_val_score(lg_clf, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
lg_rmse_scores = np.sqrt(-lg_scores)
display_scores(lg_rmse_scores)

  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


Scores: [ 0.26790787  0.26790787  0.26807526  0.26790787  0.26791989  0.26791989
  0.26791989  0.26791989  0.26791989  0.2677644 ]
Mean: 0.0
Standard deviation: 6.97277727802e-05


In [19]:
from sklearn.metrics import mean_squared_error

lg_predictions = lg_clf.predict(X_train)
lg_mse = mean_squared_error(y_train, lg_predictions)
lg_smre = np.sqrt(lg_mse)
lg_smre


0.2678827930567016

In [21]:
# predict for test with logistic regression

y_log_predictions = lg_clf.predict(X_test)
y_log_test_unidimensional = np.reshape(y_test, -1)
y_log_predictions_unidimensional = np.reshape(y_log_predictions, -1)
logistic_results = pd.DataFrame(
    {
        "predictions": y_log_predictions_unidimensional,
        "truth": y_log_test_unidimensional
    }
)
logistic_results.to_csv("./predictions/logistic_results.csv", index=False)

In [22]:
logistic_df = pd.read_csv("./predictions/logistic_results.csv")
logistic_df.head()

Unnamed: 0,predictions,truth
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [24]:
logistic_df["predictions"].unique()

array([1])

In [26]:
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_log_test_unidimensional, y_log_predictions_unidimensional)
print(precision)
print(recall)
print(thresholds)

[ 0.93068585  1.        ]
[ 1.  0.]
[1]


In [27]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_log_test_unidimensional, y_log_predictions_unidimensional, pos_label=1)
print(fpr)
print(tpr)
print(thresholds)

[ 0.  1.]
[ 0.  1.]
[2 1]


In [28]:
# extreme gradient boosting

from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [29]:
# xgb with test
xgb_predictions = xgb.predict(X_test)
xgb_results = pd.DataFrame(
    {
        "predictions": xgb_predictions,
        "truth": y_log_test_unidimensional
    }
)
xgb_results.to_csv("./predictions/xboost_results.csv", index=False)



In [30]:
# extreme gradient boosting

from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [31]:
# xgb with test
xgb_predictions = xgb.predict(X_test)
xgb_results = pd.DataFrame(
    {
        "predictions": xgb_predictions,
        "truth": y_log_test_unidimensional
    }
)
xgb_results.to_csv("./predictions/xboost_results_classifier.csv", index=False)



In [37]:
len(neg_examples)

11804

In [40]:
len(pos_examples) / len(neg_examples)

13.092511013215859

In [42]:
neg_examples_duplicated = neg_examples * 13
neg_examples_duplicated[0][0]

[0, 0, 0, 1, 0]

In [43]:
neg_examples[0][0]

[0, 0, 0, 1, 0]

In [47]:
features_duplicated = np.concatenate((pos_examples, neg_examples_duplicated), axis=0)


In [49]:
Y_pos = [[1]] * len(pos_examples)
Y_neg_d = [[0]] * len(neg_examples_duplicated)
labels_d = np.concatenate((Y_pos, Y_neg_d), axis=0)
