In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 14:21:08 2022

@author: jamesonblount
"""

# In[]:
# Importing required packages
#Importing basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Importing sklearn modules
from sklearn.metrics import mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve
from sklearn import ensemble, linear_model, neighbors, svm, tree, neural_network
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import svm,model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestClassifier


In [2]:
#Loading the data and checking for missing values
dataset=pd.read_csv('C:/Users/ictinike/Documents/WrayLab/raw_data/x_0011_df_phyloP.csv')
dataset.isnull().sum()

datasetv2 = dataset.dropna(axis=1)
datasetv2.isnull().sum()
# Checking the data set for any NULL values is very essential, as MLAs can not 
# handle NULL values. We have to either eliminate the records with NULL values 
# or replace them with the mean/median of the other values. we can see each of 
# the variables are printed with number of null values. This data set has no null 
# values so all are zero here.


seqnames                  0
start                     0
end                       0
width                     0
strand                    0
                         ..
gene.y                    0
dTSS                      0
PhastCons                 0
PhyloP_primates_score     0
PhyloP_placental_score    0
Length: 90, dtype: int64

In [3]:
# Transforming the categorical variables into numerical
# Instantiate OneHotEncoder
ohe = OneHotEncoder(sparse = False)
ohe.fit_transform(datasetv2[["chromHMM_cat_longest"]])[:5]
datasetv2['chromHMM_cat_longest'].head()
ohe.categories_

[array(['Active Promoter', 'Candidate Strong Enhancer',
        'Candidate Weak Enhancer', 'Distal CTCF/Candidate Insulator',
        'Heterochromatin/Repetitive/Copy Number Variation',
        'Inactive Promoter', 'Low activity proximal to active states',
        'Polycomb repressed', 'Promoter Flanking',
        'Transcription asociated'], dtype=object)]

In [4]:
ohe.fit_transform(datasetv2.chromHMM_cat_longest.values.reshape(-1, 1))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [5]:
one_hot_encoded_data = pd.get_dummies(datasetv2, columns = ['chromHMM_cat_longest','annotation'])
print(one_hot_encoded_data)

      seqnames      start        end  width strand       name  score  \
0         chr1     793301     793692    392      *     chr1.7    653   
1         chr1     793779     794382    604      *     chr1.8    553   
2         chr1     846424     847133    710      *    chr1.17    867   
3         chr1     847379     847941    563      *    chr1.18    544   
4         chr1     848207     849945   1739      *    chr1.19    605   
...        ...        ...        ...    ...    ...        ...    ...   
80348     chrX  155110643  155111395    753      *  chrX.2827   1000   
80349     chrX  155196562  155196941    380      *  chrX.2829    677   
80350     chrX  155227040  155227522    483      *  chrX.2831    619   
80351     chrX  155231134  155231652    519      *  chrX.2832    745   
80352     chrX  155232085  155232595    511      *  chrX.2833    688   

       signalValue  pValue  qValue  ...  annotation_5' UTR  \
0           0.0677    3.87      -1  ...                  0   
1          

In [6]:
# Compare performance of ML between top quartile to bottom quartile of AUC (best)/peak size
bin_labels = ['Lower', 'Midlower','Midupper', 'Upper']
lower_bin = ['Lower']
upper_bin = ['Upper']
datasetv2['score_quart'] = pd.qcut(datasetv2['score'], q = 4, labels = bin_labels)
datasetLower = datasetv2[datasetv2['score_quart'].isin(lower_bin)]
datasetUpper = datasetv2[datasetv2['score_quart'].isin(upper_bin)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datasetv2['score_quart'] = pd.qcut(datasetv2['score'], q = 4, labels = bin_labels)


In [7]:
# here we'll start by using wgCERES_score_nosig as the response vector,
x = datasetv2[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetv2["wgCERES_score_nosig"]
# For classifier ML techniques, i.e. SVM and Random Forest
# y = datasetv2["dhs_0_1_wg"]

In [20]:
# Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)

ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(sparse=False),
                                 ['chromHMM_cat_longest', 'annotation'])])

In [22]:
lm  = LinearRegression()
lm_pipeline = make_pipeline(column_transform, lm)

In [25]:
lm_pipeline.fit(x_train, y_train)
lm_predictions = lm_pipeline.predict(x_test)
print("First 5 LM predictions: ", list(lm_predictions[:5]))

First 5 LM predictions:  [0.34375, 0.28125, 0.24609375, 0.34375, 0.07421875]


array([ 0.34375   ,  0.28125   ,  0.24609375, ...,  0.31640625,
       -0.8671875 ,  0.34375   ])

In [46]:
#x=pd.get_dummies(x, columns = ['chromHMM_cat_longest','annotation'])

In [9]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)

In [10]:
# Application of all Machine Learning methods
MLA = [
    #GLM
    linear_model.LogisticRegressionCV(max_iter=2000),
    linear_model.PassiveAggressiveClassifier(max_iter=2000),
    linear_model.SGDClassifier(max_iter=2000),
    linear_model.Perceptron(max_iter=2000),
    
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    #svm.LinearSVC(max_iter=2000),
    
    #Trees    
    tree.DecisionTreeClassifier(),
  
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #Others
    neural_network.MLPClassifier(hidden_layer_sizes = (512, 256, 128, 64), max_iter=10000),
    #neural_network.MLPRegressor(max_iter=2000),
    ]

In [11]:
#Generating a dataframe to visualize comparison betweeen all algorithms
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)

In [12]:
x_test

Unnamed: 0,DHS_prop_repeat,DHS_prop_GC,DHS_length,n_SNV_Zhou_per_bp,distanceToTSS,zeta.human,zeta.chimp,PP_con,PP_acc,PhastCons,chromHMM_cat_longest,annotation,PhyloP_primates_score
9442,0.229236,0.588040,301,0.000000,12820,9.373772e-01,1.033511e+00,0.918973,0.680151,0.0010,Candidate Strong Enhancer,Intron,-0.027056
17819,0.000000,0.650224,223,0.000000,1162,3.161790e-14,3.428107e-14,0.846055,1.000000,0.0085,Candidate Weak Enhancer,Promoter (1-2kb),0.154713
79682,0.000000,0.585565,859,0.000000,-14597,3.515203e-01,2.898497e-01,0.625180,0.833042,0.0110,Candidate Weak Enhancer,Intron,0.061763
79332,0.042553,0.535461,564,0.000000,-26842,3.047240e-01,8.448791e-01,0.122381,0.994416,0.0020,Distal CTCF/Candidate Insulator,Intron,-0.172231
65221,0.000000,0.620619,485,0.006186,352470,1.169852e+00,8.143365e-01,0.659730,0.759748,0.0000,Polycomb repressed,3' UTR,-0.031473
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67558,0.000000,0.521739,391,0.000000,10114,3.499595e+00,3.847540e-01,0.999994,0.000255,0.0000,Candidate Strong Enhancer,Intron,-0.027879
17200,0.133739,0.462006,329,0.003040,-78291,1.856400e-14,2.788474e-01,0.792861,1.000000,1.0000,Candidate Weak Enhancer,Intron,0.327204
61376,1.000000,0.446108,334,0.000000,-160135,1.049991e+00,7.475919e-01,0.906381,0.467250,0.0010,Candidate Strong Enhancer,Distal Intergenic,-0.041890
47714,0.478114,0.441077,297,0.000000,-496,4.586345e-01,9.092983e-01,0.644118,0.897621,0.0010,Active Promoter,Promoter (<=1kb),-0.080085


In [13]:
lm  = LinearRegression()

In [14]:
lm.fit(x_train, y_train).predict(x_test)

ValueError: could not convert string to float: 'Candidate Strong Enhancer'

In [58]:
predicted = lm.fit(x_train, y_train).predict(x_test)

In [59]:
    fp, tp, th = roc_curve(y_test, predicted)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'MLA used'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(x_train, y_train), 4)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(x_test, y_test), 4)
    MLA_compare.loc[row_index, 'Precission'] = precision_score(y_test, predicted)
    MLA_compare.loc[row_index, 'Recall'] = recall_score(y_test, predicted)
    MLA_compare.loc[row_index, 'AUC'] = auc(fp, tp)

ValueError: continuous format is not supported

In [11]:
row_index = 0
for alg in MLA:  
    
    predicted = alg.fit(x_train, y_train).predict(x_test)
    fp, tp, th = roc_curve(y_test, predicted)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'MLA used'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(x_train, y_train), 4)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(x_test, y_test), 4)
    MLA_compare.loc[row_index, 'Precission'] = precision_score(y_test, predicted)
    MLA_compare.loc[row_index, 'Recall'] = recall_score(y_test, predicted)
    MLA_compare.loc[row_index, 'AUC'] = auc(fp, tp)

    row_index+=1

ValueError: could not convert string to float: 'Candidate Strong Enhancer'

In [24]:
# Building the rest of the pipeline
# Instantiate pipeline with linear regression
lm  = LinearRegression()
lm_pipeline = make_pipeline(column_transform, lm)

# Instantiate pipeline for SVM
sv = SVC()
sv_pipeline = make_pipeline(column_transform, sv)

# Instantiate pipeline with gradient boosting
gbm = GradientBoostingRegressor()
gbm_pipeline = make_pipeline(column_transform, gbm)

# Instantiate pipeline with logistic regression
lr = LogisticRegression()
lr_pipeline = make_pipeline(column_transform, lr)

In [16]:
# Fit pipeline to training set and make predictions on test set

lm_pipeline.fit(x_train, y_train)
lm_predictions = lm_pipeline.predict(x_test)
print("First 5 LM predictions: ", list(lm_predictions[:5]))

gbm_pipeline.fit(x_train, y_train)
gbm_predictions = gbm_pipeline.predict(x_test)
print("First 5 GBM predictions: ", list(gbm_predictions[:5]))

First 5 LM predictions:  [0.34375, 0.28125, 0.24609375, 0.34375, 0.07421875]
First 5 GBM predictions:  [0.3255569131908791, 0.17682304921118194, 0.2181580050764166, 0.31647069859187477, 0.06331650004906973]


In [None]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lm_mae = mean_absolute_error(lm_predictions, y_test)
lm_rmse = np.sqrt(mean_squared_error(lm_predictions, y_test))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))

gbm_mae = mean_absolute_error(gbm_predictions, y_test)
gbm_rmse = np.sqrt(mean_squared_error(gbm_predictions, y_test))
print("GBM MAE: {:.2f}".format(round(gbm_mae, 2)))
print("GBM RMSE: {:.2f}".format(round(gbm_rmse, 2)))

In [None]:
# here we'll use dhs_0_1_wg as the response vector for the classifiers
x = datasetv2[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetv2["dhs_0_1_wg"]

In [None]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)
# In[]:

lr_pipeline.fit(x_train, y_train)
lr_predictions = lr_pipeline.predict(x_test)
print("First 5 LR predictions: ", list(lr_predictions[:5]))

sv_pipeline.fit(x_train, y_train)
sv_predictions = sv_pipeline.predict(x_test)
print("First 5 SVM predictions: ", list(sv_predictions[:5]))

#rf_pipeline.fit(x_train, y_train)
#rf_predictions = rf_pipeline.predict(x_test)
#print("First 5 RF predictions: ", list(rf_predictions[:5]))
# To use random forest, need binary outcome

# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lr_mae = mean_absolute_error(lr_predictions, y_test)
lr_rmse = np.sqrt(mean_squared_error(lr_predictions, y_test))
print("LR MAE: {:.2f}".format(round(lr_mae, 2)))
print("LR RMSE: {:.2f}".format(round(lr_rmse, 2)))

sv_mae = mean_absolute_error(sv_predictions, y_test)
sv_rmse = np.sqrt(mean_squared_error(sv_predictions, y_test))
print("SVM MAE: {:.2f}".format(round(sv_mae, 2)))
print("SVM RMSE: {:.2f}".format(round(sv_rmse, 2)))

# In[]:
# Performing this same pipeline but using the extreme subsets of "score"
# In[]:
    # here we'll start by using wgCERES_score_nosig as the response vector,
x = datasetLower[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetLower["wgCERES_score_nosig"]
# For classifier ML techniques, i.e. SVM and Random Forest
# y = datasetLower["dhs_0_1_wg"]

# In[]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)

# In[]:
# Building the rest of the pipeline
# Instantiate pipeline with linear regression
lm  = LinearRegression()
lm_pipeline = make_pipeline(column_transform, lm)

# In[]:
# Instantiate pipeline with gradient boosting
gbm = GradientBoostingRegressor()
gbm_pipeline = make_pipeline(column_transform, gbm)

# In[]:
# Instantiate pipeline with logistic regression
lr = LogisticRegression()
lr_pipeline = make_pipeline(column_transform, lr)

# In[]:
# Instantiate pipeline for SVM
sv = SVC()
sv_pipeline = make_pipeline(column_transform, sv)

# In[]:
# Fit pipeline to training set and make predictions on test set

lm_pipeline.fit(x_train, y_train)
lm_predictions = lm_pipeline.predict(x_test)
print("First 5 LM predictions: ", list(lm_predictions[:5]))

gbm_pipeline.fit(x_train, y_train)
gbm_predictions = gbm_pipeline.predict(x_test)
print("First 5 GBM predictions: ", list(gbm_predictions[:5]))


# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lm_mae = mean_absolute_error(lm_predictions, y_test)
lm_rmse = np.sqrt(mean_squared_error(lm_predictions, y_test))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))

gbm_mae = mean_absolute_error(gbm_predictions, y_test)
gbm_rmse = np.sqrt(mean_squared_error(gbm_predictions, y_test))
print("GBM MAE: {:.2f}".format(round(gbm_mae, 2)))
print("GBM RMSE: {:.2f}".format(round(gbm_rmse, 2)))


# In[]:
    # here we'll use dhs_0_1_wg as the response vector for the classifiers
x = datasetLower[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetLower["dhs_0_1_wg"]

# In[]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)
# In[]:

lr_pipeline.fit(x_train, y_train)
lr_predictions = lr_pipeline.predict(x_test)
print("First 5 LR predictions: ", list(lr_predictions[:5]))

sv_pipeline.fit(x_train, y_train)
sv_predictions = sv_pipeline.predict(x_test)
print("First 5 SVM predictions: ", list(sv_predictions[:5]))

#rf_pipeline.fit(x_train, y_train)
#rf_predictions = rf_pipeline.predict(x_test)
#print("First 5 RF predictions: ", list(rf_predictions[:5]))
# To use random forest, need binary outcome

# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lr_mae = mean_absolute_error(lr_predictions, y_test)
lr_rmse = np.sqrt(mean_squared_error(lr_predictions, y_test))
print("LR MAE: {:.2f}".format(round(lr_mae, 2)))
print("LR RMSE: {:.2f}".format(round(lr_rmse, 2)))

sv_mae = mean_absolute_error(sv_predictions, y_test)
sv_rmse = np.sqrt(mean_squared_error(sv_predictions, y_test))
print("SVM MAE: {:.2f}".format(round(sv_mae, 2)))
print("SVM RMSE: {:.2f}".format(round(sv_rmse, 2)))

# In[]:
    # here we'll start by using wgCERES_score_nosig as the response vector,
x = datasetUpper[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetUpper["wgCERES_score_nosig"]
# For classifier ML techniques, i.e. SVM and Random Forest
# y = datasetUpper["dhs_0_1_wg"]

# In[]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)

# In[]:
# Building the rest of the pipeline
# Instantiate pipeline with linear regression
lm  = LinearRegression()
lm_pipeline = make_pipeline(column_transform, lm)

# In[]:
# Instantiate pipeline with gradient boosting
gbm = GradientBoostingRegressor()
gbm_pipeline = make_pipeline(column_transform, gbm)

# In[]:
# Instantiate pipeline with logistic regression
lr = LogisticRegression()
lr_pipeline = make_pipeline(column_transform, lr)

# In[]:
# Instantiate pipeline for SVM
sv = SVC()
sv_pipeline = make_pipeline(column_transform, sv)

# In[]:
# Fit pipeline to training set and make predictions on test set

lm_pipeline.fit(x_train, y_train)
lm_predictions = lm_pipeline.predict(x_test)
print("First 5 LM predictions: ", list(lm_predictions[:5]))

gbm_pipeline.fit(x_train, y_train)
gbm_predictions = gbm_pipeline.predict(x_test)
print("First 5 GBM predictions: ", list(gbm_predictions[:5]))


# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lm_mae = mean_absolute_error(lm_predictions, y_test)
lm_rmse = np.sqrt(mean_squared_error(lm_predictions, y_test))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))

gbm_mae = mean_absolute_error(gbm_predictions, y_test)
gbm_rmse = np.sqrt(mean_squared_error(gbm_predictions, y_test))
print("GBM MAE: {:.2f}".format(round(gbm_mae, 2)))
print("GBM RMSE: {:.2f}".format(round(gbm_rmse, 2)))


# In[]:
    # here we'll use dhs_0_1_wg as the response vector for the classifiers
x = datasetUpper[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetUpper["dhs_0_1_wg"]

# In[]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)
# In[]:

lr_pipeline.fit(x_train, y_train)
lr_predictions = lr_pipeline.predict(x_test)
print("First 5 LR predictions: ", list(lr_predictions[:5]))

sv_pipeline.fit(x_train, y_train)
sv_predictions = sv_pipeline.predict(x_test)
print("First 5 SVM predictions: ", list(sv_predictions[:5]))

#rf_pipeline.fit(x_train, y_train)
#rf_predictions = rf_pipeline.predict(x_test)
#print("First 5 RF predictions: ", list(rf_predictions[:5]))
# To use random forest, need binary outcome

# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lr_mae = mean_absolute_error(lr_predictions, y_test)
lr_rmse = np.sqrt(mean_squared_error(lr_predictions, y_test))
print("LR MAE: {:.2f}".format(round(lr_mae, 2)))
print("LR RMSE: {:.2f}".format(round(lr_rmse, 2)))

sv_mae = mean_absolute_error(sv_predictions, y_test)
sv_rmse = np.sqrt(mean_squared_error(sv_predictions, y_test))
print("SVM MAE: {:.2f}".format(round(sv_mae, 2)))
print("SVM RMSE: {:.2f}".format(round(sv_rmse, 2)))

# In[]:
    # After filtering for OCRs that are only present within certain TADs,
    # reading in that dataset here and testing the harness again
dataset=pd.read_csv('/Users/jamesonblount/Documents/Wray_Rotation/Carl/X_0012/data/OCRs_inTADs.csv')
dataset.isnull().sum()

datasetv2 = dataset.dropna(axis=1)
datasetv2.isnull().sum()
# Checking the data set for any NULL values is very essential, as MLAs can not 
# handle NULL values. We have to either eliminate the records with NULL values 
# or replace them with the mean/median of the other values. we can see each of 
# the variables are printed with number of null values. This data set has no null 
# values so all are zero here.
# In[]:
    # here we'll start by using wgCERES_score_nosig as the response vector,
x = datasetv2[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetv2["wgCERES_score_nosig"]
# For classifier ML techniques, i.e. SVM and Random Forest
# y = datasetv2["dhs_0_1_wg"]

# In[]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)

# In[]:
# Building the rest of the pipeline
# Instantiate pipeline with linear regression
lm  = LinearRegression()
lm_pipeline = make_pipeline(column_transform, lm)

# In[]:
# Instantiate pipeline with gradient boosting
gbm = GradientBoostingRegressor()
gbm_pipeline = make_pipeline(column_transform, gbm)

# In[]:
# Instantiate pipeline with logistic regression
lr = LogisticRegression()
lr_pipeline = make_pipeline(column_transform, lr)

# In[]:
# Instantiate pipeline for SVM
sv = SVC()
sv_pipeline = make_pipeline(column_transform, sv)

# In[]:
# Fit pipeline to training set and make predictions on test set

lm_pipeline.fit(x_train, y_train)
lm_predictions = lm_pipeline.predict(x_test)
print("First 5 LM predictions: ", list(lm_predictions[:5]))

gbm_pipeline.fit(x_train, y_train)
gbm_predictions = gbm_pipeline.predict(x_test)
print("First 5 GBM predictions: ", list(gbm_predictions[:5]))


# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lm_mae = mean_absolute_error(lm_predictions, y_test)
lm_rmse = np.sqrt(mean_squared_error(lm_predictions, y_test))
print("LM MAE: {:.2f}".format(round(lm_mae, 2)))
print("LM RMSE: {:.2f}".format(round(lm_rmse, 2)))

gbm_mae = mean_absolute_error(gbm_predictions, y_test)
gbm_rmse = np.sqrt(mean_squared_error(gbm_predictions, y_test))
print("GBM MAE: {:.2f}".format(round(gbm_mae, 2)))
print("GBM RMSE: {:.2f}".format(round(gbm_rmse, 2)))


# In[]:
    # here we'll use dhs_0_1_wg as the response vector for the classifiers
x = datasetv2[["DHS_prop_repeat", 
                    "DHS_prop_GC", "DHS_length", "n_SNV_Zhou_per_bp", 
                    "distanceToTSS", "zeta.human", "zeta.chimp", "PP_con", "PP_acc", 
                    "PhastCons",
                    "chromHMM_cat_longest", 
                    "annotation", "PhyloP_primates_score"]]
y = datasetv2["dhs_0_1_wg"]

# In[]:
    # Make column transformer
column_transform = []
column_transform = make_column_transformer(
    (ohe, ['chromHMM_cat_longest','annotation']))

#Apply column transformer to predictor variables
column_transform.fit(x)


# In[]:
# Splitting train and split data
# The test data set size is 20% of the total records. This test data will not 
# be used in model training and work as an independent test data.
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)
# In[]:

lr_pipeline.fit(x_train, y_train)
lr_predictions = lr_pipeline.predict(x_test)
print("First 5 LR predictions: ", list(lr_predictions[:5]))

sv_pipeline.fit(x_train, y_train)
sv_predictions = sv_pipeline.predict(x_test)
print("First 5 SVM predictions: ", list(sv_predictions[:5]))

#rf_pipeline.fit(x_train, y_train)
#rf_predictions = rf_pipeline.predict(x_test)
#print("First 5 RF predictions: ", list(rf_predictions[:5]))
# To use random forest, need binary outcome

# In[]:
# With predictions ready from the two pipelines, we can proceed to evaluate the 
# accuracy of these predictions using mean absolute error (MAE) and mean squared 
# error (RMSE).
# Calculate mean square error and root mean squared error

lr_mae = mean_absolute_error(lr_predictions, y_test)
lr_rmse = np.sqrt(mean_squared_error(lr_predictions, y_test))
print("LR MAE: {:.2f}".format(round(lr_mae, 2)))
print("LR RMSE: {:.2f}".format(round(lr_rmse, 2)))

sv_mae = mean_absolute_error(sv_predictions, y_test)
sv_rmse = np.sqrt(mean_squared_error(sv_predictions, y_test))
print("SVM MAE: {:.2f}".format(round(sv_mae, 2)))
print("SVM RMSE: {:.2f}".format(round(sv_rmse, 2)))