In [None]:
import pandas as pd
import numpy as np
from plotnine import *
from tensorflow import keras
import keras.layers as kl
import itertools
import seaborn as sns
import plotnine as p9

from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error,explained_variance_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Metabolic labeling
# Load the feature table (first CSV column becomes the index) and discard
# every row that contains a missing value.
df = pd.read_csv(
    'D:/ML4RG_file/genomic_sequence_plus_features_hl_all_tissues.csv',
    index_col=0,
).dropna()
# The first 49 columns are used as the per-tissue columns throughout this file.
tissue = df.columns.values[:49]

# Chromosomes held out for validation and for testing; all remaining
# chromosomes form the training set.
chrom_val = ['chr2', 'chr3', 'chr4']
chrom_test = ['chr1', 'chr8', 'chr9']

# Row membership masks, computed once and reused for all three index arrays.
in_test = df.chromosome.isin(chrom_test).to_numpy()
in_val = df.chromosome.isin(chrom_val).to_numpy()

# Positional (0-based) row indices of each split.
idx_test = np.flatnonzero(in_test)
idx_val = np.flatnonzero(in_val)
idx_train = np.flatnonzero(~(in_test | in_val))

# get the dataframe for one tissue: its half-life column plus the other feature columns
def get_df_tissue(tissue):
    """Return the per-tissue table used for modelling.

    Selects from the module-level ``df`` the column named ``tissue`` followed
    by the three log-length columns, the three GC-content columns and the 64
    three-letter sequence columns (``AAA`` ... ``TTT``), in that order.
    """
    length_cols = ['log_3_utr_length', 'log_5_utr_length', 'log_cds_length']
    gc_cols = ['gc_content_5_utr', 'gc_content_cds', 'gc_content_3_utr']
    # Every 3-letter word over A, C, G, T in alphabetical order: AAA, AAC, ..., TTT.
    kmer_cols = [''.join(letters) for letters in itertools.product('ACGT', repeat=3)]
    return df[[tissue, *length_cols, *gc_cols, *kmer_cols]]

# do the ridge regression 
def ridge(tissue_list):
    """Fit one ridge regression per tissue and score it on the test chromosomes.

    For every tissue column:

    1. ``alpha`` is selected by 5-fold grid search (negative MSE) on the
       training rows only, using the features that remain after the 64
       three-letter sequence columns are dropped.
    2. A ridge model with that ``alpha`` is fit on all features of the
       training rows, with column scaling.
    3. The model is evaluated on the test rows.

    Rows are split with the module-level ``idx_train`` / ``idx_test``
    positional indices; the validation rows are not used here.

    Parameters
    ----------
    tissue_list : iterable of str
        Names of the target columns in the module-level ``df``.

    Returns
    -------
    list
        ``[msqe_result, explained_variance_score_result]`` where, per tissue
        and in input order, ``msqe_result`` holds the test RMSE expressed as a
        percentage of the range of the test targets, and
        ``explained_variance_score_result`` holds the explained variance score
        on the test rows.
    """
    # Local imports: these names are not among the file-level imports.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Loop-invariant pieces, built once.
    kmer_cols = [''.join(letters) for letters in itertools.product('ACGT', repeat=3)]
    parameters = {'alpha': [1e-3, 1e-2, 1e-1, 1, 10]}

    msqe_result = []
    explained_variance_score_result = []
    for tissue in tissue_list:
        df_tissue = get_df_tissue(tissue)
        X = df_tissue.drop(columns=[tissue])
        y = df_tissue.loc[:, tissue].values

        # Same positional indices for X and y so the rows cannot get out of step.
        X_train, y_train = X.iloc[idx_train], y[idx_train]
        X_test, y_test = X.iloc[idx_test], y[idx_test]

        # Find the alpha for this tissue. Only training rows are used so the
        # test chromosomes do not leak into hyper-parameter selection.
        # NOTE(review): as in the original workflow, alpha is tuned on an
        # unscaled model without the three-letter sequence columns and then
        # applied to a scaled model with all features -- confirm this is intended.
        X_tun = X_train.drop(columns=kmer_cols)
        search = GridSearchCV(Ridge(), parameters,
                              scoring='neg_mean_squared_error', cv=5)
        search.fit(X_tun, y_train)
        best_alpha = search.best_params_['alpha']

        # Train the model with the best alpha. ``Ridge(normalize=True)`` no
        # longer exists in current scikit-learn; scaling with
        # StandardScaler(with_mean=False) and multiplying alpha by the number
        # of training samples reproduces that behaviour.
        model = make_pipeline(
            StandardScaler(with_mean=False),
            Ridge(alpha=best_alpha * len(X_train)),
        )
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # RMSE as a percentage of the observed target range on the test rows.
        rmse = np.sqrt(mean_squared_error(y_test, pred))
        target_range = abs(y_test.max() - y_test.min())
        msqe_result.append(rmse / (target_range / 100))

        # Store the explained variance, as the result name promises.
        explained_variance_score_result.append(
            explained_variance_score(y_test, pred))

    return [msqe_result, explained_variance_score_result]

# Run the per-tissue ridge regression on the first 49 columns of df.
tissue_list = df.columns.values[:49].tolist()
results = ridge(tissue_list)

# ridge() returns two per-tissue lists: results[0] is plotted as
# '%root_mean_sq_err' and results[1] is plotted as 'exp_var_sc'.
mse, xvar = results[0], results[1]

# plot the results
var = {'exp_var_sc': xvar, 'tissue': tissue_list}
m2e = {'%root_mean_sq_err': mse, 'tissue': tissue_list}

pl_var, pl_m2e = pd.DataFrame(var), pd.DataFrame(m2e)


def _tissue_bar_chart(frame, y_col):
    """Build a bar chart of ``y_col`` per tissue with vertical x tick labels."""
    rotated_labels = p9.theme(axis_text_x=p9.element_text(angle=90))
    return p9.ggplot(frame, p9.aes('tissue', y_col)) + p9.geom_col() + rotated_labels


# NOTE(review): the file name says "lasso" although the values come from
# ridge() -- confirm whether the name should be changed.
plot = _tissue_bar_chart(pl_var, 'exp_var_sc')
plot.save(filename="lasso_regression_tuned.jpg")

plot1 = _tissue_bar_chart(pl_m2e, '%root_mean_sq_err')
plot1.save(filename="root_mean_square_error_tuned.jpg")