In [1]:
#Packages
import sys
import math
import matplotlib.pyplot as plt
import json
import csv
import numpy as np
import pandas as pd
import time
import re
import random
from tqdm import tqdm
import os
import multiprocessing as mp
import prince

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import loguniform
from skopt import BayesSearchCV
from skopt.plots import plot_objective, plot_histogram
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import ShuffleSplit
from sklearn import linear_model

# Load and Prepare Data

In [2]:
#Prepare USDA and Phenol Explorer data
#Read the USPX feature table, keep only rows with a non-zero 'value', and renumber them 0..n-1
flq_data=pd.read_csv('C:\\Users\\Sebekington\\Documents\\Python Scripts\\GCP\\Concentration Prediction Platform\\USPX_features.csv')
flq_training=flq_data.loc[flq_data['value'].ne(0)].reset_index(drop=True)

#Split the filtered table into column groups (all groups share flq_training's row order)
RespU=flq_training[['row_id','logvalue']] #Response values
idsU=flq_training[['row_id','name','InChIKey']] #Chemical identifiers
Slbl=flq_training[['row_id','StratLabel','StratVariable','Cvariable','C_Label','F_Label']] #Labels for stratifying folds
propU=flq_training[['MW','C','logP','logS','HBI','NCA','NPSA','C_Label']] #Property features + compound label
ptU=flq_training[['class','order','family','genus','F_Label']] #Phylogenetic features + food label

In [3]:
#Prepare Metabolon Data
#Read the mass-spec feature table, keep only rows with a non-zero 'MS_peak', and renumber them 0..n-1
Mbn_data=pd.read_csv('C:\\Users\\Sebekington\\Documents\\Python Scripts\\GCP\\Concentration Prediction Platform\\MassSpec_features.csv')
flq_Mbn=Mbn_data.loc[Mbn_data['MS_peak'].ne(0)].reset_index(drop=True)

#Column groups, all in flq_Mbn's row order
RespM=flq_Mbn[['row_id','MS_peak']] #Response values
idsM=flq_Mbn[['row_id','name','InChIKey']] #Chemical identifiers
propM=flq_Mbn[['MW','C','logP','logS','HBI','NCA','NPSA','C_Label']] #Property features + compound label
ptM=flq_Mbn[['class','order','family','genus','F_Label']] #Phylogenetic features + food label

In [4]:
#Prepare FoodMine Chemical Properties
#Read the FoodMine feature table and keep only rows with a non-zero 'MS_peak'
FM_data=pd.read_csv('C:\\Users\\Sebekington\\Documents\\Python Scripts\\GCP\\Concentration Prediction Platform\\FoodMine_features.csv')
flq_FM=FM_data.loc[FM_data['MS_peak'].ne(0)]

#Remove the food / taxonomy columns, the food ids and the peak area itself
flqF=flq_FM.drop(columns=['food','Phylum','class','order','family','genus','species','fd_id','fd-cp_id','MS_peak'])

idsF=flqF[['row_id','COMP ID','InChIKey']] #Chemical identifiers
#Property features: one row per distinct property row (original row labels are kept here)
propF=flqF[['MW','C','logP','logS','HBI','NCA','NPSA','C_Label']].drop_duplicates()
#Output matrix: one row per distinct 'COMP ID', renumbered 0..n-1
RespF=flqF[['COMP ID']].drop_duplicates().reset_index(drop=True)

# Find Property and Phylogenetic Component Variables

In [5]:
#Pool the chemical property rows of the three datasets and keep each distinct row once
PropAll=pd.concat([propU,propM,propF]).drop_duplicates().reset_index(drop=True)
#Compound labels, row-aligned with PropAll; the label column itself is not a PCA input
Clbl=PropAll['C_Label'].to_frame()
PropAll=PropAll.drop(columns=['C_Label'])

#Pool the phylogenetic rows of the USPX and Metabolon datasets and keep each distinct row once
ptAll=pd.concat([ptU,ptM]).drop_duplicates().reset_index(drop=True)
#Food labels, row-aligned with ptAll; the label column itself is not an MCA input
Flbl=ptAll['F_Label'].to_frame()
ptAll=ptAll.drop(columns=['F_Label'])

In [6]:
#Find Principal Components of the property features for the merged dataset
#Seven components are requested; rows of pcc_df are in the same order as PropAll / Clbl
pca=PCA(n_components=7)
pc_pa=pca.fit_transform(PropAll)
pcc_df=pd.DataFrame(data=pc_pa,columns=['PCC%d' % k for k in range(1,8)])
print('prop explained:',pca.explained_variance_ratio_)

#Find Components of the phylogenetic features for the merged dataset (MCA, 13 components)
mca=prince.MCA(n_components=13)
mca.fit(ptAll)
pc_pt=mca.transform(ptAll)
pc_pt.columns=['PCP%d' % k for k in range(1,14)]
pcp_df=pc_pt #Same object as pc_pt, under the name used by later cells
#The first print shows the total inertia under the 'pt explained:' label;
#the second shows the sum of the per-component explained inertia
print('pt explained:',mca.total_inertia_)
print(sum(mca.explained_inertia_))

prop explained: [9.90923865e-01 8.51286465e-03 4.95454198e-04 4.81692882e-05
 1.65587810e-05 2.23033207e-06 8.57489030e-07]
pt explained: 72.0
0.14350632637131425


In [64]:
#Create a USDA Phenol Explorer Training Dataset from the PCA and MCA analysis components
#Each row of flq_training receives the first four property components (PCC1-PCC4) of its
#compound and all thirteen phylogenetic components (PCP1-PCP13) of its food, looked up by label.
pcc_cols=['PCC1','PCC2','PCC3','PCC4']
pcp_cols=['PCP'+str(k) for k in range(1,14)]

#Map each label to the row of its FIRST occurrence in the merged label tables
c_row={}
for row,label in zip(Clbl.index,Clbl['C_Label']):
    c_row.setdefault(label,row)
f_row={}
for row,label in zip(Flbl.index,Flbl['F_Label']):
    f_row.setdefault(label,row)

#Component row for every training row, in flq_training order (KeyError if a label is unknown)
c_rows=[c_row[label] for label in flq_training['C_Label']]
f_rows=[f_row[label] for label in flq_training['F_Label']]

#Gather all rows at once into a float64 frame with a 0..n-1 index. No rows are dropped,
#so USPX stays positionally aligned with RespU and Slbl.
USPX=pd.DataFrame(
    np.hstack([pcc_df.loc[c_rows,pcc_cols].to_numpy(dtype='float64'),
               pcp_df.loc[f_rows,pcp_cols].to_numpy(dtype='float64')]),
    columns=pcc_cols+pcp_cols)

In [65]:
#Create a MBN dataset from the PCA and MCA analysis components
#Each row of flq_Mbn receives the first four property components (PCC1-PCC4) of its compound
#and all thirteen phylogenetic components (PCP1-PCP13) of its food, looked up by label.
#The labels are read from flq_Mbn itself, so every Metabolon row gets its own features.
pcc_cols=['PCC1','PCC2','PCC3','PCC4']
pcp_cols=['PCP'+str(k) for k in range(1,14)]

#Map each label to the row of its FIRST occurrence in the merged label tables
c_row={}
for row,label in zip(Clbl.index,Clbl['C_Label']):
    c_row.setdefault(label,row)
f_row={}
for row,label in zip(Flbl.index,Flbl['F_Label']):
    f_row.setdefault(label,row)

#Component row for every Metabolon row, in flq_Mbn order (KeyError if a label is unknown)
c_rows=[c_row[label] for label in flq_Mbn['C_Label']]
f_rows=[f_row[label] for label in flq_Mbn['F_Label']]

#Gather all rows at once into a float64 frame with a 0..n-1 index. No rows are dropped,
#so MBN stays positionally aligned with RespM.
MBN=pd.DataFrame(
    np.hstack([pcc_df.loc[c_rows,pcc_cols].to_numpy(dtype='float64'),
               pcp_df.loc[f_rows,pcp_cols].to_numpy(dtype='float64')]),
    columns=pcc_cols+pcp_cols)

In [66]:
#Create a FoodMine (FM) dataset from the PCA and MCA analysis components using USPX food list and FM property list
#FM holds one feature row for every (FoodMine compound, USPX food) pair, ordered compound-major:
#all foods for the first compound, then all foods for the second compound, and so on.
#RespF2 holds the matching 'COMP ID' for every FM row, in the same order.
ptU2=ptU.drop_duplicates().reset_index(drop=True) #Unique foods in USPX
propF=propF.reset_index(drop=True)

pcc_cols=['PCC1','PCC2','PCC3','PCC4']
pcp_cols=['PCP'+str(k) for k in range(1,14)]

#FM rows come from propF and RespF2 rows come from RespF; the two are paired purely by
#position, so they must describe the same number of compounds.
if len(RespF)!=len(propF):
    raise ValueError('FoodMine: %d unique COMP IDs but %d unique property rows; predictions cannot be aligned'
                     % (len(RespF),len(propF)))

#Map each label to the row of its FIRST occurrence in the merged label tables
c_row={}
for row,label in zip(Clbl.index,Clbl['C_Label']):
    c_row.setdefault(label,row)
f_row={}
for row,label in zip(Flbl.index,Flbl['F_Label']):
    f_row.setdefault(label,row)

#Component rows for every FoodMine compound and every USPX food (KeyError if a label is unknown)
comp_rows=[c_row[label] for label in propF['C_Label']]
food_rows=[f_row[label] for label in ptU2['F_Label']]
comp_part=pcc_df.loc[comp_rows,pcc_cols].to_numpy(dtype='float64')
food_part=pcp_df.loc[food_rows,pcp_cols].to_numpy(dtype='float64')
n_comp,n_food=len(comp_rows),len(food_rows)

#Repeat each compound row once per food and tile the food block once per compound
FM=pd.DataFrame(
    np.hstack([np.repeat(comp_part,n_food,axis=0),np.tile(food_part,(n_comp,1))]),
    columns=pcc_cols+pcp_cols)

#Create Output matrix of FoodMine Predictions
#Built in one step; element-wise chained assignment into a column is not reliable in pandas
RespF2=pd.DataFrame({'COMP ID':np.repeat(RespF['COMP ID'].to_numpy(),n_food)})

# Building Model

In [10]:
#HyperParameter Optimization
#Bayesian search over GradientBoostingRegressor hyperparameters on the USPX training features
X=USPX
Y=RespU['logvalue']

#Stratified folds by Chemical Classes
#The folds are stratified on Slbl['Cvariable'] rather than on the regression target, so the
#splits are computed here and passed to the optimizer as explicit (train, test) index pairs.
y=Slbl['Cvariable']
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cvf=list(skf.split(X,y)) #Materialised: a bare generator can only be consumed once

#Find optimization start time
t=time.time()

#Bayesian optimizer and parameters
#random_state is fixed on both the estimator and the search so a re-run gives the same result
opt=BayesSearchCV(
    estimator=GradientBoostingRegressor(random_state=1),
    search_spaces={'max_depth':(3,10),
    'max_leaf_nodes':(2,50),
    'learning_rate':(0.01,1.0,'log-uniform'),
    'n_estimators':(10,200)},
    n_iter=100,
    verbose=0,
    cv=cvf,
    random_state=1
)
opt.fit(X,np.ravel(Y))

#Duration of optimization and best fit hyperparameters
print('Elapsed: %s' % (time.time()-t))
print("score: %s" % opt.best_score_)
print(opt.best_estimator_)

Elapsed: 2753.7110571861267
score: 0.9059540791275971
GradientBoostingRegressor(learning_rate=0.13304011236020746, max_depth=10,
                          max_leaf_nodes=43, n_estimators=200)


In [11]:
#All Hyperparameter Optimization Results
#Tabulates opt.cv_results_ (timings, sampled parameters, per-split and mean test scores, rank)
pd.DataFrame(opt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_leaf_nodes,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.607585,0.025185,0.010774,0.000743,0.156125,5,5,21,"{'learning_rate': 0.15612482965552427, 'max_de...",0.762125,0.759175,0.765501,0.753666,0.754218,0.758937,0.004547,96
1,2.860286,0.055765,0.022133,0.005179,0.021392,3,45,123,"{'learning_rate': 0.021392358937739393, 'max_d...",0.748779,0.750794,0.753671,0.749305,0.741464,0.748802,0.004044,97
2,3.915478,0.705981,0.022538,0.002936,0.446763,9,49,85,"{'learning_rate': 0.4467627446245862, 'max_dep...",0.905445,0.897964,0.900405,0.896114,0.899320,0.899850,0.003144,33
3,3.917670,0.206597,0.026728,0.011273,0.027903,7,17,82,"{'learning_rate': 0.02790334729956252, 'max_de...",0.821395,0.815798,0.821747,0.812584,0.811870,0.816679,0.004210,91
4,4.408323,0.245922,0.020441,0.003659,0.027779,9,8,126,"{'learning_rate': 0.027779480071645376, 'max_d...",0.814767,0.810933,0.815188,0.806350,0.802790,0.810005,0.004813,93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4.644771,0.115035,0.020943,0.003021,0.555089,10,40,149,"{'learning_rate': 0.5550886737449787, 'max_dep...",0.894570,0.888849,0.901114,0.888994,0.893129,0.893331,0.004496,49
96,10.967851,0.298235,0.040298,0.003245,0.014622,10,38,200,"{'learning_rate': 0.014622162886561707, 'max_d...",0.869283,0.862356,0.869038,0.861235,0.859999,0.864382,0.003973,81
97,5.475157,0.216805,0.022731,0.002045,0.152368,10,43,127,"{'learning_rate': 0.152368380949515, 'max_dept...",0.907855,0.903000,0.907488,0.902484,0.902688,0.904703,0.002432,9
98,8.195871,0.367423,0.038100,0.015480,0.13304,10,43,200,"{'learning_rate': 0.13304011236020746, 'max_de...",0.911232,0.905046,0.907750,0.901656,0.904086,0.905954,0.003282,1


In [12]:
#Average of the mean cross-validated test score over all evaluated hyperparameter settings
np.mean(opt.cv_results_['mean_test_score'])

0.8684123553409774

In [13]:
#Show the cv_results_ row(s) whose mean test score equals the best score found by the search
d=pd.DataFrame(opt.cv_results_)
is_best=d['mean_test_score']==opt.best_score_
d.loc[is_best]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_leaf_nodes,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
98,8.195871,0.367423,0.0381,0.01548,0.13304,10,43,200,"{'learning_rate': 0.13304011236020746, 'max_de...",0.911232,0.905046,0.90775,0.901656,0.904086,0.905954,0.003282,1


In [16]:
#Create Model with Best Hyperparameters
#Copy the four tuned values from the search result into a fresh, unfitted regressor
best=opt.best_params_
tuned=('n_estimators','max_leaf_nodes','learning_rate','max_depth')
mdl=GradientBoostingRegressor(**{name:best[name] for name in tuned})
print(mdl)

GradientBoostingRegressor(learning_rate=0.13304011236020746, max_depth=10,
                          max_leaf_nodes=43, n_estimators=200)


# Predicting FoodMine Compounds

In [69]:
#Predicting concentration of compound in each food
#mdl is refit on the training part of each stratified fold. Every refit prints its score on
#the held-out part and predicts all FoodMine rows, stored in RespF2 as Pred1..Pred5.
slct=FM
X=np.array(USPX)
Y=RespU['logvalue']
y=Slbl['Cvariable'] #Stratification labels (chemical classes)

skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for r,(train_index,test_index) in enumerate(skf.split(X,y),start=1):
    Y_train,Y_test=Y[train_index],Y[test_index]

    #Rebuild the fold arrays as float64 frames with the feature names and a 0.0..n-1 float index
    feature_names=['PCC%d' % k for k in range(1,5)]+['PCP%d' % k for k in range(1,14)]
    X_train=pd.DataFrame(X[train_index].astype('float64'),columns=feature_names,
                         index=np.arange(len(train_index),dtype='float64'))
    X_test=pd.DataFrame(X[test_index].astype('float64'),columns=feature_names,
                        index=np.arange(len(test_index),dtype='float64'))

    mdl.fit(X_train,Y_train)
    print(mdl.score(X_test,Y_test))
    Pred=mdl.predict(slct)
    col='Pred' + str(r)
    RespF2[col]=Pred.tolist()

0.9115057583451239
0.9035512148853974
0.9076150906765147
0.9010657264215817
0.9045092344916111


In [70]:
#Predicting the average concentration over all foods
#Step 1: average the five fold predictions row by row (summed in the order Pred1..Pred5)
ave=np.array(RespF2['Pred1'])
for name in ['Pred2','Pred3','Pred4','Pred5']:
    ave=ave+np.array(RespF2[name])
ave=ave/5
RespF2['Ave']=ave.tolist()

#Step 2: for each compound, average 'Ave' over all of its rows (one row per food)
TF=np.array([np.mean(RespF2.loc[RespF2['COMP ID']==comp_id].Ave) for comp_id in RespF['COMP ID']])
RespF['Prediction']=TF.tolist()

In [72]:
#Write the per-row FoodMine predictions (COMP ID, Pred1..Pred5, Ave) to the working directory
RespF2.to_csv('FM_output2.csv')

In [103]:
#Load 'Book2.csv' from the working directory; no visible cell of this notebook writes that file
#NOTE(review): presumably an externally edited copy of FM_output2.csv that adds an
#'Ave_fold11' column (read by the next cell) - confirm provenance
fm=pd.read_csv('Book2.csv')

In [106]:
#For each compound, average the non-zero 'Ave_fold11' values found in the re-imported table fm
TF=[]
for comp_id in RespF['COMP ID']:
    val=fm.loc[fm['COMP ID']==comp_id,'Ave_fold11'].to_numpy()
    val=val[val!=0] #Zeros are excluded from the mean
    #An empty selection is made explicit: np.mean would only warn and return NaN for it.
    #Any other failure (e.g. a non-numeric column) now surfaces instead of becoming a prediction of 0.
    TF.append(np.mean(val) if val.size else np.nan)

TF=np.array(TF)
RespF['Prediction5']=TF.tolist()

In [107]:
#Display the per-compound prediction table
RespF

Unnamed: 0,COMP ID,Prediction,Prediction2,Prediction3,Prediction4,Prediction5
0,53,-4.205555,-4.205555,-3.955937,-4.205555,-4.205555
1,512,-8.241219,-7.205864,-6.750794,-8.241219,-8.194497
2,1303,-8.432786,-7.229034,-6.885224,-8.432786,-8.404914
3,1643,-7.018805,-6.786073,-6.340066,-7.018805,-7.018805
4,21177,-8.602602,-7.236524,-6.958231,-8.602602,-8.380869
5,22012,-6.690439,-6.55397,-5.595853,-6.690439,-6.257316
6,35630,-10.533097,,,-10.093902,-9.853809
7,35639,-7.652569,-7.08232,-6.673754,-7.652569,-7.635891
8,39592,-8.559922,-7.330311,-6.942868,-8.559922,-8.545744
9,43239,-8.918936,-7.470122,-7.2265,-8.906421,-8.875961


In [108]:
#Write the per-compound prediction table to the working directory
RespF.to_csv('FM_output.csv')

# Predicting Metabolon Compounds

In [None]:
#Refit the model on the X and Y left by the FoodMine cell, then predict every Metabolon row
Pred=mdl.fit(X,Y).predict(MBN)
#Store the predictions next to the measured peak areas (row order of MBN / flq_Mbn)
RespM['Pred']=Pred.tolist()