In [None]:
#Packages
import sys
import math
import matplotlib.pyplot as plt
import json
import csv
import numpy as np
import pandas as pd
import time
import re
import random
from tqdm import tqdm
import os
import multiprocessing as mp
import prince

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import loguniform
from skopt import BayesSearchCV
from skopt.plots import plot_objective, plot_histogram
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import ShuffleSplit
from sklearn import linear_model

# Load and Prepare Data

In [None]:
#Prepare USDA and Phenol Explorer (USPX) data
uspx_data=pd.read_csv('../data/USPX_features.csv')
#Keep only rows with a non-zero reported concentration, then renumber rows from 0
uspx_data=uspx_data.loc[uspx_data['value'].ne(0)].reset_index(drop=True)

#Split data into the column groups used downstream
#Chemical Identifiers
uspx_comp_ids=uspx_data[['row_id','comp_name','InChIKey']]
#Labels for Stratifying Folds
Slbl=uspx_data[['row_id','Cvariable','C_Label','F_Label']]
#Property Features (C_Label is kept as the compound key)
uspx_props=uspx_data[['MW','C','logP','logS','HBI','NCA','NPSA','C_Label']]
#Phylogenetic Features (F_Label is kept as the food key)
uspx_phylo=uspx_data[['class','order','family','genus','F_Label']]
#Response Values
uspx_resp=uspx_data[['row_id','logvalue']]

In [None]:
#Prepare Metabolon Data
MS_data=pd.read_csv('../data/MassSpec_features.csv')
#Remove rows reporting 0 peak area, then renumber rows from 0
MS_data=MS_data[MS_data['MS_peak']!=0].reset_index(drop=True)

#Each table is an explicit copy of its columns. MS_resp later receives the
#per-fold prediction columns, and adding columns to a plain column slice of
#MS_data raises SettingWithCopyWarning.
MS_comp_ids=MS_data[['row_id','comp_name','InChIKey']].copy() #Chemical Identifiers
MS_props=MS_data[['MW','C','logP','logS','HBI','NCA','NPSA','C_Label']].copy() #Property Features
MS_phylo=MS_data[['class','order','family','genus','F_Label']].copy() #Phylogenetic Features
MS_resp=MS_data[['row_id','MS_peak']].copy() #Response Values

In [None]:
#Prepare FoodMine Chemical Properties
FM_data=pd.read_csv('../data/FoodMine_features.csv')
#Drop zero-peak rows and the food-specific columns, so that what remains is
#one row per distinct compound, numbered from 0
FM_data=(
    FM_data.loc[FM_data['MS_peak'].ne(0)]
    .drop(columns=['row_id','food_name','phylum','class','order','family','genus','F_Label','MS_peak'])
    .drop_duplicates()
    .reset_index(drop=True)
)

#Chemical Identifiers
FM_comp_ids=FM_data[['COMP_ID','comp_name','InChIKey']]
#Property Features (C_Label is kept as the compound key)
FM_props=FM_data[['MW','C','logP','logS','HBI','NCA','NPSA','C_Label']]
#Output Matrix
FM_resp=FM_data[['COMP_ID']]

# Find Property and Phylogenetic Component Variables

In [None]:
#Pool the chemical property features of all datasets, one row per distinct compound
Prop_All=(
    pd.concat([uspx_props,MS_props,FM_props])
    .drop_duplicates()
    .reset_index(drop=True)
)
#Compound label of every pooled row; its index matches the rows fed to the PCA
Clbl=Prop_All['C_Label'].to_frame()
#The label is an identifier, not a feature, so it is removed before the PCA
Prop_All=Prop_All.drop(columns=['C_Label'])

#Pool the phylogenetic features of the datasets that carry them, one row per distinct food
Phylo_All=(
    pd.concat([uspx_phylo,MS_phylo])
    .drop_duplicates()
    .reset_index(drop=True)
)
#Food label of every pooled row; its index matches the rows fed to the MCA
Flbl=Phylo_All['F_Label'].to_frame()
#The label is an identifier, not a feature, so it is removed before the MCA
Phylo_All=Phylo_All.drop(columns=['F_Label'])

In [None]:
#Find Principal Components of property features for merged dataset
#All 7 components are computed here; the dataset-building cells only use PCC1-PCC4
pcc_names=['PCC%d' % i for i in range(1,8)]
pca=PCA(n_components=len(pcc_names))
pcc_df=pd.DataFrame(data=pca.fit_transform(Prop_All),columns=pcc_names)
print('prop explained:',pca.explained_variance_ratio_)

#Find Components of phylogenetic features for merged dataset
pcp_names=['PCP%d' % i for i in range(1,14)]
#random_state is fixed so the phylogenetic components are reproducible between runs
#NOTE(review): assumes the installed prince version computes MCA with a randomized
#SVD by default and accepts random_state - confirm against the pinned version
mca=prince.MCA(n_components=len(pcp_names),random_state=1)
mca.fit(Phylo_All)
pcp_df=mca.transform(Phylo_All) #One row of components per row of Phylo_All
pcp_df.columns=pcp_names
print('pt explained:',mca.total_inertia_)
print(sum(mca.explained_inertia_))

# Create Component Matrices for the Datasets

In [None]:
#Create a USDA Phenol Explorer Training Dataset from the PCA and MCA analysis components
#The lookup is vectorised: one label->row mapping per table instead of a full
#boolean scan and a scalar .loc write for every row. The result is a float
#frame with exactly one row per row of uspx_data, in the same order.
pcc_cols=['PCC1','PCC2','PCC3','PCC4'] #Only the first 4 property components are used
pcp_cols=['PCP1','PCP2','PCP3','PCP4','PCP5','PCP6','PCP7','PCP8','PCP9','PCP10','PCP11','PCP12','PCP13']

#Map each label to the row of its first occurrence in the merged label tables
c_first=Clbl.drop_duplicates(subset='C_Label')
f_first=Flbl.drop_duplicates(subset='F_Label')
c_row=pd.Series(c_first.index.to_numpy(),index=c_first['C_Label'].to_numpy())
f_row=pd.Series(f_first.index.to_numpy(),index=f_first['F_Label'].to_numpy())

#Component row for every USPX row (NaN where a label is unknown)
c_idx=uspx_data['C_Label'].map(c_row)
f_idx=uspx_data['F_Label'].map(f_row)
if c_idx.isna().any() or f_idx.isna().any():
    raise KeyError('USPX contains a C_Label or F_Label that is missing from Clbl/Flbl')

USPX=pd.concat(
    [pcc_df.loc[c_idx.astype('int64').to_numpy(),pcc_cols].reset_index(drop=True),
     pcp_df.loc[f_idx.astype('int64').to_numpy(),pcp_cols].reset_index(drop=True)],
    axis=1)

#Rows must stay aligned with uspx_resp/Slbl, so NaN rows are reported instead of silently dropped
if USPX.isna().any().any():
    raise ValueError('USPX component matrix contains NaN values')

In [None]:
#Create a MS dataset from the PCA and MCA analysis components
#The lookup is vectorised: one label->row mapping per table instead of a full
#boolean scan and a scalar .loc write for every row. The result is a float
#frame with exactly one row per row of MS_data, in the same order.
pcc_cols=['PCC1','PCC2','PCC3','PCC4'] #Only the first 4 property components are used
pcp_cols=['PCP1','PCP2','PCP3','PCP4','PCP5','PCP6','PCP7','PCP8','PCP9','PCP10','PCP11','PCP12','PCP13']

#Map each label to the row of its first occurrence in the merged label tables
c_first=Clbl.drop_duplicates(subset='C_Label')
f_first=Flbl.drop_duplicates(subset='F_Label')
c_row=pd.Series(c_first.index.to_numpy(),index=c_first['C_Label'].to_numpy())
f_row=pd.Series(f_first.index.to_numpy(),index=f_first['F_Label'].to_numpy())

#Component row for every MS row (NaN where a label is unknown)
c_idx=MS_data['C_Label'].map(c_row)
f_idx=MS_data['F_Label'].map(f_row)
if c_idx.isna().any() or f_idx.isna().any():
    raise KeyError('MS data contains a C_Label or F_Label that is missing from Clbl/Flbl')

MS=pd.concat(
    [pcc_df.loc[c_idx.astype('int64').to_numpy(),pcc_cols].reset_index(drop=True),
     pcp_df.loc[f_idx.astype('int64').to_numpy(),pcp_cols].reset_index(drop=True)],
    axis=1)

#Rows must stay aligned with MS_resp, so NaN rows are reported instead of silently dropped
if MS.isna().any().any():
    raise ValueError('MS component matrix contains NaN values')

In [None]:
#Create a FoodMine (FM) dataset from the PCA and MCA analysis components using USPX food list and FM property list
uspx_phylo_FM=uspx_phylo.drop_duplicates().reset_index(drop=True) #Unique foods in USPX
FM_props=FM_props.reset_index(drop=True)

pcc_cols=['PCC1','PCC2','PCC3','PCC4'] #Only the first 4 property components are used
pcp_cols=['PCP1','PCP2','PCP3','PCP4','PCP5','PCP6','PCP7','PCP8','PCP9','PCP10','PCP11','PCP12','PCP13']

#Map each label to the row of its first occurrence in the merged label tables
c_first=Clbl.drop_duplicates(subset='C_Label')
f_first=Flbl.drop_duplicates(subset='F_Label')
c_row=pd.Series(c_first.index.to_numpy(),index=c_first['C_Label'].to_numpy())
f_row=pd.Series(f_first.index.to_numpy(),index=f_first['F_Label'].to_numpy())

c_idx=FM_props['C_Label'].map(c_row)
f_idx=uspx_phylo_FM['F_Label'].map(f_row)
if c_idx.isna().any() or f_idx.isna().any():
    raise KeyError('FoodMine/USPX contains a C_Label or F_Label that is missing from Clbl/Flbl')

#All combinations of USPX foods and FM compounds, compound-major:
#every compound is repeated once per food while the food list cycles
n_foods=len(uspx_phylo_FM)
n_comps=len(FM_props)
c_rep=np.repeat(c_idx.astype('int64').to_numpy(),n_foods)
f_rep=np.tile(f_idx.astype('int64').to_numpy(),n_comps)

FM=pd.concat(
    [pcc_df.loc[c_rep,pcc_cols].reset_index(drop=True),
     pcp_df.loc[f_rep,pcp_cols].reset_index(drop=True)],
    axis=1)
if FM.isna().any().any():
    raise ValueError('FM component matrix contains NaN values')

#Create Output matrix of FoodMine Predictions: the COMP_ID of every FM row, in the same order
#Built in one step; the former chained assignment FM_resp2['COMP_ID'][r]=... is
#not guaranteed to write into the frame
FM_resp2=pd.DataFrame({'COMP_ID':np.repeat(FM_resp['COMP_ID'].to_numpy(),n_foods)})

# Building Model

In [None]:
#HyperParameter Optimization
X=USPX
Y=uspx_resp['logvalue']

#Stratified folds by Chemical Classes
#The folds are stratified on the class label y rather than on the regression
#target, so the splits are computed here and handed to the search as explicit
#(train, test) index pairs.
y=Slbl['Cvariable']
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
#skf.split() returns a one-shot generator; a list can be iterated repeatedly
cvf=list(skf.split(X,y))

#Find optimization start time
t=time.time()

#Bayesian optimizer and parameters
#Both the optimizer and the estimator are seeded so the search is reproducible
opt=BayesSearchCV(
    estimator=GradientBoostingRegressor(random_state=1),
    search_spaces={'max_depth':(3,10),
    'max_leaf_nodes':(2,50),
    'learning_rate':(0.01,1.0,'log-uniform'),
    'n_estimators':(10,200)},
    n_iter=100,
    verbose=0,
    cv=cvf,
    random_state=1
)
opt.fit(X,np.ravel(Y))

#Duration of optimization and best fit hyperparameters
print('Elapsed: %s' % (time.time()-t))
print("score: %s" % opt.best_score_)
print(opt.best_estimator_)

In [None]:
#Table with one row per hyperparameter combination tried by the search
pd.DataFrame.from_dict(opt.cv_results_)

In [None]:
#Average of the mean cross-validated test score over all tried combinations
np.asarray(opt.cv_results_['mean_test_score']).mean()

In [None]:
#Show the trial(s) whose mean cross-validated score equals the best score found
d=pd.DataFrame.from_dict(opt.cv_results_)
d[d['mean_test_score'].eq(opt.best_score_)]

In [None]:
#Create Model with Best Hyperparameters
#Only the four hyperparameters that were searched are taken from the optimizer
tuned=('n_estimators','max_leaf_nodes','learning_rate','max_depth')
mdl=GradientBoostingRegressor(**{name:opt.best_params_[name] for name in tuned})
print(mdl)

# Predicting FoodMine Compounds

In [None]:
#Predicting concentration of compound in each food for FoodMine
#mdl is refit on the training part of each of the 5 stratified folds; every
#refit scores its held-out part and predicts all FoodMine rows (one Pred<k> column per fold)
slct=FM
X=np.array(USPX)
Y=uspx_resp['logvalue']
y=Slbl['Cvariable']
feat_cols=list(USPX.columns) #Same feature names and order as the prediction matrix

skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

fold_preds={}
for r,(train_index,test_index) in enumerate(skf.split(X,y),start=1):
    #Create the training and test sets for a fold
    #The fold indices are row positions, so the response is selected with .iloc
    X_train=pd.DataFrame(X[train_index].astype('float64'),columns=feat_cols)
    X_test=pd.DataFrame(X[test_index].astype('float64'),columns=feat_cols)
    Y_train,Y_test=Y.iloc[train_index],Y.iloc[test_index]

    #Make prediction for a fold
    mdl.fit(X_train,Y_train)
    print(mdl.score(X_test,Y_test))

    #Store each fold prediction
    fold_preds['Pred'+str(r)]=mdl.predict(slct).tolist()

#Attach all fold predictions at once; re-running the cell overwrites the same columns
FM_resp2=FM_resp2.assign(**fold_preds)

In [None]:
#Write the FoodMine predictions (COMP_ID plus one column per fold) to disk
FM_resp2.to_csv(path_or_buf='FM_output.csv')

# Predicting MassSpec Compounds

In [None]:
#Predicting concentration of compound in each food for MassSpec
#mdl is refit on the training part of each of the 5 stratified folds; every
#refit scores its held-out part and predicts all MassSpec rows (one Pred<k> column per fold)
slct=MS
X=np.array(USPX)
Y=uspx_resp['logvalue']
y=Slbl['Cvariable']
feat_cols=list(USPX.columns) #Same feature names and order as the prediction matrix

skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

fold_preds={}
for r,(train_index,test_index) in enumerate(skf.split(X,y),start=1):
    #Create the training and test sets for a fold
    #The fold indices are row positions, so the response is selected with .iloc
    X_train=pd.DataFrame(X[train_index].astype('float64'),columns=feat_cols)
    X_test=pd.DataFrame(X[test_index].astype('float64'),columns=feat_cols)
    Y_train,Y_test=Y.iloc[train_index],Y.iloc[test_index]

    #Make prediction for a fold
    mdl.fit(X_train,Y_train)
    print(mdl.score(X_test,Y_test))

    #Store each fold prediction
    fold_preds['Pred'+str(r)]=mdl.predict(slct).tolist()

#assign() returns a new frame, so no columns are written into a slice of MS_data
#(which raises SettingWithCopyWarning); re-running the cell overwrites the same columns
MS_resp=MS_resp.assign(**fold_preds)

In [None]:
#Write the MassSpec predictions (row_id, MS_peak plus one column per fold) to disk
MS_resp.to_csv(path_or_buf='MS_output.csv')