In [1]:
import pandas as pd
import numpy as np
import importlib
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score,average_precision_score,f1_score
from xgboost import XGBClassifier
import shap
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from sklearn.metrics import plot_roc_curve
from statistics import mean
from sklearn.ensemble import RandomForestClassifier
import re
import pickle
from matplotlib import pyplot
import time
import scispacy
import spacy
import nltk
from tqdm import tqdm

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# Functions to generate features

In [2]:
from nltk.corpus import stopwords
import string
#nltk.download('punkt')
#nltk.download('stopwords')

def feature_gen(dataframe,include_bigrams='FALSE'):
    
    meas_method_column_vals = []
    entity_nlp = spacy.load('en_core_sci_sm')
    chemical_disease_nlp = spacy.load('en_ner_bc5cdr_md')
    chem_ent_ratios, seen_chem, chem_count = [], [], []
    human_bigram = []    
    for index, row in dataframe.iterrows():
        abstract = row['abstract']
    #   ------------ Chemical Names ----------------
        entity_doc = entity_nlp(abstract)
        chemical_disease_doc = chemical_disease_nlp(abstract)
        chemical_ents = [ent.text for ent in chemical_disease_doc.ents if ent.label_ == 'CHEMICAL']
        if len(entity_doc.ents) == 0:
            chem_ent_ratios.append(0)
        else:
            chem_ent_ratios.append(len(chemical_ents) / len(entity_doc.ents))
        seen_chem.append(list(set(chemical_ents)))
        chem_count.append(len(chemical_ents))
    #   ------------ Bigram Score --------------
        if include_bigrams == 'TRUE':
            tokens = nltk.word_tokenize(abstract)
            bigrams = nltk.bigrams(tokens)
            stopset = set(stopwords.words('english') + list(string.punctuation))
            milk_bigrams = [(w1, w2) for w1, w2 in bigrams if 
                            (w1.lower() == 'milk' or w2.lower() == 'milk')
                             and (w1.lower() not in stopset and w2.lower() not in stopset)]
            human_bigrams = [(w1, w2) for w1, w2 in milk_bigrams if 
                             (w1.lower() == 'human' or w2.lower() == 'human')
                             and (w1.lower() not in stopset and w2.lower() not in stopset)]
            human_bigram.append(len(human_bigrams) / len(milk_bigrams) if len(milk_bigrams) != 0 else 0)
        else:
            human_bigram.append(0.0)

    dataframe['chem_ent_ratio'] = chem_ent_ratios
    dataframe['chemicals'] = seen_chem
    dataframe['bigram_score'] = human_bigram
    dataframe['chem_term_count'] = chem_count
    
    return dataframe

In [3]:
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 150

def clean_plot(leg=True, grid=None, font=None):
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    
    axis_color = 'lightgrey'
    ax.spines['bottom'].set_color(axis_color)
    ax.spines['left'].set_color(axis_color)
    ax.tick_params(axis='both', color=axis_color)
    
    if leg:
        ax.legend(frameon = False, loc='upper left', bbox_to_anchor=(1, 1))
        
    if grid is not None:
        plt.grid(color='lightgrey', axis = grid, linestyle='-', linewidth=.5)
        
    if font is not None:
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
            ax.get_xticklabels() + ax.get_yticklabels()):
            
            item.set_fontfamily(font['family'])
            item.set_color(font['color'])
            

In [4]:
from src.filter import Filter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading datasets and generating features

In [5]:
## Garlic and Cocoa

gtrain = pd.read_csv("data/garlic_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
gtrain['food'] = 'garlic'
ctrain = pd.read_csv("data/cocoa_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
ctrain['food'] = 'cocoa'
gtrain['is_useful'] = gtrain['is_useful'].replace(2, 1, regex=True)
ctrain['is_useful'] = ctrain['is_useful'].replace(2, 1, regex=True)
gtrain = gtrain[gtrain['is_useful'].notnull()]
ctrain = ctrain[ctrain['is_useful'].notnull()]

## Basil

btrain = pd.read_excel("data/basil_scoring.xls")[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
btrain['food'] = 'apple'

for i in range(len(btrain)):
    if btrain['is_useful'].loc[i] == 'x':
        btrain['is_useful'].loc[i] = 0
        
btrain['is_useful'] = btrain['is_useful'].replace(2, 1, regex=True)
btrain = btrain[btrain['is_useful'].notnull()]

## Apple

atrain = pd.read_excel("data/apple_scoring.xls")[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
atrain['food'] = 'apple'

for i in range(len(atrain)):
    if atrain['is_useful'].loc[i] == 'x':
        atrain['is_useful'].loc[i] = 0

atrain['is_useful'] = atrain['is_useful'].replace(2, 1, regex=True)
atrain = atrain[atrain['is_useful'].notnull()]
atrain = atrain[atrain['abstract'].notnull()]
atrain = atrain[atrain['PMID'].notnull()]

## Human Milk database

mtrain_new = pd.read_csv("mBase_15Aug_abstract[chemical_gen].csv")
mtrain_new['food'] = 'milk'
mtrain_new = mtrain_new[mtrain_new['abstract'].notnull()]
mtrain_new = mtrain_new[mtrain_new['PMID'].notnull()]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [6]:
print('Length of the databases for garlic, cocoa, basil, apple, milk: ', len(gtrain), len(ctrain), len(btrain), len(atrain), len(mtrain_new))

Length of the databases for garlic, cocoa, basil, apple, milk:  299 324 93 1653 229


In [7]:
print('Useful and non-useful: ',len(gtrain[gtrain['is_useful'] == 1.0]),len(gtrain[gtrain['is_useful'] == 0.0]))

Useful and non-useful:  77 222


In [8]:
print('Useful and non-useful: ',len(ctrain[ctrain['is_useful'] == 1.0]),len(ctrain[ctrain['is_useful'] == 0.0]))

Useful and non-useful:  93 231


In [9]:
print('Useful and non-useful: ',len(atrain[atrain['is_useful'] == 1.0]),len(atrain[atrain['is_useful'] == 0.0]))

Useful and non-useful:  462 1191


In [10]:
print('Useful and non-useful: ',len(btrain[btrain['is_useful'] == 1.0]),len(btrain[btrain['is_useful'] == 0.0]))

Useful and non-useful:  57 36


In [11]:
# def build_all_features(ftrain,include_bigrams='FALSE'):
#     fmodel_data = Filter()

#     fmodel_data.build_features(input_data = ftrain,is_traindata = True)

#     ftrain = feature_gen(ftrain,include_bigrams)

#     fmodel_data.data['chem_ent_ratio'] = ftrain['chem_ent_ratio'].values
#     fmodel_data.data['chem_term_count'] = ftrain['chem_term_count'].values
#     fmodel_data.data['bigram_score'] = ftrain['bigram_score'].values
    
#     return fmodel_data

In [12]:
# print('----Starting feature generation----')
# gdata = build_all_features(gtrain,include_bigrams='FALSE')
# print('----DONE----')
# cdata = build_all_features(ctrain,include_bigrams='FALSE')
# print('----DONE----')
# bdata = build_all_features(btrain,include_bigrams='FALSE')
# print('----DONE----')

In [13]:
# print('----Starting feature generation----')
# adata = build_all_features(atrain,include_bigrams='FALSE')
# print('----DONE----')
# mdata = build_all_features(mtrain_new,include_bigrams='TRUE')
# print('----DONE----')

In [14]:
# gdata_features_class = gdata.data.copy()
# cdata_features_class = cdata.data.copy()
# bdata_features_class = bdata.data.copy()
# adata_features_class = adata.data.copy()
# mdata_features_class = mdata.data.copy()

In [15]:
# gdata_features_class.to_csv('data_with_feature/gdata_features_class.csv')
# cdata_features_class.to_csv('data_with_feature/cdata_features_class.csv')
# bdata_features_class.to_csv('data_with_feature/bdata_features_class.csv')
# adata_features_class.to_csv('data_with_feature/adata_features_class.csv')
# mdata_features_class.to_csv('data_with_feature/mdata_features_class.csv')

In [16]:
gdata_features_class = pd.read_csv('data_with_feature/gdata_features_class.csv')
cdata_features_class = pd.read_csv('data_with_feature/cdata_features_class.csv')
bdata_features_class = pd.read_csv('data_with_feature/bdata_features_class.csv')
adata_features_class = pd.read_csv('data_with_feature/adata_features_class.csv')
mdata_features_class = pd.read_csv('data_with_feature/mdata_features_class.csv')

# Doc2vec trained on FoodBase

In [17]:
fmine = pd.read_csv('FoodBase_Abstracts_embeddings.csv')

In [18]:
embedding_list = []

for index, row in fmine.iterrows():
    local_list = []
    for x in row['vectors'].replace('[','').replace(']','').replace('\n',' ').replace('  ',' ').replace('   ',' ').split(' '):
        if x != '':
            local_list.append(float(x))
    embedding_list.append(local_list)
    
fmine['embeddings'] = embedding_list

In [19]:
abstract_embedding_dict = dict()

for index, row in fmine.iterrows():
    abstract_embedding_dict[row['abstract']] = row['embeddings']

In [20]:
print('Dimension of Doc2Vec: ', len(embedding_list[0]))

Dimension of Doc2Vec:  64


In [21]:
fmine_expanded = pd.concat([fmine, fmine['embeddings'].apply(pd.Series)], axis = 1)

In [22]:
gdata_train_doc2vec = pd.merge(gtrain, fmine_expanded[fmine_expanded['food'] == 'garlic'], on=['abstract'])
gdata_features_class_doc2vec = pd.merge(gdata_train_doc2vec, gdata_features_class, on=['PMID'])
gdata_features_class_doc2vec = gdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [23]:
cdata_train_doc2vec = pd.merge(ctrain, fmine_expanded[fmine_expanded['food'] == 'cocoa'], on=['abstract'])
cdata_features_class_doc2vec = pd.merge(cdata_train_doc2vec, cdata_features_class, on=['PMID'])
cdata_features_class_doc2vec = cdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [24]:
bdata_train_doc2vec = pd.merge(btrain, fmine_expanded[fmine_expanded['food'] == 'basil'], on=['abstract'])
bdata_features_class_doc2vec = pd.merge(bdata_train_doc2vec, bdata_features_class, on=['PMID'])
bdata_features_class_doc2vec = bdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [25]:
adata_train_doc2vec = pd.merge(atrain, fmine_expanded[fmine_expanded['food'] == 'apple'], on=['abstract'])
adata_features_class_doc2vec = pd.merge(adata_train_doc2vec, adata_features_class, on=['PMID'])
adata_features_class_doc2vec = adata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [26]:
mdata_train_doc2vec = pd.merge(mtrain_new, fmine_expanded[fmine_expanded['food'] == 'human milk'], on=['abstract'])
mdata_features_class_doc2vec = pd.merge(mdata_train_doc2vec, mdata_features_class, on=['PMID'])
mdata_features_class_doc2vec = mdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [27]:
mdata_features_class_doc2vec = mdata_features_class_doc2vec.drop(columns=['journal','mesh_UIds','qual_UIds','webpage','year','source','measmethod','chem_ent_ratio_x','chemicals','bigram_score_x','chem_term_count_x'])

In [28]:
mdata_features_class_doc2vec = mdata_features_class_doc2vec.rename(columns={'chem_ent_ratio_y':'chem_ent_ratio','chem_term_count_y':'chem_term_count','bigram_score_y':'bigram_score'})

# Normalize the features

In [29]:
def normalize_features(total_data):
    total_data_features = total_data.drop(columns=['class'])
    for col_feature in list(total_data_features.columns): 
        col_list = total_data_features[col_feature].tolist()
        if col_list != [0.0] * len(col_list):
            col_list_normalized = [(x-np.mean(col_list))/np.std(col_list) for x in col_list]
        else:
            col_list_normalized = col_list
        total_data_features[col_feature] = col_list_normalized
    # total_data_features_normalized = (total_data_features-total_data_features.mean())/total_data_features.std()
    # total_data_features_normalized = (total_data_features-total_data_features.min())/(total_data_features.max()-total_data_features.min())
    total_data_features['class'] = total_data['class'].tolist()
    
    return total_data_features

In [30]:
gdata_features_class_normalized = normalize_features(gdata_features_class)
cdata_features_class_normalized = normalize_features(cdata_features_class)
bdata_features_class_normalized = normalize_features(bdata_features_class)
adata_features_class_normalized = normalize_features(adata_features_class)
mdata_features_class_normalized = normalize_features(mdata_features_class)

In [31]:
fm1p0_columns = ['chromatography', 'food_term_count', 'gen_term_count', 'sci_term_count', 'spectrometry', 'spectrophotometry']

# Performances on seen food 

In [32]:
def xgboost_model(x,y,kfold,n_splits=10,fm1p0_columns=fm1p0_columns):

    cross_val_model_fm1 = RandomForestClassifier(max_depth=80,random_state=0)
    cross_val_model_fm2 = RandomForestClassifier(max_depth=80,random_state=0)
    
    kfold = StratifiedKFold(n_splits)
    
    auc_fm1 = []
    aup_fm1 = []
    f1_score_list_fm1 = []
    
    auc_fm2 = []
    aup_fm2 = []
    f1_score_list_fm2 = []
    
    for i,(train,test) in enumerate(kfold.split(x,y)):
        
        x_old = x[fm1p0_columns]
        
        cross_val_model_fm2.fit(x.loc[train],y.loc[train])
        cross_val_model_fm1.fit(x_old.loc[train],y.loc[train])
        
        #viz_fm1 = plot_roc_curve(cross_val_model_fm1, x_old.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        #viz_fm2 = plot_roc_curve(cross_val_model_fm2, x.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        
        y_predicted_fm1 = cross_val_model_fm1.predict(x_old.loc[test])
        y_predicted_fm2 = cross_val_model_fm2.predict(x.loc[test])
        
        auc_fm1.append(roc_auc_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        aup_fm1.append(average_precision_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        f1_score_list_fm1.append(f1_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        
        auc_fm2.append(roc_auc_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        aup_fm2.append(average_precision_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        f1_score_list_fm2.append(f1_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        # plot_confusion_matrix(cross_val_model,x.loc[test],y.loc[test])
        
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine1.0')
    print('Average and SD of AUROC: ', mean(auc_fm1), np.std(auc_fm1))
    print('Average and SD of AUPRC:', mean(aup_fm1), np.std(aup_fm1))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm1), np.std(f1_score_list_fm1))
    print('All AUROC values: ', auc_fm1)
    print('All AUPRC values: ', aup_fm1)
    print('All f1-Score values: ', f1_score_list_fm1)
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine2.0')
    print('Average and SD of AUROC: ', mean(auc_fm2), np.std(auc_fm2))
    print('Average and SD of AUPRC:', mean(aup_fm2), np.std(aup_fm2))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm2), np.std(f1_score_list_fm2))
    print('All AUROC values: ', auc_fm2)
    print('All AUPRC values: ', aup_fm2)
    print('All f1-Score values: ', f1_score_list_fm2)
    print('------------------------------------------------------------------------------------------------------')

    return mean(auc_fm2), mean(aup_fm2), mean(f1_score_list_fm2)

In [33]:
def get_cross_validation_seen_food(fdata_features_class_normalized):

    repeat = 1

    for _ in tqdm(range(repeat)):
        oversample = SMOTE()
        y = fdata_features_class_normalized.copy()['class']
        X = fdata_features_class_normalized.copy().drop('class', axis = 1)
        X.columns = X.columns.astype(str)
        X_smote, y_smote = oversample.fit_resample(X, y)
        kfold = StratifiedKFold(n_splits=10)
        auc_fm2, aup_fm2, f1_fm2 = xgboost_model(X_smote, y_smote,kfold,n_splits=10)
        
    return 

In [34]:
gdata_features_class_doc2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,gen_term_count,food_term_count,sci_term_count,class,spectrometry,chromatography,spectrophotometry,chem_ent_ratio,chem_term_count,bigram_score
0,2.318923,1.615156,0.801215,-0.258407,3.384802,-3.805800,1.640497,-2.008467,4.323183,1.473609,...,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,0,0.0
1,0.466219,1.549901,0.386801,0.742964,-0.081566,-1.721926,-0.858324,-3.293416,-0.251621,-0.822619,...,2.0,3.0,3.0,0.0,0.0,1.0,0.0,0.0,0,0.0
2,-0.077940,-0.733739,-0.106453,-3.235562,-0.142488,-0.406461,0.403134,-2.334560,0.607535,1.598378,...,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0
3,2.014768,0.478264,-0.868394,-0.492501,1.277909,-0.316566,0.148894,-2.738673,0.289150,0.453324,...,0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0.0
4,-1.138469,1.089024,-0.205716,-1.830036,-0.178908,-0.019330,0.420647,-3.019905,0.205741,1.183853,...,0.0,7.0,0.0,1.0,1.0,1.0,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1.536668,2.807355,0.013995,-2.084901,2.366916,-3.757211,0.391783,-2.721904,3.457376,5.559508,...,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0,0.0
295,1.476389,-2.733936,-0.561345,-2.081398,-0.446338,-2.195139,0.051300,-1.428699,-1.087287,1.599730,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0
296,0.194007,0.971212,-1.602840,-1.000298,2.354669,-1.833926,-0.000545,-0.132896,1.422767,1.188912,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
297,0.324103,1.300528,-1.449213,-2.510239,1.168539,-3.529079,-1.929223,0.195159,-0.548822,4.082344,...,0.0,4.0,1.0,1.0,0.0,1.0,0.0,0.0,0,0.0


In [35]:
## Garlic
get_cross_validation_seen_food(gdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.49s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7761857707509882 0.12932230851301624
Average and SD of AUPRC: 0.7372888826149695 0.11538431913800254
Average and SD of f1-Score: 0.7258965694877088 0.19552879730470754
All AUROC values:  [0.6739130434782609, 0.5375494071146245, 0.6146245059288538, 0.6857707509881422, 0.8181818181818181, 0.8636363636363636, 0.8863636363636365, 0.9318181818181819, 0.8863636363636365, 0.8636363636363635]
All AUPRC values:  [0.681159420289855, 0.5320910973084886, 0.5893217893217892, 0.6313131313131313, 0.7704545454545454, 0.7857142857142857, 0.8359683794466404, 0.8942687747035573, 0.8495670995670996, 0.803030303030303]
All f1-Score values:  [0.5161290322580645, 0.4324324324324324, 0.41379310344827586, 0.631578947368421, 0.8095238095238095, 0.88, 0.888888888888889, 0.9333333333333332, 0.8837209302325582, 0.8695652173913043]
-------------------------------------------




In [36]:
## Cocoa
get_cross_validation_seen_food(cdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.06s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7083333333333334 0.18767976050063392
Average and SD of AUPRC: 0.6816481419529523 0.15307435495359886
Average and SD of f1-Score: 0.6222605158075792 0.29667529598067244
All AUROC values:  [0.5235507246376812, 0.6032608695652174, 0.4565217391304347, 0.3913043478260869, 0.8913043478260869, 0.782608695652174, 0.9347826086956522, 0.891304347826087, 0.7826086956521738, 0.8260869565217391]
All AUPRC values:  [0.5037927844588344, 0.5972644376899696, 0.4826086956521739, 0.4665551839464883, 0.8423913043478262, 0.7147826086956521, 0.9150197628458498, 0.8311036789297659, 0.7093397745571659, 0.7536231884057971]
All f1-Score values:  [0.21428571428571427, 0.3870967741935483, 0.24242424242424243, 0.2222222222222222, 0.8936170212765957, 0.7916666666666667, 0.9333333333333332, 0.8979591836734695, 0.7999999999999999, 0.84]
----------------------------------------




In [37]:
## Basil
get_cross_validation_seen_food(bdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.49s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.6 0.16799470891138873
Average and SD of AUPRC: 0.6002561327561328 0.1537092327437355
Average and SD of f1-Score: 0.5652225552225552 0.1814353492703115
All AUROC values:  [0.41666666666666663, 0.8333333333333334, 0.41666666666666663, 0.41666666666666663, 0.5499999999999999, 0.6500000000000001, 0.75, 0.9, 0.5166666666666667, 0.55]
All AUPRC values:  [0.4666666666666667, 0.7777777777777779, 0.4642857142857143, 0.4642857142857143, 0.5727272727272728, 0.6477272727272727, 0.7727272727272727, 0.8909090909090909, 0.4636363636363636, 0.4818181818181818]
All f1-Score values:  [0.3636363636363636, 0.8333333333333334, 0.4615384615384615, 0.4615384615384615, 0.5454545454545454, 0.6, 0.6666666666666666, 0.888888888888889, 0.28571428571428575, 0.5454545454545454]
--------------------------------------------------------------------------------------------------




In [38]:
## Apple
get_cross_validation_seen_food(adata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:58<00:00, 58.83s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.5882142857142857 0.0657991931814764
Average and SD of AUPRC: 0.5536636730710411 0.043121495309338946
Average and SD of f1-Score: 0.607794907892713 0.0918336203025077
All AUROC values:  [0.5438375350140056, 0.5273809523809524, 0.49579831932773116, 0.5252100840336135, 0.5966386554621849, 0.6386554621848739, 0.7016806722689075, 0.6806722689075629, 0.6092436974789917, 0.5630252100840336]
All AUPRC values:  [0.5218491712915071, 0.5166007702839301, 0.4979187936856986, 0.5132803121248499, 0.5563146121758056, 0.5851057664445088, 0.632684652808492, 0.6150778782850719, 0.5632814101250256, 0.5345233634855215]
All f1-Score values:  [0.5321888412017167, 0.5065502183406114, 0.4690265486725664, 0.5108225108225107, 0.6279069767441859, 0.6742424242424242, 0.7380073800738007, 0.7246376811594202, 0.6713780918727914, 0.6231884057971014]
----------------------------




In [39]:
## Milk
get_cross_validation_seen_food(mdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.89s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.5698717948717948 0.05589802399126064
Average and SD of AUPRC: 0.540088597308277 0.032711519186533214
Average and SD of f1-Score: 0.6732326415414651 0.03469393091285233
All AUROC values:  [0.4615384615384615, 0.6153846153846154, 0.6538461538461539, 0.5384615384615384, 0.6153846153846154, 0.5, 0.5448717948717948, 0.5865384615384616, 0.608974358974359, 0.5737179487179487]
All AUPRC values:  [0.4816053511705686, 0.5668016194331984, 0.5923076923076923, 0.5201465201465202, 0.5659340659340659, 0.5, 0.5434965034965034, 0.5674725274725274, 0.542962962962963, 0.5201587301587302]
All f1-Score values:  [0.6111111111111112, 0.6875, 0.7272727272727274, 0.6470588235294118, 0.7058823529411765, 0.6285714285714286, 0.6857142857142856, 0.7058823529411765, 0.6666666666666667, 0.6666666666666667]
----------------------------------------------------------------------




# Performance on unseen food

In [40]:
def xgboost_model_unseen_food(x_other,y_other,x_food,y_food,kfold,n_splits=10,fm1p0_columns=fm1p0_columns):

    cross_val_model_fm1 = RandomForestClassifier(max_depth=80,random_state=0)
    cross_val_model_fm2 = RandomForestClassifier(max_depth=80,random_state=0)
    
    kfold = StratifiedKFold(n_splits)
    
    auc_fm1 = []
    aup_fm1 = []
    f1_score_list_fm1 = []
    
    auc_fm2 = []
    aup_fm2 = []
    f1_score_list_fm2 = []
    
    for i,(test_food,train_food) in enumerate(kfold.split(x_food,y_food)): # test set is kept larger here 
        
        x_other_old = x_other[fm1p0_columns]
        x_food_old = x_food[fm1p0_columns]
        
        cross_val_model_fm2.fit(pd.concat([x_other,x_food.loc[test_food]]),pd.concat([y_other,y_food.loc[test_food]]))
        cross_val_model_fm1.fit(pd.concat([x_other_old,x_food_old.loc[test_food]]),pd.concat([y_other,y_food.loc[test_food]]))
        
        #viz_fm1 = plot_roc_curve(cross_val_model_fm1, x_old.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        #viz_fm2 = plot_roc_curve(cross_val_model_fm2, x.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        
        y_predicted_fm1 = cross_val_model_fm1.predict(x_food_old.loc[train_food])
        y_predicted_fm2 = cross_val_model_fm2.predict(x_food.loc[train_food])
        
        auc_fm1.append(roc_auc_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        aup_fm1.append(average_precision_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        f1_score_list_fm1.append(f1_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        
        auc_fm2.append(roc_auc_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        aup_fm2.append(average_precision_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        f1_score_list_fm2.append(f1_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        # plot_confusion_matrix(cross_val_model,x.loc[test],y.loc[test])
        
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine1.0')
    print('Average and SD of AUROC: ', mean(auc_fm1), np.std(auc_fm1))
    print('Average and SD of AUPRC:', mean(aup_fm1), np.std(aup_fm1))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm1), np.std(f1_score_list_fm1))
    print('All AUROC values: ', auc_fm1)
    print('All AUPRC values: ', aup_fm1)
    print('All f1-Score values: ', f1_score_list_fm1)
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine2.0')
    print('Average and SD of AUROC: ', mean(auc_fm2), np.std(auc_fm2))
    print('Average and SD of AUPRC:', mean(aup_fm2), np.std(aup_fm2))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm2), np.std(f1_score_list_fm2))
    print('All AUROC values: ', auc_fm2)
    print('All AUPRC values: ', aup_fm2)
    print('All f1-Score values: ', f1_score_list_fm2)
    print('------------------------------------------------------------------------------------------------------')

    return mean(auc_fm2), mean(aup_fm2), mean(f1_score_list_fm2)

In [41]:
def get_cross_validation_unseen_food(other_data_features_class,fdata_features_class):

    repeat = 1

    for _ in tqdm(range(repeat)):
        oversample = SMOTE()
        y_other = other_data_features_class.copy()['class']
        X_other = other_data_features_class.copy().drop('class', axis = 1)
        X_other.columns = X_other.columns.astype(str)
        X_other_smote, y_other_smote = oversample.fit_resample(X_other, y_other)
        y_food = fdata_features_class.copy()['class']
        X_food = fdata_features_class.copy().drop('class', axis = 1)
        X_food.columns = X_food.columns.astype(str)
        X_food_smote, y_food_smote = oversample.fit_resample(X_food, y_food)
        kfold = StratifiedKFold(n_splits=10)
        auc_fm2, aup_fm2, f1_fm2 = xgboost_model_unseen_food(X_other_smote, y_other_smote,X_food_smote, y_food_smote,kfold,n_splits=10)
        
    return 

In [42]:
## Garlic
other_foods = pd.concat([cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,gdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:35<00:00, 95.87s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7518774703557313 0.16207307938147736
Average and SD of AUPRC: 0.7137391618043792 0.1366497860151581
Average and SD of f1-Score: 0.6768486906027685 0.2789390481771794
All AUROC values:  [0.540513833992095, 0.540513833992095, 0.458498023715415, 0.7747035573122529, 0.8863636363636364, 0.7954545454545455, 0.8863636363636365, 0.8636363636363636, 0.8863636363636365, 0.8863636363636365]
All AUPRC values:  [0.5358695652173913, 0.5358695652173913, 0.4747474747474747, 0.7345959595959596, 0.8245454545454546, 0.7245454545454546, 0.8495670995670996, 0.7857142857142857, 0.8359683794466404, 0.8359683794466404]
All f1-Score values:  [0.3225806451612903, 0.3225806451612903, 0.14285714285714288, 0.7368421052631579, 0.8936170212765958, 0.8085106382978724, 0.8837209302325582, 0.88, 0.888888888888889, 0.888888888888889]
----------------------------------------------




In [43]:
## Cocoa
other_foods = pd.concat([gdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,cdata_features_class_doc2vec)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [01:42<00:00, 102.94s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7390398550724638 0.20570109725520008
Average and SD of AUPRC: 0.7133440569263881 0.17814557620895305
Average and SD of f1-Score: 0.6168039458009063 0.36519191887102104
All AUROC values:  [0.5226449275362319, 0.4547101449275362, 0.4782608695652174, 0.5, 0.9130434782608696, 0.8913043478260869, 0.9347826086956522, 0.891304347826087, 0.8695652173913044, 0.9347826086956522]
All AUPRC values:  [0.5047795251310515, 0.49586288416075647, 0.4927536231884058, 0.5, 0.8518518518518519, 0.8423913043478262, 0.8985507246376812, 0.8311036789297659, 0.8011272141706924, 0.9150197628458498]
All f1-Score values:  [0.15384615384615383, 0.13333333333333333, 0.07692307692307691, 0.3428571428571428, 0.92, 0.8936170212765957, 0.9361702127659574, 0.8979591836734695, 0.8800000000000001, 0.9333333333333332]
-------------------------------------------------------------------




In [44]:
## Basil
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,bdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:11<00:00, 71.82s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.465 0.10366881026722659
Average and SD of AUPRC: 0.5146363636363637 0.048086759032455975
Average and SD of f1-Score: 0.23325396825396827 0.16915543547419964
All AUROC values:  [0.5833333333333334, 0.5833333333333334, 0.5, 0.5833333333333334, 0.38333333333333336, 0.4, 0.4, 0.44999999999999996, 0.5166666666666667, 0.25]
All AUPRC values:  [0.5833333333333334, 0.5555555555555556, 0.5, 0.5555555555555556, 0.51010101010101, 0.5454545454545454, 0.5454545454545454, 0.43272727272727274, 0.4636363636363636, 0.45454545454545453]
All f1-Score values:  [0.2857142857142857, 0.4444444444444444, 0.25, 0.4444444444444444, 0.2222222222222222, 0.0, 0.0, 0.4000000000000001, 0.28571428571428575, 0.0]
------------------------------------------------------------------------------------------------------
FoodMine2.0
Average and SD of AUROC:  0.5183333333333333 0.12898




In [45]:
## Apple
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,adata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:44<00:00, 44.68s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.6007843137254902 0.06143304062099646
Average and SD of AUPRC: 0.5640218358010893 0.041478061481557485
Average and SD of f1-Score: 0.577256202334624 0.09944874448391819
All AUROC values:  [0.5475140056022408, 0.5443627450980392, 0.5210084033613445, 0.5042016806722689, 0.6638655462184875, 0.6428571428571429, 0.6848739495798319, 0.6260504201680672, 0.634453781512605, 0.6386554621848739]
All AUPRC values:  [0.5248516536814085, 0.5267382368054025, 0.511024211664864, 0.5021281239768635, 0.6063249727371864, 0.5935064935064935, 0.6257749001239841, 0.5773491214667685, 0.5854579119783506, 0.587062732069572]
All f1-Score values:  [0.47058823529411764, 0.49302325581395345, 0.48181818181818187, 0.3979591836734694, 0.6799999999999999, 0.62882096069869, 0.6887966804979253, 0.6454183266932272, 0.6329113924050632, 0.653225806451613]
-----------------------------




In [46]:
## Milk
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,mdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:48<00:00, 48.49s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.48878205128205127 0.07430438424977137
Average and SD of AUPRC: 0.5162673992673993 0.031023000480114728
Average and SD of f1-Score: 0.09528822055137846 0.0633249372331306
All AUROC values:  [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0.5, 0.5, 0.5, 0.5384615384615384, 0.3846153846153846, 0.31089743589743585]
All AUPRC values:  [0.5384615384615385, 0.5384615384615385, 0.5384615384615385, 0.5384615384615385, 0.5, 0.5, 0.52, 0.556923076923077, 0.48, 0.45190476190476186]
All f1-Score values:  [0.14285714285714288, 0.14285714285714288, 0.14285714285714288, 0.14285714285714288, 0.0, 0.13333333333333336, 0.0, 0.14285714285714288, 0.0, 0.10526315789473685]
------------------------------------------------------------------------------------------------------
FoodMine2.0
Average and SD of AUROC:  0.6608974358974359 0.07


