In [1]:
import pandas as pd
import numpy as np
import importlib
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score,average_precision_score,f1_score,ConfusionMatrixDisplay
from xgboost import XGBClassifier
import shap
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import RocCurveDisplay
from statistics import mean
from sklearn.ensemble import RandomForestClassifier
import re
import pickle
from matplotlib import pyplot
import time
import scispacy
import spacy
import nltk
from tqdm import tqdm

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# Functions to generate features

In [2]:
from nltk.corpus import stopwords
import string
#nltk.download('punkt')
#nltk.download('stopwords')

def feature_gen(dataframe,include_bigrams='FALSE'):
    
    meas_method_column_vals = []
    entity_nlp = spacy.load('en_core_web_sm')
    chemical_disease_nlp = spacy.load('en_core_web_md')
    chem_ent_ratios, seen_chem, chem_count = [], [], []
    human_bigram = []    
    for index, row in dataframe.iterrows():
        abstract = row['abstract']
    #   ------------ Chemical Names ----------------
        entity_doc = entity_nlp(abstract)
        chemical_disease_doc = chemical_disease_nlp(abstract)
        chemical_ents = [ent.text for ent in chemical_disease_doc.ents if ent.label_ == 'CHEMICAL']
        if len(entity_doc.ents) == 0:
            chem_ent_ratios.append(0)
        else:
            chem_ent_ratios.append(len(chemical_ents) / len(entity_doc.ents))
        seen_chem.append(list(set(chemical_ents)))
        chem_count.append(len(chemical_ents))
    #   ------------ Bigram Score --------------
        if include_bigrams == 'TRUE':
            tokens = nltk.word_tokenize(abstract)
            bigrams = nltk.bigrams(tokens)
            stopset = set(stopwords.words('english') + list(string.punctuation))
            milk_bigrams = [(w1, w2) for w1, w2 in bigrams if 
                            (w1.lower() == 'milk' or w2.lower() == 'milk')
                             and (w1.lower() not in stopset and w2.lower() not in stopset)]
            human_bigrams = [(w1, w2) for w1, w2 in milk_bigrams if 
                             (w1.lower() == 'human' or w2.lower() == 'human')
                             and (w1.lower() not in stopset and w2.lower() not in stopset)]
            human_bigram.append(len(human_bigrams) / len(milk_bigrams) if len(milk_bigrams) != 0 else 0)
        else:
            human_bigram.append(0.0)

    dataframe['chem_ent_ratio'] = chem_ent_ratios
    dataframe['chemicals'] = seen_chem
    dataframe['bigram_score'] = human_bigram
    dataframe['chem_term_count'] = chem_count
    
    return dataframe

In [3]:
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 150

def clean_plot(leg=True, grid=None, font=None):
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    
    axis_color = 'lightgrey'
    ax.spines['bottom'].set_color(axis_color)
    ax.spines['left'].set_color(axis_color)
    ax.tick_params(axis='both', color=axis_color)
    
    if leg:
        ax.legend(frameon = False, loc='upper left', bbox_to_anchor=(1, 1))
        
    if grid is not None:
        plt.grid(color='lightgrey', axis = grid, linestyle='-', linewidth=.5)
        
    if font is not None:
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
            ax.get_xticklabels() + ax.get_yticklabels()):
            
            item.set_fontfamily(font['family'])
            item.set_color(font['color'])
            

In [4]:
from src.filter import Filter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading datasets and generating features

In [5]:
## Garlic and Cocoa

gtrain = pd.read_csv("data/garlic_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
gtrain['food'] = 'garlic'
ctrain = pd.read_csv("data/cocoa_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
ctrain['food'] = 'cocoa'
gtrain['is_useful'] = gtrain['is_useful'].replace(2, 1, regex=True)
ctrain['is_useful'] = ctrain['is_useful'].replace(2, 1, regex=True)
gtrain = gtrain[gtrain['is_useful'].notnull()]
ctrain = ctrain[ctrain['is_useful'].notnull()]

## Basil

btrain = pd.read_excel("data/basil_scoring.xls")[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
btrain['food'] = 'apple'

for i in range(len(btrain)):
    if btrain['is_useful'].loc[i] == 'x':
        btrain['is_useful'].loc[i] = 0
        
btrain['is_useful'] = btrain['is_useful'].replace(2, 1, regex=True)
btrain = btrain[btrain['is_useful'].notnull()]

## Apple

atrain = pd.read_excel("data/apple_scoring.xls")[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
atrain['food'] = 'apple'

for i in range(len(atrain)):
    if atrain['is_useful'].loc[i] == 'x':
        atrain['is_useful'].loc[i] = 0

atrain['is_useful'] = atrain['is_useful'].replace(2, 1, regex=True)
atrain = atrain[atrain['is_useful'].notnull()]
atrain = atrain[atrain['abstract'].notnull()]
atrain = atrain[atrain['PMID'].notnull()]

## Human Milk database

mtrain_new = pd.read_csv("mBase_15Aug_abstract[chemical_gen].csv")
mtrain_new['food'] = 'milk'
mtrain_new = mtrain_new[mtrain_new['abstract'].notnull()]
mtrain_new = mtrain_new[mtrain_new['PMID'].notnull()]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [6]:
print('Length of the databases for garlic, cocoa, basil, apple, milk: ', len(gtrain), len(ctrain), len(btrain), len(atrain), len(mtrain_new))

Length of the databases for garlic, cocoa, basil, apple, milk:  299 324 93 1653 229


In [7]:
print('Useful and non-useful: ',len(gtrain[gtrain['is_useful'] == 1.0]),len(gtrain[gtrain['is_useful'] == 0.0]))

Useful and non-useful:  77 222


In [8]:
print('Useful and non-useful: ',len(ctrain[ctrain['is_useful'] == 1.0]),len(ctrain[ctrain['is_useful'] == 0.0]))

Useful and non-useful:  93 231


In [9]:
print('Useful and non-useful: ',len(atrain[atrain['is_useful'] == 1.0]),len(atrain[atrain['is_useful'] == 0.0]))

Useful and non-useful:  462 1191


In [10]:
print('Useful and non-useful: ',len(btrain[btrain['is_useful'] == 1.0]),len(btrain[btrain['is_useful'] == 0.0]))

Useful and non-useful:  57 36


In [11]:
def build_all_features(ftrain,include_bigrams='FALSE'):
    fmodel_data = Filter()

    fmodel_data.build_features(input_data = ftrain,is_traindata = True)

    ftrain = feature_gen(ftrain,include_bigrams)

    fmodel_data.data['chem_ent_ratio'] = ftrain['chem_ent_ratio'].values
    fmodel_data.data['chem_term_count'] = ftrain['chem_term_count'].values
    fmodel_data.data['bigram_score'] = ftrain['bigram_score'].values
    
    return fmodel_data

In [12]:
print('----Starting feature generation----')
gdata = build_all_features(gtrain,include_bigrams='FALSE')
print('----DONE----')
cdata = build_all_features(ctrain,include_bigrams='FALSE')
print('----DONE----')
bdata = build_all_features(btrain,include_bigrams='FALSE')
print('----DONE----')

----Starting feature generation----
Creating features...


299it [02:20,  2.13it/s]
[W095] Model 'en_core_web_md' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.6.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate


----DONE----
Creating features...


324it [02:13,  2.43it/s]
[W095] Model 'en_core_web_md' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.6.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate


----DONE----
Creating features...


93it [00:38,  2.43it/s]
[W095] Model 'en_core_web_md' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.6.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate


----DONE----


In [13]:
print('----Starting feature generation----')
adata = build_all_features(atrain,include_bigrams='FALSE')
print('----DONE----')
mdata = build_all_features(mtrain_new,include_bigrams='TRUE')
print('----DONE----')

----Starting feature generation----
Creating features...


1653it [14:24,  1.91it/s]
[W095] Model 'en_core_web_md' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.6.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate


----DONE----
Creating features...


229it [01:16,  3.00it/s]
[W095] Model 'en_core_web_md' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.6.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate


----DONE----


In [68]:
gdata_features_class = gdata.data.copy()
cdata_features_class = cdata.data.copy()
bdata_features_class = bdata.data.copy()
adata_features_class = adata.data.copy()
mdata_features_class = mdata.data.copy()

In [69]:
gdata_features_class.to_csv('data_with_feature/gdata_features_class.csv')
cdata_features_class.to_csv('data_with_feature/cdata_features_class.csv')
bdata_features_class.to_csv('data_with_feature/bdata_features_class.csv')
adata_features_class.to_csv('data_with_feature/adata_features_class.csv')
mdata_features_class.to_csv('data_with_feature/mdata_features_class.csv')

In [70]:
gdata_features_class = pd.read_csv('data_with_feature/gdata_features_class.csv')
cdata_features_class = pd.read_csv('data_with_feature/cdata_features_class.csv')
bdata_features_class = pd.read_csv('data_with_feature/bdata_features_class.csv')
adata_features_class = pd.read_csv('data_with_feature/adata_features_class.csv')
mdata_features_class = pd.read_csv('data_with_feature/mdata_features_class.csv')

# Doc2vec trained on FoodBase

In [71]:
fmine = pd.read_csv('FoodBase_Abstracts_embeddings.csv')

In [72]:
embedding_list = []

for index, row in fmine.iterrows():
    local_list = []
    for x in row['vectors'].replace('[','').replace(']','').replace('\n',' ').replace('  ',' ').replace('   ',' ').split(' '):
        if x != '':
            local_list.append(float(x))
    embedding_list.append(local_list)
    
fmine['embeddings'] = embedding_list

In [73]:
abstract_embedding_dict = dict()

for index, row in fmine.iterrows():
    abstract_embedding_dict[row['abstract']] = row['embeddings']

In [74]:
print('Dimension of Doc2Vec: ', len(embedding_list[0]))

Dimension of Doc2Vec:  64


In [98]:
fmine_expanded = pd.concat([fmine, fmine['embeddings'].apply(pd.Series)], axis = 1)

In [99]:
gdata_train_doc2vec = pd.merge(gtrain, fmine_expanded[fmine_expanded['food'] == 'garlic'], on=['abstract'])
gdata_features_class_doc2vec = pd.merge(gdata_train_doc2vec, gdata_features_class, on=['PMID'])
gdata_features_class_doc2vec = gdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [100]:
cdata_train_doc2vec = pd.merge(ctrain, fmine_expanded[fmine_expanded['food'] == 'cocoa'], on=['abstract'])
cdata_features_class_doc2vec = pd.merge(cdata_train_doc2vec, cdata_features_class, on=['PMID'])
cdata_features_class_doc2vec = cdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [101]:
bdata_train_doc2vec = pd.merge(btrain, fmine_expanded[fmine_expanded['food'] == 'basil'], on=['abstract'])
bdata_features_class_doc2vec = pd.merge(bdata_train_doc2vec, bdata_features_class, on=['PMID'])
bdata_features_class_doc2vec = bdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [102]:
adata_train_doc2vec = pd.merge(atrain, fmine_expanded[fmine_expanded['food'] == 'apple'], on=['abstract'])
adata_features_class_doc2vec = pd.merge(adata_train_doc2vec, adata_features_class, on=['PMID'])
adata_features_class_doc2vec = adata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [103]:
mdata_train_doc2vec = pd.merge(mtrain_new, fmine_expanded[fmine_expanded['food'] == 'human milk'], on=['abstract'])
mdata_features_class_doc2vec = pd.merge(mdata_train_doc2vec, mdata_features_class, on=['PMID'])
mdata_features_class_doc2vec = mdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [104]:
mdata_features_class_doc2vec = mdata_features_class_doc2vec.drop(columns=['journal','mesh_UIds','qual_UIds','webpage','year','source','measmethod','chem_ent_ratio_x','chemicals','bigram_score_x','chem_term_count_x'])

In [105]:
mdata_features_class_doc2vec = mdata_features_class_doc2vec.rename(columns={'chem_ent_ratio_y':'chem_ent_ratio','chem_term_count_y':'chem_term_count','bigram_score_y':'bigram_score'})

# Normalize the features

In [106]:
def normalize_features(total_data):
    total_data_features = total_data.drop(columns=['class'])
    for col_feature in list(total_data_features.columns): 
        col_list = total_data_features[col_feature].tolist()
        if col_list != [0.0] * len(col_list):
            col_list_normalized = [(x-np.mean(col_list))/np.std(col_list) for x in col_list]
        else:
            col_list_normalized = col_list
        total_data_features[col_feature] = col_list_normalized
    # total_data_features_normalized = (total_data_features-total_data_features.mean())/total_data_features.std()
    # total_data_features_normalized = (total_data_features-total_data_features.min())/(total_data_features.max()-total_data_features.min())
    total_data_features['class'] = total_data['class'].tolist()
    
    return total_data_features

In [107]:
gdata_features_class_normalized = normalize_features(gdata_features_class)
cdata_features_class_normalized = normalize_features(cdata_features_class)
bdata_features_class_normalized = normalize_features(bdata_features_class)
adata_features_class_normalized = normalize_features(adata_features_class)
mdata_features_class_normalized = normalize_features(mdata_features_class)

In [108]:
fm1p0_columns = ['chromatography', 'food_term_count', 'gen_term_count', 'sci_term_count', 'spectrometry', 'spectrophotometry']

# Performances on seen food 

In [110]:
def xgboost_model(x,y,kfold,n_splits=10,fm1p0_columns=fm1p0_columns):

    cross_val_model_fm1 = RandomForestClassifier(max_depth=80,random_state=0)
    cross_val_model_fm2 = RandomForestClassifier(max_depth=80,random_state=0)
    
    kfold = StratifiedKFold(n_splits)
    
    auc_fm1 = []
    aup_fm1 = []
    f1_score_list_fm1 = []
    
    auc_fm2 = []
    aup_fm2 = []
    f1_score_list_fm2 = []
    
    for i,(train,test) in enumerate(kfold.split(x,y)):
        
        x_old = x[fm1p0_columns]
        
        cross_val_model_fm2.fit(x.loc[train],y.loc[train])
        cross_val_model_fm1.fit(x_old.loc[train],y.loc[train])
        
        #viz_fm1 = plot_roc_curve(cross_val_model_fm1, x_old.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        #viz_fm2 = plot_roc_curve(cross_val_model_fm2, x.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        
        y_predicted_fm1 = cross_val_model_fm1.predict(x_old.loc[test])
        y_predicted_fm2 = cross_val_model_fm2.predict(x.loc[test])
        
        auc_fm1.append(roc_auc_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        aup_fm1.append(average_precision_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        f1_score_list_fm1.append(f1_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        
        auc_fm2.append(roc_auc_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        aup_fm2.append(average_precision_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        f1_score_list_fm2.append(f1_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        # plot_confusion_matrix(cross_val_model,x.loc[test],y.loc[test])
        
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine1.0')
    print('Average and SD of AUROC: ', mean(auc_fm1), np.std(auc_fm1))
    print('Average and SD of AUPRC:', mean(aup_fm1), np.std(aup_fm1))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm1), np.std(f1_score_list_fm1))
    print('All AUROC values: ', auc_fm1)
    print('All AUPRC values: ', aup_fm1)
    print('All f1-Score values: ', f1_score_list_fm1)
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine2.0')
    print('Average and SD of AUROC: ', mean(auc_fm2), np.std(auc_fm2))
    print('Average and SD of AUPRC:', mean(aup_fm2), np.std(aup_fm2))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm2), np.std(f1_score_list_fm2))
    print('All AUROC values: ', auc_fm2)
    print('All AUPRC values: ', aup_fm2)
    print('All f1-Score values: ', f1_score_list_fm2)
    print('------------------------------------------------------------------------------------------------------')

    return mean(auc_fm2), mean(aup_fm2), mean(f1_score_list_fm2)

In [117]:
def get_cross_validation_seen_food(fdata_features_class_normalized):

    repeat = 1

    for _ in tqdm(range(repeat)):
        oversample = SMOTE()
        y = fdata_features_class_normalized.copy()['class']
        X = fdata_features_class_normalized.copy().drop('class', axis = 1)
        X = fdata_features_class_normalized.copy().drop('chemicals', axis = 1)
        X.columns = X.columns.astype(str)
        X_smote, y_smote = oversample.fit_resample(X, y)
        kfold = StratifiedKFold(n_splits=10)
        auc_fm2, aup_fm2, f1_fm2 = xgboost_model(X_smote, y_smote,kfold,n_splits=10)
        
    return 

In [118]:
## Garlic
get_cross_validation_seen_food(gdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.96s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7764822134387351 0.13656240883157408
Average and SD of AUPRC: 0.7356675611668636 0.11780240217446954
Average and SD of f1-Score: 0.7198902336956919 0.21598530465828072
All AUROC values:  [0.6521739130434783, 0.516798418972332, 0.6146245059288538, 0.708498023715415, 0.8636363636363635, 0.8863636363636364, 0.9090909090909092, 0.9090909090909092, 0.9090909090909091, 0.7954545454545454]
All AUPRC values:  [0.659903381642512, 0.5200702678963548, 0.5893217893217892, 0.6518716577540107, 0.7937062937062938, 0.8148148148148148, 0.8579545454545454, 0.8579545454545454, 0.8719008264462809, 0.7391774891774892]
All f1-Score values:  [0.4666666666666667, 0.3529411764705882, 0.41379310344827586, 0.6666666666666667, 0.875, 0.8979591836734693, 0.9130434782608695, 0.9130434782608695, 0.9090909090909091, 0.7906976744186046]
-----------------------------------------




In [119]:
## Cocoa
get_cross_validation_seen_food(cdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.34s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7192028985507246 0.1852934116292177
Average and SD of AUPRC: 0.6901133856464569 0.15447645115041672
Average and SD of f1-Score: 0.6344240302181479 0.2943318615643008
All AUROC values:  [0.5235507246376812, 0.5815217391304348, 0.4782608695652174, 0.41304347826086957, 0.891304347826087, 0.8260869565217391, 0.9130434782608695, 0.8478260869565217, 0.8695652173913044, 0.8478260869565217]
All AUPRC values:  [0.5037927844588344, 0.5704787234042553, 0.4901185770750988, 0.47101449275362317, 0.8557312252964426, 0.7536231884057971, 0.8933747412008282, 0.7809364548494984, 0.8011272141706924, 0.7809364548494984]
All f1-Score values:  [0.21428571428571427, 0.375, 0.29411764705882354, 0.22857142857142854, 0.888888888888889, 0.84, 0.909090909090909, 0.8571428571428572, 0.8800000000000001, 0.8571428571428572]
-----------------------------------------------------




In [120]:
## Basil
get_cross_validation_seen_food(bdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.92s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.5833333333333334 0.15509853498842457
Average and SD of AUPRC: 0.5783513708513709 0.11569540797589978
Average and SD of f1-Score: 0.5425574425574425 0.16970215608483222
All AUROC values:  [0.41666666666666663, 0.7500000000000002, 0.41666666666666663, 0.33333333333333337, 0.5499999999999999, 0.6500000000000001, 0.75, 0.8166666666666667, 0.5166666666666667, 0.6333333333333334]
All AUPRC values:  [0.4666666666666667, 0.6785714285714286, 0.4642857142857143, 0.4444444444444445, 0.5727272727272728, 0.6477272727272727, 0.7727272727272727, 0.730909090909091, 0.4636363636363636, 0.5418181818181818]
All f1-Score values:  [0.3636363636363636, 0.7692307692307692, 0.4615384615384615, 0.3333333333333333, 0.5454545454545454, 0.6, 0.6666666666666666, 0.8000000000000002, 0.28571428571428575, 0.6]
-------------------------------------------------------------------




In [121]:
## Apple
get_cross_validation_seen_food(adata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.75s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.5869257703081232 0.05668365058735479
Average and SD of AUPRC: 0.552239400494523 0.03615589989295591
Average and SD of f1-Score: 0.5974663616606625 0.09344150941431156
All AUROC values:  [0.5436274509803922, 0.5189075630252101, 0.5168067226890757, 0.5126050420168067, 0.6302521008403361, 0.6260504201680672, 0.680672268907563, 0.634453781512605, 0.6134453781512605, 0.592436974789916]
All AUPRC values:  [0.5219604947566032, 0.5119155903219939, 0.5087463556851312, 0.5065015479876162, 0.5784962991819245, 0.5773491214667685, 0.6167609901103299, 0.5813799203892083, 0.56686515665869, 0.5524185283869645]
All f1-Score values:  [0.5067873303167422, 0.5106382978723404, 0.4700460829493087, 0.45794392523364486, 0.674074074074074, 0.6454183266932272, 0.7142857142857144, 0.6789667896678967, 0.6592592592592592, 0.6572438162544169]
--------------------------------




In [122]:
## Milk
get_cross_validation_seen_food(mdata_features_class_doc2vec)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


KeyError: "['chemicals'] not found in axis"

# Performance on unseen food

In [123]:
def xgboost_model_unseen_food(x_other,y_other,x_food,y_food,kfold,n_splits=10,fm1p0_columns=fm1p0_columns):

    cross_val_model_fm1 = RandomForestClassifier(max_depth=80,random_state=0)
    cross_val_model_fm2 = RandomForestClassifier(max_depth=80,random_state=0)
    
    kfold = StratifiedKFold(n_splits)
    
    auc_fm1 = []
    aup_fm1 = []
    f1_score_list_fm1 = []
    
    auc_fm2 = []
    aup_fm2 = []
    f1_score_list_fm2 = []
    
    for i,(test_food,train_food) in enumerate(kfold.split(x_food,y_food)): # test set is kept larger here 
        
        x_other_old = x_other[fm1p0_columns]
        x_food_old = x_food[fm1p0_columns]
        
        cross_val_model_fm2.fit(pd.concat([x_other,x_food.loc[test_food]]),pd.concat([y_other,y_food.loc[test_food]]))
        cross_val_model_fm1.fit(pd.concat([x_other_old,x_food_old.loc[test_food]]),pd.concat([y_other,y_food.loc[test_food]]))
        
        #viz_fm1 = plot_roc_curve(cross_val_model_fm1, x_old.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        #viz_fm2 = plot_roc_curve(cross_val_model_fm2, x.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        
        y_predicted_fm1 = cross_val_model_fm1.predict(x_food_old.loc[train_food])
        y_predicted_fm2 = cross_val_model_fm2.predict(x_food.loc[train_food])
        
        auc_fm1.append(roc_auc_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        aup_fm1.append(average_precision_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        f1_score_list_fm1.append(f1_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        
        auc_fm2.append(roc_auc_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        aup_fm2.append(average_precision_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        f1_score_list_fm2.append(f1_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        # plot_confusion_matrix(cross_val_model,x.loc[test],y.loc[test])
        
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine1.0')
    print('Average and SD of AUROC: ', mean(auc_fm1), np.std(auc_fm1))
    print('Average and SD of AUPRC:', mean(aup_fm1), np.std(aup_fm1))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm1), np.std(f1_score_list_fm1))
    print('All AUROC values: ', auc_fm1)
    print('All AUPRC values: ', aup_fm1)
    print('All f1-Score values: ', f1_score_list_fm1)
    print('------------------------------------------------------------------------------------------------------')
    print('FoodMine2.0')
    print('Average and SD of AUROC: ', mean(auc_fm2), np.std(auc_fm2))
    print('Average and SD of AUPRC:', mean(aup_fm2), np.std(aup_fm2))
    print('Average and SD of f1-Score:', mean(f1_score_list_fm2), np.std(f1_score_list_fm2))
    print('All AUROC values: ', auc_fm2)
    print('All AUPRC values: ', aup_fm2)
    print('All f1-Score values: ', f1_score_list_fm2)
    print('------------------------------------------------------------------------------------------------------')

    return mean(auc_fm2), mean(aup_fm2), mean(f1_score_list_fm2)

In [130]:
other_foods

Unnamed: 0,chem_ent_ratio_x,chemicals,bigram_score_x,chem_term_count_x,0,1,2,3,4,5,...,class,spectrometry,chromatography,spectrophotometry,chem_ent_ratio_y,chem_term_count_y,bigram_score_y,chem_ent_ratio,chem_term_count,bigram_score
0,0.0,[],0.0,0.0,0.829169,0.982137,-0.094145,-2.726737,1.988802,-3.287893,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,0.0,[],0.0,0.0,1.398161,-0.680581,-0.014942,-0.365835,-0.685722,-1.055222,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,
2,0.0,[],0.0,0.0,-0.304260,-1.165422,-0.358080,-2.827604,1.150531,-0.974158,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,0.0,[],0.0,0.0,-1.008594,-0.866767,-0.076545,-1.335826,0.230017,-1.727068,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,,
4,0.0,[],0.0,0.0,-2.908859,-1.621425,-3.386691,0.013008,0.014096,0.085421,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,,,,,1.930623,0.606643,-1.623046,-2.966842,0.356345,-2.050584,...,1.0,0.0,0.0,0.0,,,,0.0,0.0,0.6
231,,,,,-0.175221,-0.166890,-0.695077,-1.659369,1.041467,-0.973924,...,1.0,0.0,0.0,0.0,,,,0.0,0.0,0.5
232,,,,,-0.999931,-0.205728,-2.001056,-0.408468,1.569245,-1.439709,...,1.0,0.0,0.0,0.0,,,,0.0,0.0,0.0
233,,,,,3.860390,2.208683,-2.119556,1.693405,-0.015089,-3.426805,...,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0


In [131]:
def get_cross_validation_unseen_food(other_data_features_class,fdata_features_class):

    repeat = 1

    for _ in tqdm(range(repeat)):
        oversample = SMOTE()
        y_other = other_data_features_class.copy()['class']
        X_other = other_data_features_class.copy().drop('class', axis = 1)
        X_other = other_data_features_class.copy().drop('chemicals', axis = 1)
        X_other.columns = X_other.columns.astype(str)
        X_other_smote, y_other_smote = oversample.fit_resample(X_other, y_other)
        y_food = fdata_features_class.copy()['class']
        X_food = fdata_features_class.copy().drop('class', axis = 1)
        X_food = fdata_features_class.copy().drop('chemicals', axis = 1)
        X_food.columns = X_food.columns.astype(str)
        X_food_smote, y_food_smote = oversample.fit_resample(X_food, y_food)
        kfold = StratifiedKFold(n_splits=10)
        auc_fm2, aup_fm2, f1_fm2 = xgboost_model_unseen_food(X_other_smote, y_other_smote,X_food_smote, y_food_smote,kfold,n_splits=10)
        
    return 

In [132]:
## Garlic
other_foods = pd.concat([cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,gdata_features_class_doc2vec)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [53]:
## Cocoa
other_foods = pd.concat([gdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,cdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.79s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7455615942028986 0.21070890790249047
Average and SD of AUPRC: 0.7195966006789318 0.18265392834404642
Average and SD of f1-Score: 0.6150366104522164 0.3776704566202394
All AUROC values:  [0.5226449275362319, 0.4547101449275362, 0.4782608695652174, 0.49999999999999994, 0.891304347826087, 0.9347826086956521, 0.9347826086956522, 0.9130434782608696, 0.891304347826087, 0.9347826086956522]
All AUPRC values:  [0.5047795251310515, 0.49586288416075647, 0.4927536231884058, 0.5, 0.8311036789297659, 0.8846153846153846, 0.8985507246376812, 0.8518518518518519, 0.8214285714285714, 0.9150197628458498]
All f1-Score values:  [0.15384615384615383, 0.13333333333333333, 0.07692307692307691, 0.25806451612903225, 0.8979591836734695, 0.9387755102040816, 0.9361702127659574, 0.92, 0.9019607843137255, 0.9333333333333332]
----------------------------------------------------




In [54]:
## Basil
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,bdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.37s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.3816666666666667 0.2068883219946882
Average and SD of AUPRC: 0.5056998556998556 0.06694085093847828
Average and SD of f1-Score: 0.21746031746031746 0.16361337770663945
All AUROC values:  [0.5833333333333334, 0.5833333333333334, 0.5, 0.5833333333333334, 0.4833333333333334, 0.5, 0.2, 0.2833333333333333, 0.1, 0.0]
All AUPRC values:  [0.5833333333333334, 0.5555555555555556, 0.5, 0.5555555555555556, 0.5378787878787878, 0.5454545454545454, 0.5454545454545454, 0.387012987012987, 0.3922077922077922, 0.45454545454545453]
All f1-Score values:  [0.2857142857142857, 0.4444444444444444, 0.25, 0.4444444444444444, 0.25, 0.0, 0.0, 0.3333333333333333, 0.16666666666666666, 0.0]
------------------------------------------------------------------------------------------------------
FoodMine2.0
Average and SD of AUROC:  0.515 0.11016401913107159
Average and SD of AUP




In [55]:
## Apple
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,adata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.15s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.7541981792717086 0.20173174640334612
Average and SD of AUPRC: 0.7331111183790664 0.18376070506257436
Average and SD of f1-Score: 0.6179401408230707 0.3692218482255413
All AUROC values:  [0.5045168067226891, 0.49964985994397754, 0.47478991596638653, 0.5546218487394958, 0.9201680672268907, 0.9159663865546218, 0.9411764705882353, 0.9117647058823529, 0.903361344537815, 0.9159663865546218]
All AUPRC values:  [0.5002951632095252, 0.5019177126917712, 0.4903038138332256, 0.54421768707483, 0.8881213502350092, 0.8839689722042663, 0.9152249134948097, 0.8685958254269449, 0.8657598632673409, 0.8727058823529412]
All f1-Score values:  [0.14492753623188406, 0.14285714285714285, 0.13793103448275865, 0.24285714285714285, 0.9198312236286919, 0.9152542372881355, 0.9411764705882353, 0.9135802469135801, 0.9029535864978903, 0.9180327868852459]
------------------------




In [56]:
## Milk
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,mdata_features_class_doc2vec)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.79s/it]

------------------------------------------------------------------------------------------------------
FoodMine1.0
Average and SD of AUROC:  0.4926282051282051 0.06966344494876907
Average and SD of AUPRC: 0.5162673992673993 0.031023000480114728
Average and SD of f1-Score: 0.09528822055137846 0.0633249372331306
All AUROC values:  [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0.5, 0.5, 0.5, 0.5384615384615384, 0.4230769230769231, 0.31089743589743585]
All AUPRC values:  [0.5384615384615385, 0.5384615384615385, 0.5384615384615385, 0.5384615384615385, 0.5, 0.5, 0.52, 0.556923076923077, 0.48, 0.45190476190476186]
All f1-Score values:  [0.14285714285714288, 0.14285714285714288, 0.14285714285714288, 0.14285714285714288, 0.0, 0.13333333333333336, 0.0, 0.14285714285714288, 0.0, 0.10526315789473685]
------------------------------------------------------------------------------------------------------
FoodMine2.0
Average and SD of AUROC:  0.6820512820512821 0.088


