In [37]:
import pandas as pd
import numpy as np
import importlib
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score,average_precision_score,f1_score,plot_confusion_matrix
from xgboost import XGBClassifier
import shap
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_roc_curve
from statistics import mean
from sklearn.ensemble import RandomForestClassifier
import re
import pickle
from matplotlib import pyplot
import time
import scispacy
import spacy
import nltk
from tqdm import tqdm

# Functions to generate features

In [2]:
from nltk.corpus import stopwords
import string
#nltk.download('punkt')
#nltk.download('stopwords')

def feature_gen(dataframe,include_bigrams='FALSE'):
    
    meas_method_column_vals = []
    entity_nlp = spacy.load('en_core_sci_sm')
    chemical_disease_nlp = spacy.load('en_ner_bc5cdr_md')
    chem_ent_ratios, seen_chem, chem_count = [], [], []
    human_bigram = []    
    for index, row in dataframe.iterrows():
        abstract = row['abstract']
    #   ------------ Chemical Names ----------------
        entity_doc = entity_nlp(abstract)
        chemical_disease_doc = chemical_disease_nlp(abstract)
        chemical_ents = [ent.text for ent in chemical_disease_doc.ents if ent.label_ == 'CHEMICAL']
        if len(entity_doc.ents) == 0:
            chem_ent_ratios.append(0)
        else:
            chem_ent_ratios.append(len(chemical_ents) / len(entity_doc.ents))
        seen_chem.append(list(set(chemical_ents)))
        chem_count.append(len(chemical_ents))
    #   ------------ Bigram Score --------------
        if include_bigrams == 'TRUE':
            tokens = nltk.word_tokenize(abstract)
            bigrams = nltk.bigrams(tokens)
            stopset = set(stopwords.words('english') + list(string.punctuation))
            milk_bigrams = [(w1, w2) for w1, w2 in bigrams if 
                            (w1.lower() == 'milk' or w2.lower() == 'milk')
                             and (w1.lower() not in stopset and w2.lower() not in stopset)]
            human_bigrams = [(w1, w2) for w1, w2 in milk_bigrams if 
                             (w1.lower() == 'human' or w2.lower() == 'human')
                             and (w1.lower() not in stopset and w2.lower() not in stopset)]
            human_bigram.append(len(human_bigrams) / len(milk_bigrams) if len(milk_bigrams) != 0 else 0)
        else:
            human_bigram.append(0.0)

    dataframe['chem_ent_ratio'] = chem_ent_ratios
    dataframe['chemicals'] = seen_chem
    dataframe['bigram_score'] = human_bigram
    dataframe['chem_term_count'] = chem_count
    
    return dataframe

In [2]:
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 150

def clean_plot(leg=True, grid=None, font=None):
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    
    axis_color = 'lightgrey'
    ax.spines['bottom'].set_color(axis_color)
    ax.spines['left'].set_color(axis_color)
    ax.tick_params(axis='both', color=axis_color)
    
    if leg:
        ax.legend(frameon = False, loc='upper left', bbox_to_anchor=(1, 1))
        
    if grid is not None:
        plt.grid(color='lightgrey', axis = grid, linestyle='-', linewidth=.5)
        
    if font is not None:
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
            ax.get_xticklabels() + ax.get_yticklabels()):
            
            item.set_fontfamily(font['family'])
            item.set_color(font['color'])
            

In [4]:
from src.filter import Filter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading datasets and generating features

In [10]:
## Garlic and Cocoa

gtrain = pd.read_csv("data/garlic_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
gtrain['food'] = 'garlic'
ctrain = pd.read_csv("data/cocoa_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
ctrain['food'] = 'cocoa'
gtrain['is_useful'] = gtrain['is_useful'].replace(2, 1, regex=True)
ctrain['is_useful'] = ctrain['is_useful'].replace(2, 1, regex=True)
gtrain = gtrain[gtrain['is_useful'].notnull()]
ctrain = ctrain[ctrain['is_useful'].notnull()]

## Basil

btrain = pd.read_excel("data/basil_scoring.xls", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
btrain['food'] = 'apple'

for i in range(len(btrain)):
    if btrain['is_useful'].loc[i] == 'x':
        btrain['is_useful'].loc[i] = 0
        
btrain['is_useful'] = btrain['is_useful'].replace(2, 1, regex=True)
btrain = btrain[btrain['is_useful'].notnull()]

## Apple

atrain = pd.read_excel("data/apple_scoring.xls", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
atrain['food'] = 'apple'

for i in range(len(atrain)):
    if atrain['is_useful'].loc[i] == 'x':
        atrain['is_useful'].loc[i] = 0

atrain['is_useful'] = atrain['is_useful'].replace(2, 1, regex=True)
atrain = atrain[atrain['is_useful'].notnull()]
atrain = atrain[atrain['abstract'].notnull()]
atrain = atrain[atrain['PMID'].notnull()]

## Human Milk database

mtrain_new = pd.read_csv("mBase_15Aug_abstract[chemical_gen].csv")
mtrain_new['food'] = 'milk'
mtrain_new = mtrain_new[mtrain_new['abstract'].notnull()]
mtrain_new = mtrain_new[mtrain_new['PMID'].notnull()]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
print('Length of the databases for garlic, cocoa, basil, apple, milk: ', len(gtrain), len(ctrain), len(btrain), len(atrain), len(mtrain_new))

Length of the databases for garlic, cocoa, basil, apple, milk:  299 324 93 1653 229


In [7]:
print('Useful and non-useful: ',len(gtrain[gtrain['is_useful'] == 1.0]),len(gtrain[gtrain['is_useful'] == 0.0]))

Useful and non-useful:  77 222


In [8]:
print('Useful and non-useful: ',len(ctrain[ctrain['is_useful'] == 1.0]),len(ctrain[ctrain['is_useful'] == 0.0]))

Useful and non-useful:  93 231


In [9]:
print('Useful and non-useful: ',len(atrain[atrain['is_useful'] == 1.0]),len(atrain[atrain['is_useful'] == 0.0]))

Useful and non-useful:  462 1191


In [309]:
print('Useful and non-useful: ',len(btrain[btrain['is_useful'] == 1.0]),len(btrain[btrain['is_useful'] == 0.0]))

Useful and non-useful:  57 36


In [257]:
def build_all_features(ftrain,include_bigrams='FALSE'):
    fmodel_data = Filter()

    fmodel_data.build_features(input_data = ftrain,is_traindata = True)

    ftrain = feature_gen(ftrain,include_bigrams)

    fmodel_data.data['chem_ent_ratio'] = ftrain['chem_ent_ratio'].values
    fmodel_data.data['chem_term_count'] = ftrain['chem_term_count'].values
    fmodel_data.data['bigram_score'] = ftrain['bigram_score'].values
    
    return fmodel_data

In [260]:
print('----Starting feature generation----')
gdata = build_all_features(gtrain,include_bigrams='FALSE')
print('----DONE----')
cdata = build_all_features(ctrain,include_bigrams='FALSE')
print('----DONE----')
bdata = build_all_features(btrain,include_bigrams='FALSE')
print('----DONE----')

----Starting feature generation----




0it [00:00, ?it/s][A[A



Creating features...


1it [00:00,  4.79it/s][A[A

2it [00:00,  3.97it/s][A[A

3it [00:00,  4.32it/s][A[A

4it [00:00,  4.50it/s][A[A

5it [00:01,  4.80it/s][A[A

6it [00:01,  4.61it/s][A[A

7it [00:01,  5.04it/s][A[A

8it [00:01,  5.15it/s][A[A

9it [00:01,  4.64it/s][A[A

10it [00:02,  4.01it/s][A[A

11it [00:02,  4.18it/s][A[A

12it [00:02,  3.83it/s][A[A

13it [00:03,  4.01it/s][A[A

14it [00:03,  3.81it/s][A[A

15it [00:03,  4.35it/s][A[A

16it [00:03,  4.54it/s][A[A

17it [00:03,  4.16it/s][A[A

18it [00:04,  5.01it/s][A[A

20it [00:04,  5.12it/s][A[A

21it [00:04,  5.56it/s][A[A

22it [00:04,  4.23it/s][A[A

23it [00:05,  4.61it/s][A[A

24it [00:05,  5.35it/s][A[A

25it [00:05,  4.15it/s][A[A

26it [00:05,  3.91it/s][A[A

27it [00:06,  3.51it/s][A[A

28it [00:06,  3.19it/s][A[A

29it [00:06,  3.77it/s][A[A

30it [00:07,  3.96it/s][A[A

31it [00:07,  3.67it/s][A[A

32it [00:07,  3.82it/s][A[A

33it [00:07,  4.07it/s][A[A

34it [00:08,  4.0

----DONE----




0it [00:00, ?it/s][A[A

Creating features...




1it [00:00,  3.73it/s][A[A

2it [00:00,  3.72it/s][A[A

3it [00:00,  3.94it/s][A[A

4it [00:01,  3.84it/s][A[A

5it [00:01,  4.06it/s][A[A

6it [00:01,  3.95it/s][A[A

7it [00:01,  3.78it/s][A[A

8it [00:02,  3.90it/s][A[A

9it [00:02,  3.44it/s][A[A

10it [00:02,  3.35it/s][A[A

11it [00:02,  3.54it/s][A[A

12it [00:03,  3.38it/s][A[A

13it [00:03,  3.25it/s][A[A

14it [00:03,  3.39it/s][A[A

15it [00:04,  3.36it/s][A[A

16it [00:04,  2.85it/s][A[A

17it [00:04,  3.23it/s][A[A

18it [00:05,  3.34it/s][A[A

19it [00:05,  3.43it/s][A[A

20it [00:05,  3.45it/s][A[A

21it [00:06,  3.47it/s][A[A

22it [00:06,  3.47it/s][A[A

23it [00:06,  3.46it/s][A[A

24it [00:06,  3.22it/s][A[A

25it [00:07,  3.37it/s][A[A

26it [00:07,  3.25it/s][A[A

27it [00:07,  3.23it/s][A[A

28it [00:08,  3.36it/s][A[A

29it [00:08,  3.25it/s][A[A

30it [00:08,  2.99it/s][A[A

31it [00:09,  3.38it/s][A[A

32it [00:09,  3.33it/s][A[A

33it [00:09,  3

----DONE----




0it [00:00, ?it/s][A[A

Creating features...




1it [00:00,  2.91it/s][A[A

2it [00:00,  2.69it/s][A[A

3it [00:01,  2.97it/s][A[A

4it [00:01,  2.85it/s][A[A

5it [00:01,  2.80it/s][A[A

6it [00:02,  2.93it/s][A[A

7it [00:02,  2.79it/s][A[A

8it [00:02,  2.80it/s][A[A

9it [00:03,  2.72it/s][A[A

10it [00:03,  2.88it/s][A[A

11it [00:04,  2.45it/s][A[A

12it [00:04,  2.51it/s][A[A

13it [00:04,  2.47it/s][A[A

14it [00:05,  2.28it/s][A[A

15it [00:05,  2.25it/s][A[A

16it [00:06,  2.42it/s][A[A

17it [00:06,  2.51it/s][A[A

18it [00:06,  2.98it/s][A[A

19it [00:06,  3.71it/s][A[A

20it [00:07,  4.33it/s][A[A

21it [00:07,  3.83it/s][A[A

22it [00:07,  3.90it/s][A[A

23it [00:08,  3.10it/s][A[A

24it [00:08,  2.55it/s][A[A

25it [00:08,  2.65it/s][A[A

26it [00:09,  2.94it/s][A[A

27it [00:09,  3.10it/s][A[A

28it [00:09,  3.67it/s][A[A

29it [00:09,  4.32it/s][A[A

30it [00:10,  3.08it/s][A[A

31it [00:10,  2.53it/s][A[A

32it [00:11,  2.50it/s][A[A

33it [00:11,  2

----DONE----


In [261]:
print('----Starting feature generation----')
adata = build_all_features(atrain,include_bigrams='FALSE')
print('----DONE----')
mdata = build_all_features(mtrain_new,include_bigrams='TRUE')
print('----DONE----')

----Starting feature generation----




0it [00:00, ?it/s][A[A

1it [00:00,  6.15it/s][A[A

Creating features...




2it [00:00,  5.51it/s][A[A

3it [00:00,  5.60it/s][A[A

4it [00:00,  5.31it/s][A[A

5it [00:00,  5.18it/s][A[A

6it [00:01,  5.26it/s][A[A

7it [00:01,  4.76it/s][A[A

8it [00:01,  4.80it/s][A[A

9it [00:01,  4.74it/s][A[A

10it [00:02,  4.95it/s][A[A

11it [00:02,  5.79it/s][A[A

12it [00:02,  5.54it/s][A[A

13it [00:02,  4.37it/s][A[A

14it [00:02,  4.19it/s][A[A

15it [00:03,  3.55it/s][A[A

16it [00:03,  3.39it/s][A[A

18it [00:03,  3.87it/s][A[A

19it [00:04,  4.03it/s][A[A

20it [00:04,  4.35it/s][A[A

21it [00:04,  3.84it/s][A[A

22it [00:04,  4.12it/s][A[A

23it [00:05,  4.36it/s][A[A

24it [00:05,  4.62it/s][A[A

25it [00:05,  5.16it/s][A[A

26it [00:05,  4.40it/s][A[A

27it [00:06,  4.22it/s][A[A

28it [00:06,  4.28it/s][A[A

29it [00:06,  4.09it/s][A[A

30it [00:06,  4.00it/s][A[A

31it [00:07,  3.86it/s][A[A

32it [00:07,  3.43it/s][A[A

33it [00:07,  3.44it/s][A[A

34it [00:07,  3.57it/s][A[A

35it [00:08,  

518it [02:26,  4.19it/s][A[A

519it [02:26,  3.94it/s][A[A

520it [02:26,  3.57it/s][A[A

521it [02:27,  3.76it/s][A[A

522it [02:27,  3.41it/s][A[A

523it [02:27,  3.28it/s][A[A

524it [02:27,  3.52it/s][A[A

525it [02:28,  3.56it/s][A[A

526it [02:28,  3.46it/s][A[A

527it [02:28,  3.81it/s][A[A

528it [02:28,  4.40it/s][A[A

529it [02:29,  4.58it/s][A[A

530it [02:29,  3.96it/s][A[A

531it [02:29,  3.78it/s][A[A

532it [02:30,  3.29it/s][A[A

533it [02:30,  3.12it/s][A[A

534it [02:30,  3.09it/s][A[A

535it [02:31,  3.06it/s][A[A

536it [02:31,  3.29it/s][A[A

537it [02:31,  3.17it/s][A[A

538it [02:31,  3.25it/s][A[A

539it [02:32,  3.53it/s][A[A

540it [02:32,  3.74it/s][A[A

541it [02:32,  3.64it/s][A[A

542it [02:32,  3.71it/s][A[A

543it [02:33,  3.78it/s][A[A

544it [02:33,  3.39it/s][A[A

545it [02:33,  3.60it/s][A[A

546it [02:34,  3.34it/s][A[A

547it [02:34,  3.67it/s][A[A

548it [02:34,  3.30it/s][A[A

549it [0

1029it [05:09,  2.50it/s][A[A

1030it [05:10,  2.49it/s][A[A

1031it [05:10,  2.66it/s][A[A

1032it [05:10,  2.78it/s][A[A

1033it [05:11,  2.79it/s][A[A

1034it [05:11,  2.77it/s][A[A

1035it [05:12,  2.43it/s][A[A

1036it [05:12,  2.51it/s][A[A

1037it [05:12,  3.04it/s][A[A

1038it [05:13,  3.02it/s][A[A

1039it [05:13,  3.08it/s][A[A

1040it [05:13,  3.04it/s][A[A

1041it [05:14,  2.96it/s][A[A

1042it [05:14,  2.68it/s][A[A

1043it [05:14,  2.48it/s][A[A

1044it [05:15,  2.46it/s][A[A

1045it [05:15,  2.63it/s][A[A

1046it [05:16,  2.83it/s][A[A

1047it [05:16,  2.96it/s][A[A

1048it [05:16,  3.02it/s][A[A

1049it [05:16,  2.99it/s][A[A

1050it [05:17,  3.47it/s][A[A

1051it [05:17,  3.42it/s][A[A

1052it [05:17,  3.66it/s][A[A

1053it [05:18,  3.44it/s][A[A

1054it [05:18,  3.63it/s][A[A

1055it [05:18,  3.36it/s][A[A

1056it [05:19,  2.80it/s][A[A

1057it [05:19,  2.79it/s][A[A

1058it [05:19,  3.04it/s][A[A

1059it [05

1526it [07:39,  3.27it/s][A[A

1527it [07:39,  3.23it/s][A[A

1528it [07:40,  3.89it/s][A[A

1529it [07:40,  3.56it/s][A[A

1530it [07:40,  3.14it/s][A[A

1531it [07:41,  3.48it/s][A[A

1532it [07:41,  3.27it/s][A[A

1533it [07:41,  3.44it/s][A[A

1534it [07:41,  3.55it/s][A[A

1535it [07:42,  3.84it/s][A[A

1536it [07:42,  3.75it/s][A[A

1537it [07:42,  4.45it/s][A[A

1538it [07:42,  4.47it/s][A[A

1539it [07:43,  4.48it/s][A[A

1540it [07:43,  3.77it/s][A[A

1541it [07:43,  3.83it/s][A[A

1542it [07:43,  3.72it/s][A[A

1543it [07:44,  4.25it/s][A[A

1544it [07:44,  3.62it/s][A[A

1545it [07:44,  3.57it/s][A[A

1546it [07:44,  4.33it/s][A[A

1547it [07:45,  3.56it/s][A[A

1548it [07:45,  4.27it/s][A[A

1549it [07:45,  4.08it/s][A[A

1550it [07:45,  4.27it/s][A[A

1551it [07:46,  3.96it/s][A[A

1552it [07:46,  4.14it/s][A[A

1553it [07:46,  4.39it/s][A[A

1554it [07:46,  3.65it/s][A[A

1555it [07:47,  3.86it/s][A[A

1556it [07

----DONE----




0it [00:00, ?it/s][A[A

Creating features...




1it [00:00,  2.58it/s][A[A

2it [00:00,  3.11it/s][A[A

3it [00:00,  3.69it/s][A[A

4it [00:00,  3.68it/s][A[A

5it [00:01,  3.57it/s][A[A

6it [00:01,  3.32it/s][A[A

7it [00:01,  3.70it/s][A[A

9it [00:02,  4.41it/s][A[A

10it [00:02,  4.72it/s][A[A

11it [00:02,  4.52it/s][A[A

12it [00:02,  4.52it/s][A[A

14it [00:03,  4.46it/s][A[A

15it [00:03,  4.37it/s][A[A

16it [00:03,  3.37it/s][A[A

17it [00:04,  3.92it/s][A[A

18it [00:04,  3.61it/s][A[A

19it [00:04,  3.32it/s][A[A

20it [00:04,  3.94it/s][A[A

21it [00:05,  4.55it/s][A[A

22it [00:05,  4.37it/s][A[A

23it [00:05,  4.23it/s][A[A

24it [00:05,  3.94it/s][A[A

25it [00:06,  3.54it/s][A[A

26it [00:06,  3.72it/s][A[A

27it [00:06,  3.44it/s][A[A

28it [00:06,  3.65it/s][A[A

29it [00:07,  3.55it/s][A[A

30it [00:07,  3.84it/s][A[A

31it [00:07,  3.95it/s][A[A

32it [00:07,  4.44it/s][A[A

33it [00:08,  3.63it/s][A[A

34it [00:08,  3.19it/s][A[A

35it [00:08,  

----DONE----


In [262]:
gdata_features_class = gdata.data.copy()
cdata_features_class = cdata.data.copy()
bdata_features_class = bdata.data.copy()
adata_features_class = adata.data.copy()
mdata_features_class = mdata.data.copy()

In [263]:
gdata_features_class.to_csv('data_with_feature/gdata_features_class.csv')
cdata_features_class.to_csv('data_with_feature/cdata_features_class.csv')
bdata_features_class.to_csv('data_with_feature/bdata_features_class.csv')
adata_features_class.to_csv('data_with_feature/adata_features_class.csv')
mdata_features_class.to_csv('data_with_feature/mdata_features_class.csv')

In [3]:
gdata_features_class = pd.read_csv('data_with_feature/gdata_features_class.csv')
cdata_features_class = pd.read_csv('data_with_feature/cdata_features_class.csv')
bdata_features_class = pd.read_csv('data_with_feature/bdata_features_class.csv')
adata_features_class = pd.read_csv('data_with_feature/adata_features_class.csv')
mdata_features_class = pd.read_csv('data_with_feature/mdata_features_class.csv')

# Doc2vec trained on FoodBase

In [4]:
fmine = pd.read_csv('FoodBase_Abstracts_embeddings.csv')

In [5]:
embedding_list = []

for index, row in fmine.iterrows():
    local_list = []
    for x in row['vectors'].replace('[','').replace(']','').replace('\n',' ').replace('  ',' ').replace('   ',' ').split(' '):
        if x != '':
            local_list.append(float(x))
    embedding_list.append(local_list)
    
fmine['embeddings'] = embedding_list

In [6]:
abstract_embedding_dict = dict()

for index, row in fmine.iterrows():
    abstract_embedding_dict[row['abstract']] = row['embeddings']

In [7]:
print('Dimension of Doc2Vec: ', len(embedding_list[0]))

Dimension of Doc2Vec:  64


In [8]:
fmine_expanded = pd.concat([fmine, fmine['embeddings'].apply(pd.Series)], axis = 1)

In [11]:
gdata_train_doc2vec = pd.merge(gtrain, fmine_expanded[fmine_expanded['food'] == 'garlic'], on=['abstract'])
gdata_features_class_doc2vec = pd.merge(gdata_train_doc2vec, gdata_features_class, on=['PMID'])
gdata_features_class_doc2vec = gdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [12]:
cdata_train_doc2vec = pd.merge(ctrain, fmine_expanded[fmine_expanded['food'] == 'cocoa'], on=['abstract'])
cdata_features_class_doc2vec = pd.merge(cdata_train_doc2vec, cdata_features_class, on=['PMID'])
cdata_features_class_doc2vec = cdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [13]:
bdata_train_doc2vec = pd.merge(btrain, fmine_expanded[fmine_expanded['food'] == 'basil'], on=['abstract'])
bdata_features_class_doc2vec = pd.merge(bdata_train_doc2vec, bdata_features_class, on=['PMID'])
bdata_features_class_doc2vec = bdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [14]:
adata_train_doc2vec = pd.merge(atrain, fmine_expanded[fmine_expanded['food'] == 'apple'], on=['abstract'])
adata_features_class_doc2vec = pd.merge(adata_train_doc2vec, adata_features_class, on=['PMID'])
adata_features_class_doc2vec = adata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [15]:
mdata_train_doc2vec = pd.merge(mtrain_new, fmine_expanded[fmine_expanded['food'] == 'human milk'], on=['abstract'])
mdata_features_class_doc2vec = pd.merge(mdata_train_doc2vec, mdata_features_class, on=['PMID'])
mdata_features_class_doc2vec = mdata_features_class_doc2vec.drop(columns=['PMID','abstract','paper','mesh_terms','qual_terms','is_useful_x','food_x','is_useful_y','food_y','vectors','embeddings'])

In [16]:
mdata_features_class_doc2vec = mdata_features_class_doc2vec.drop(columns=['journal','mesh_UIds','qual_UIds','webpage','year','source','measmethod','chem_ent_ratio_x','chemicals','bigram_score_x','chem_term_count_x'])

In [17]:
mdata_features_class_doc2vec = mdata_features_class_doc2vec.rename(columns={'chem_ent_ratio_y':'chem_ent_ratio','chem_term_count_y':'chem_term_count','bigram_score_y':'bigram_score'})

# Normalize the features

In [281]:
def normalize_features(total_data):
    total_data_features = total_data.drop(columns=['class'])
    for col_feature in list(total_data_features.columns): 
        col_list = total_data_features[col_feature].tolist()
        if col_list != [0.0] * len(col_list):
            col_list_normalized = [(x-np.mean(col_list))/np.std(col_list) for x in col_list]
        else:
            col_list_normalized = col_list
        total_data_features[col_feature] = col_list_normalized
    # total_data_features_normalized = (total_data_features-total_data_features.mean())/total_data_features.std()
    # total_data_features_normalized = (total_data_features-total_data_features.min())/(total_data_features.max()-total_data_features.min())
    total_data_features['class'] = total_data['class'].tolist()
    
    return total_data_features

In [282]:
gdata_features_class_normalized = normalize_features(gdata_features_class)
cdata_features_class_normalized = normalize_features(cdata_features_class)
bdata_features_class_normalized = normalize_features(bdata_features_class)
adata_features_class_normalized = normalize_features(adata_features_class)
mdata_features_class_normalized = normalize_features(mdata_features_class)

# Performances on seen food 

In [21]:
def xgboost_model(x,y,kfold,n_splits=10):

    cross_val_model_fm1 = RandomForestClassifier(max_depth=80,random_state=0)
    cross_val_model_fm2 = RandomForestClassifier(max_depth=80,random_state=0)
    
    kfold = StratifiedKFold(n_splits)
    
    auc_fm1 = []
    aup_fm1 = []
    f1_score_list_fm1 = []
    
    auc_fm2 = []
    aup_fm2 = []
    f1_score_list_fm2 = []
    
    for i,(train,test) in enumerate(kfold.split(x,y)):
        
        x_old = x.drop(columns=['chem_ent_ratio','chem_term_count','bigram_score'])
        
        cross_val_model_fm2.fit(x.loc[train],y.loc[train])
        cross_val_model_fm1.fit(x_old.loc[train],y.loc[train])
        
        #viz_fm1 = plot_roc_curve(cross_val_model_fm1, x_old.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        #viz_fm2 = plot_roc_curve(cross_val_model_fm2, x.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        
        y_predicted_fm1 = cross_val_model_fm1.predict(x_old.loc[test])
        y_predicted_fm2 = cross_val_model_fm2.predict(x.loc[test])
        
        auc_fm1.append(roc_auc_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        aup_fm1.append(average_precision_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        f1_score_list_fm1.append(f1_score(np.array(y.loc[test].tolist()),y_predicted_fm1))
        
        auc_fm2.append(roc_auc_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        aup_fm2.append(average_precision_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        f1_score_list_fm2.append(f1_score(np.array(y.loc[test].tolist()),y_predicted_fm2))
        # plot_confusion_matrix(cross_val_model,x.loc[test],y.loc[test])
        
    print('------------------------------------------------------------------------------------------------------')
    #print('Average AUC for FM1 and FM2: ', mean(auc_fm1), mean(auc_fm2))
    #print('Average AUP for FM1 and FM2:', mean(aup_fm1), mean(aup_fm2))
    #print('Average f1 Socre FM1 and FM2:', mean(f1_score_list_fm1), mean(f1_score_list_fm2))
    print('Average and SD of AUC for FM2: ', mean(auc_fm2), np.std(auc_fm2))
    print('Average and SD of AUP for FM2:', mean(aup_fm2), np.std(aup_fm2))
    print('Average and SD of f1 Socre FM2:', mean(f1_score_list_fm2), np.std(f1_score_list_fm2))
    print('------------------------------------------------------------------------------------------------------')

    return mean(auc_fm2), mean(aup_fm2), mean(f1_score_list_fm2)

In [39]:
def get_cross_validation_seen_food(fdata_features_class_normalized):

    repeat = 10

    for _ in tqdm(range(repeat)):
        oversample = SMOTE()
        y = fdata_features_class_normalized.copy()['class']
        X = fdata_features_class_normalized.copy().drop('class', axis = 1)
        X_smote, y_smote = oversample.fit_resample(X, y)
        kfold = StratifiedKFold(n_splits=10)
        auc_fm2, aup_fm2, f1_fm2 = xgboost_model(X_smote, y_smote,kfold,n_splits=10)
        
    return 

In [40]:
## Garlic
get_cross_validation_seen_food(gdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:03<00:34,  3.88s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.874802371541502 0.0835620822171437
Average and SD of AUP for FM2: 0.8326744802213237 0.09089243739469068
Average and SD of f1 Socre FM2: 0.8699810756519089 0.09416671695220363
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [00:07<00:31,  3.94s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8898221343873518 0.07561354957943221
Average and SD of AUP for FM2: 0.8514715420078126 0.08382047481516611
Average and SD of f1 Socre FM2: 0.8859240805853348 0.08382159041191595
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [00:12<00:28,  4.11s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8813241106719367 0.06712454302236401
Average and SD of AUP for FM2: 0.8394784092718875 0.07194853848857471
Average and SD of f1 Socre FM2: 0.8766137534117462 0.07853440020325061
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [00:16<00:24,  4.13s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8856719367588933 0.07833550609162332
Average and SD of AUP for FM2: 0.8443757921919978 0.09427125429110494
Average and SD of f1 Socre FM2: 0.8871953114234163 0.0787336267718931
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:20<00:20,  4.13s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8628458498023716 0.07010073922769529
Average and SD of AUP for FM2: 0.820010735769135 0.07714095967404175
Average and SD of f1 Socre FM2: 0.8597086199958173 0.07350010550575735
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:24<00:16,  4.10s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8810276679841897 0.07782923850957071
Average and SD of AUP for FM2: 0.8383772755657201 0.09280435664507763
Average and SD of f1 Socre FM2: 0.8836565091223476 0.07418164717895478
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:28<00:12,  4.07s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.883695652173913 0.07380179374695503
Average and SD of AUP for FM2: 0.8439900989994603 0.08678606340246048
Average and SD of f1 Socre FM2: 0.8813247346068835 0.07891461685895183
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:32<00:08,  4.09s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8876482213438736 0.07844387364823571
Average and SD of AUP for FM2: 0.8475500542659588 0.08647883873292693
Average and SD of f1 Socre FM2: 0.8894766144021463 0.07535215943963683
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:36<00:04,  4.08s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8879446640316205 0.06849358747999096
Average and SD of AUP for FM2: 0.8525949380166271 0.0806133971409499
Average and SD of f1 Socre FM2: 0.8843405962981937 0.07452445585767183
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:41<00:00,  4.12s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.9100790513833992 0.06568433866190193
Average and SD of AUP for FM2: 0.8774892768262333 0.07945816493595767
Average and SD of f1 Socre FM2: 0.9074742446109552 0.07338395184255064
------------------------------------------------------------------------------------------------------





In [286]:
## Garlic normalized features
# get_cross_validation_seen_food(gdata_features_class_normalized)

------------------------------------------------------------------------------------------------------
Average AUC for FM1 and FM2:  0.7492094861660079 0.78300395256917
Average AUP for FM1 and FM2: 0.7027265131662365 0.7244753841499598
Average f1 Socre FM1 and FM2: 0.7192784453830965 0.7692480587878655
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
Average AUC for FM1 and FM2:  0.733300395256917 0.7335968379446641
Average AUP for FM1 and FM2: 0.6857352430063061 0.6762415783450266
Average f1 Socre FM1 and FM2: 0.6996408308595237 0.7099844065918886
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
Average AUC for FM1 and FM2:  0.7237154150197628 0.7533596837944664
Average AUP fo

In [41]:
## Cocoa
get_cross_validation_seen_food(cdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:04<00:37,  4.16s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8706521739130435 0.1166347343553328
Average and SD of AUP for FM2: 0.8391125423896006 0.1297662937815742
Average and SD of f1 Socre FM2: 0.8566773831736237 0.13750291098444886
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [00:08<00:33,  4.18s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.9006340579710145 0.09489196320139152
Average and SD of AUP for FM2: 0.8699621539508219 0.11048454277898437
Average and SD of f1 Socre FM2: 0.8935610736040346 0.10740212424824414
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [00:12<00:29,  4.20s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.877445652173913 0.10874247934556072
Average and SD of AUP for FM2: 0.8478812857278398 0.12394536776714019
Average and SD of f1 Socre FM2: 0.8667199998994424 0.12327182597720156
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [00:17<00:26,  4.37s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8834239130434782 0.09870948589178224
Average and SD of AUP for FM2: 0.8502495482731254 0.11316272094267711
Average and SD of f1 Socre FM2: 0.8717889010676735 0.12416799231215983
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:22<00:22,  4.56s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8990036231884058 0.10022819904888576
Average and SD of AUP for FM2: 0.8734320649557766 0.11863477300475175
Average and SD of f1 Socre FM2: 0.8921697779455607 0.10923938284800853
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:27<00:18,  4.63s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8944746376811594 0.10037380414844928
Average and SD of AUP for FM2: 0.8673887848203176 0.11259909959830654
Average and SD of f1 Socre FM2: 0.8826006769462763 0.11772746019475744
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:32<00:14,  4.74s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8622282608695652 0.11741621811358711
Average and SD of AUP for FM2: 0.8299053368012262 0.1282219583838181
Average and SD of f1 Socre FM2: 0.8445292880675324 0.1420605008573821
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:36<00:09,  4.69s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8923007246376812 0.1019675313756365
Average and SD of AUP for FM2: 0.8643747330066291 0.12112985795086745
Average and SD of f1 Socre FM2: 0.8826258113001298 0.11590691182567871
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:41<00:04,  4.62s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8795289855072463 0.10400901744024262
Average and SD of AUP for FM2: 0.8489266146996562 0.11913756442387162
Average and SD of f1 Socre FM2: 0.8670700228511778 0.12483847792457435
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:46<00:00,  4.63s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8775362318840579 0.11717934097830941
Average and SD of AUP for FM2: 0.8480074014566688 0.134172134188881
Average and SD of f1 Socre FM2: 0.8658072433554738 0.1327356594806497
------------------------------------------------------------------------------------------------------





In [42]:
## Basil
get_cross_validation_seen_food(bdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:02<00:22,  2.47s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7266666666666667 0.18336363385966759
Average and SD of AUP for FM2: 0.7144564694564695 0.17444797417599517
Average and SD of f1 Socre FM2: 0.7529331779331779 0.15212567746109126
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [00:04<00:19,  2.48s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7333333333333334 0.12018504251546633
Average and SD of AUP for FM2: 0.6960804473304474 0.12103799208027892
Average and SD of f1 Socre FM2: 0.7515873015873016 0.10428610312977586
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [00:07<00:17,  2.49s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7266666666666667 0.1190704739966117
Average and SD of AUP for FM2: 0.6905169552669552 0.11671516329969568
Average and SD of f1 Socre FM2: 0.7376712176712177 0.1200916106195977
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [00:10<00:15,  2.53s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7400000000000001 0.07118052168020876
Average and SD of AUP for FM2: 0.7047294372294373 0.06445515388626867
Average and SD of f1 Socre FM2: 0.7355128205128205 0.06668897555729132
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:12<00:12,  2.56s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7433333333333334 0.13747727084867523
Average and SD of AUP for FM2: 0.7191396103896104 0.13425062396878218
Average and SD of f1 Socre FM2: 0.7400960150960151 0.14349155504798106
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:15<00:10,  2.60s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7616666666666667 0.11833333333333335
Average and SD of AUP for FM2: 0.7260328282828283 0.1257901611403823
Average and SD of f1 Socre FM2: 0.7703291153291153 0.10323684234158755
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:18<00:07,  2.64s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7333333333333334 0.14375905768565214
Average and SD of AUP for FM2: 0.7005844155844156 0.13945272776548817
Average and SD of f1 Socre FM2: 0.7463963813963814 0.13137641896679056
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:20<00:05,  2.66s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7966666666666666 0.08938058451861286
Average and SD of AUP for FM2: 0.7686075036075036 0.092892720266533
Average and SD of f1 Socre FM2: 0.7848562548562548 0.08522875585610758
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:23<00:02,  2.67s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7250000000000001 0.10547511554864494
Average and SD of AUP for FM2: 0.6943777056277056 0.11449950943567495
Average and SD of f1 Socre FM2: 0.7395909645909645 0.08858135075459839
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.62s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7983333333333333 0.10657548185832111
Average and SD of AUP for FM2: 0.7592633477633478 0.1205216569319803
Average and SD of f1 Socre FM2: 0.8107747807747808 0.08730393895510094
------------------------------------------------------------------------------------------------------





In [43]:
## Apple
get_cross_validation_seen_food(adata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:23<03:30, 23.41s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8766701680672269 0.0867356866695072
Average and SD of AUP for FM2: 0.8377421988543059 0.09269476823123957
Average and SD of f1 Socre FM2: 0.8687915748040658 0.09864612515438827
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [00:45<03:04, 23.09s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.874562324929972 0.07974707891823585
Average and SD of AUP for FM2: 0.8356363391869955 0.0869702404054504
Average and SD of f1 Socre FM2: 0.8678436903452009 0.08977274023497224
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:08<02:40, 22.87s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.88296918767507 0.08222563594400643
Average and SD of AUP for FM2: 0.8468820258961653 0.09234734017846516
Average and SD of f1 Socre FM2: 0.8769367377826904 0.0903914937853375
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [01:31<02:18, 23.07s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8745868347338935 0.07880300640501331
Average and SD of AUP for FM2: 0.8341243127950428 0.08212655400248502
Average and SD of f1 Socre FM2: 0.8671750207150295 0.0901574069868172
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [01:52<01:52, 22.54s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8791911764705882 0.07935693071565013
Average and SD of AUP for FM2: 0.8403835412340488 0.08747234759436745
Average and SD of f1 Socre FM2: 0.8731855720836135 0.08830021645765204
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [02:16<01:31, 22.93s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8695378151260504 0.09191028543560834
Average and SD of AUP for FM2: 0.8309737946865818 0.09850961003428914
Average and SD of f1 Socre FM2: 0.860401526752138 0.10508907209057539
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [02:38<01:08, 22.70s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8770973389355742 0.07952574157600543
Average and SD of AUP for FM2: 0.8365619918588554 0.08656379688572431
Average and SD of f1 Socre FM2: 0.8711495840309923 0.08895962962891635
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [03:01<00:45, 22.65s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8683088235294117 0.09314660876609085
Average and SD of AUP for FM2: 0.8301402495978785 0.10186936868717034
Average and SD of f1 Socre FM2: 0.8598570162694275 0.10442627460452254
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [03:24<00:22, 22.90s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8775070028011205 0.07568911626675413
Average and SD of AUP for FM2: 0.8387309261489969 0.08094278363950143
Average and SD of f1 Socre FM2: 0.870623325877277 0.08560568499484841
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:47<00:00, 22.71s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8842086834733894 0.07905083283870278
Average and SD of AUP for FM2: 0.8470185585905687 0.08688250748217086
Average and SD of f1 Socre FM2: 0.8780513053370393 0.08834323051785145
------------------------------------------------------------------------------------------------------





In [44]:
## Milk
get_cross_validation_seen_food(mdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:02<00:26,  2.98s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8150641025641026 0.04649203907732369
Average and SD of AUP for FM2: 0.7562978628125687 0.0508574606272778
Average and SD of f1 Socre FM2: 0.8193670869083163 0.051346252639820095
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [00:06<00:24,  3.00s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7878205128205128 0.07113363076195423
Average and SD of AUP for FM2: 0.7274456401232419 0.07586545547999103
Average and SD of f1 Socre FM2: 0.7899084249084249 0.07841148752643728
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [00:09<00:21,  3.02s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8150641025641026 0.04324073135512666
Average and SD of AUP for FM2: 0.7595756551141166 0.0564488153275876
Average and SD of f1 Socre FM2: 0.8177386924428667 0.04798277560546394
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [00:12<00:18,  3.05s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8073717948717949 0.03644415404744715
Average and SD of AUP for FM2: 0.7494388900859489 0.04571414889526255
Average and SD of f1 Socre FM2: 0.8091847411847412 0.04625187808358838
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:15<00:15,  3.07s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7955128205128205 0.04686068431409504
Average and SD of AUP for FM2: 0.7315996450758895 0.04896681334032072
Average and SD of f1 Socre FM2: 0.7990870049490739 0.0574881053464839
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:18<00:12,  3.10s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8150641025641026 0.060356061897360855
Average and SD of AUP for FM2: 0.7545269794308256 0.07062800254700496
Average and SD of f1 Socre FM2: 0.8280671600471378 0.056329442907410224
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:21<00:09,  3.14s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8 0.057302816290009696
Average and SD of AUP for FM2: 0.7360192703224377 0.0641584783366248
Average and SD of f1 Socre FM2: 0.8065776177845143 0.06071440061354455
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:24<00:06,  3.15s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7987179487179488 0.04599777743823145
Average and SD of AUP for FM2: 0.7439120532559447 0.04818332502199766
Average and SD of f1 Socre FM2: 0.7964892144892145 0.06141488131417044
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:28<00:03,  3.20s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7878205128205128 0.06923076923076929
Average and SD of AUP for FM2: 0.7289951249730662 0.07255825387707436
Average and SD of f1 Socre FM2: 0.7906178266178266 0.07339820335705441
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:31<00:00,  3.14s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.7871794871794873 0.06509172711532564
Average and SD of AUP for FM2: 0.72842092564717 0.07257263650784145
Average and SD of f1 Socre FM2: 0.7925400764110441 0.06994872168372786
------------------------------------------------------------------------------------------------------





In [45]:
# All
all_foods_doc2vec = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_seen_food(all_foods_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:30<04:38, 30.99s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8211223400916452 0.09951555453649442
Average and SD of AUP for FM2: 0.7743326531233575 0.1062165125348052
Average and SD of f1 Socre FM2: 0.8124916750042496 0.11310118297278489
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [01:06<04:19, 32.44s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8258615278388048 0.09701427666464203
Average and SD of AUP for FM2: 0.7796094141947898 0.10390510061102784
Average and SD of f1 Socre FM2: 0.8161617077551503 0.11117191179512452
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:43<03:56, 33.82s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8211192015567134 0.09701432443712395
Average and SD of AUP for FM2: 0.7738998735520883 0.10250507918968717
Average and SD of f1 Socre FM2: 0.8110211358541918 0.11301292844137738
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [02:20<03:27, 34.60s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.825026677546921 0.09776468705651059
Average and SD of AUP for FM2: 0.7782110356098335 0.10408662343294531
Average and SD of f1 Socre FM2: 0.8157290677262651 0.11122152543609819
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:55<02:53, 34.75s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8188798568828071 0.10716086426247616
Average and SD of AUP for FM2: 0.7721700774205388 0.11354405629883015
Average and SD of f1 Socre FM2: 0.8080248219873524 0.12403323660790666
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:27<02:16, 34.09s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8205542652689725 0.1018972964593358
Average and SD of AUP for FM2: 0.7723684612519948 0.108366752892595
Average and SD of f1 Socre FM2: 0.8119863468219748 0.11651090377481767
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [04:01<01:41, 33.86s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8208383026803089 0.10443831241243384
Average and SD of AUP for FM2: 0.7745072475644216 0.11062018440024501
Average and SD of f1 Socre FM2: 0.8109862400060163 0.11943169269690994
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:35<01:07, 33.87s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8199893289812316 0.09812171497814434
Average and SD of AUP for FM2: 0.7713244770958276 0.1023031737034177
Average and SD of f1 Socre FM2: 0.8098990177786927 0.11559057440488185
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:11<00:34, 34.52s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.831176636745967 0.09303308923268144
Average and SD of AUP for FM2: 0.7833366533892022 0.09891814731716037
Average and SD of f1 Socre FM2: 0.8231267320338997 0.10606001974227407
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:42<00:00, 34.29s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8247551942753123 0.1043852365283305
Average and SD of AUP for FM2: 0.7782744649060567 0.11010852838173381
Average and SD of f1 Socre FM2: 0.8135464974757284 0.1224972517698028
------------------------------------------------------------------------------------------------------





# Performance on unseen food

In [46]:
def xgboost_model_unseen_food(x_other,y_other,x_food,y_food,kfold,n_splits=10):

    cross_val_model_fm1 = RandomForestClassifier(max_depth=80,random_state=0)
    cross_val_model_fm2 = RandomForestClassifier(max_depth=80,random_state=0)
    
    kfold = StratifiedKFold(n_splits)
    
    auc_fm1 = []
    aup_fm1 = []
    f1_score_list_fm1 = []
    
    auc_fm2 = []
    aup_fm2 = []
    f1_score_list_fm2 = []
    
    for i,(test_food,train_food) in enumerate(kfold.split(x_food,y_food)): # test set is kept larger here 
        
        x_other_old = x_other.drop(columns=['chem_ent_ratio','chem_term_count','bigram_score'])
        x_food_old = x_food.drop(columns=['chem_ent_ratio','chem_term_count','bigram_score'])
        
        cross_val_model_fm2.fit(pd.concat([x_other,x_food.loc[test_food]]),pd.concat([y_other,y_food.loc[test_food]]))
        cross_val_model_fm1.fit(pd.concat([x_other_old,x_food_old.loc[test_food]]),pd.concat([y_other,y_food.loc[test_food]]))
        
        #viz_fm1 = plot_roc_curve(cross_val_model_fm1, x_old.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        #viz_fm2 = plot_roc_curve(cross_val_model_fm2, x.loc[test], y.loc[test],
        #                 name='ROC fold {}'.format(i),
        #                 alpha=0.3, lw=1)
        
        y_predicted_fm1 = cross_val_model_fm1.predict(x_food_old.loc[train_food])
        y_predicted_fm2 = cross_val_model_fm2.predict(x_food.loc[train_food])
        
        auc_fm1.append(roc_auc_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        aup_fm1.append(average_precision_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        f1_score_list_fm1.append(f1_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm1))
        
        auc_fm2.append(roc_auc_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        aup_fm2.append(average_precision_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        f1_score_list_fm2.append(f1_score(np.array(y_food.loc[train_food].tolist()),y_predicted_fm2))
        # plot_confusion_matrix(cross_val_model,x.loc[test],y.loc[test])
        
    print('------------------------------------------------------------------------------------------------------')
    #print('Average AUC for FM1 and FM2: ', mean(auc_fm1), mean(auc_fm2))
    #print('Average AUP for FM1 and FM2:', mean(aup_fm1), mean(aup_fm2))
    #print('Average f1 Socre FM1 and FM2:', mean(f1_score_list_fm1), mean(f1_score_list_fm2))
    print('Average and SD of AUC for FM2: ', mean(auc_fm2), np.std(auc_fm2))
    print('Average and SD of AUP for FM2:', mean(aup_fm2), np.std(aup_fm2))
    print('Average and SD of f1 Socre FM2:', mean(f1_score_list_fm2), np.std(f1_score_list_fm2))
    print('------------------------------------------------------------------------------------------------------')

    return mean(auc_fm2), mean(aup_fm2), mean(f1_score_list_fm2)

In [47]:
def get_cross_validation_unseen_food(other_data_features_class,fdata_features_class):

    repeat = 10

    for _ in tqdm(range(repeat)):
        oversample = SMOTE()
        y_other = other_data_features_class.copy()['class']
        X_other = other_data_features_class.copy().drop('class', axis = 1)
        X_other_smote, y_other_smote = oversample.fit_resample(X_other, y_other)
        y_food = fdata_features_class.copy()['class']
        X_food = fdata_features_class.copy().drop('class', axis = 1)
        X_food_smote, y_food_smote = oversample.fit_resample(X_food, y_food)
        kfold = StratifiedKFold(n_splits=10)
        auc_fm2, aup_fm2, f1_fm2 = xgboost_model_unseen_food(X_other_smote, y_other_smote,X_food_smote, y_food_smote,kfold,n_splits=10)
        
    return 

In [48]:
## Garlic
other_foods = pd.concat([cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,gdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:39<05:59, 40.00s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8703557312252964 0.13856334301726794
Average and SD of AUP for FM2: 0.8425675991558231 0.1446334020391158
Average and SD of f1 Socre FM2: 0.8652633890391327 0.14568769000313242
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [01:19<05:19, 39.99s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8815217391304349 0.11197221928053912
Average and SD of AUP for FM2: 0.8491034683366699 0.11668853521321335
Average and SD of f1 Socre FM2: 0.878458458908731 0.11645857163485773
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:59<04:39, 39.87s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8815217391304349 0.11112489358278342
Average and SD of AUP for FM2: 0.8498243373885667 0.1230491641800354
Average and SD of f1 Socre FM2: 0.8775684550254823 0.11714757355655002
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [02:43<04:06, 41.04s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8769762845849803 0.12125858811036885
Average and SD of AUP for FM2: 0.8532728204059076 0.13047393249646447
Average and SD of f1 Socre FM2: 0.8688274009305355 0.13175402902441058
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [03:19<03:18, 39.73s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.865909090909091 0.12861805991675324
Average and SD of AUP for FM2: 0.8385114277084547 0.13664266906667258
Average and SD of f1 Socre FM2: 0.8585281566585797 0.13784533686715866
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:58<02:37, 39.28s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8682806324110672 0.12695536654262068
Average and SD of AUP for FM2: 0.8390695655193446 0.1363982862073582
Average and SD of f1 Socre FM2: 0.8597298352864904 0.1411961988828974
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [04:35<01:56, 38.69s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8902173913043478 0.10477798844150212
Average and SD of AUP for FM2: 0.8636553090715771 0.11914601090853023
Average and SD of f1 Socre FM2: 0.8896543221699206 0.10304244844913406
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [05:12<01:16, 38.30s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8839920948616601 0.11870645115986735
Average and SD of AUP for FM2: 0.8579687764181578 0.13216096933209712
Average and SD of f1 Socre FM2: 0.8764993920807874 0.12900024488887304
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:49<00:37, 37.91s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8726284584980237 0.12765823241308188
Average and SD of AUP for FM2: 0.8474925816157354 0.13615742928778285
Average and SD of f1 Socre FM2: 0.8674882894253285 0.13191054283315526
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [06:25<00:00, 38.51s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8771739130434782 0.1264953751180666
Average and SD of AUP for FM2: 0.8517132396358005 0.134181833999699
Average and SD of f1 Socre FM2: 0.8728220162042205 0.12986143152450455
------------------------------------------------------------------------------------------------------





In [295]:
## Garlic normalized features
#other_foods_normalized = pd.concat([cdata_features_class_normalized,bdata_features_class_normalized,adata_features_class_normalized,mdata_features_class_normalized])
#get_cross_validation_unseen_food(other_foods_normalized,gdata_features_class_normalized)

------------------------------------------------------------------------------------------------------
Average AUC for FM1 and FM2:  0.6126520100502513 0.5979057788944724
Average AUP for FM1 and FM2: 0.568960919016526 0.5584431905318619
Average f1 Socre FM1 and FM2: 0.6224184595434816 0.6350375051014985
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
Average AUC for FM1 and FM2:  0.6124045226130653 0.607925879396985
Average AUP for FM1 and FM2: 0.5692603356353355 0.563863697215411
Average f1 Socre FM1 and FM2: 0.6244422243416897 0.6489699881775981
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
Average AUC for FM1 and FM2:  0.5838831658291457 0.5976633165829146
Average AUP fo

In [49]:
## Cocoa
other_foods = pd.concat([gdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,cdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:34<05:10, 34.45s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8338768115942029 0.10772528817629318
Average and SD of AUP for FM2: 0.7801526892882163 0.1141128855526579
Average and SD of f1 Socre FM2: 0.8306797824658932 0.12109026631653655
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [01:08<04:35, 34.39s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8685688405797102 0.07801689906082801
Average and SD of AUP for FM2: 0.8167884658156417 0.09500061977352381
Average and SD of f1 Socre FM2: 0.8715850097031858 0.08036471851641241
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:43<04:01, 34.47s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8296195652173913 0.11864659323301831
Average and SD of AUP for FM2: 0.7769953369298659 0.1267417353537082
Average and SD of f1 Socre FM2: 0.8364042249039921 0.11549494308320149
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [02:17<03:26, 34.48s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8401268115942029 0.10247544063897696
Average and SD of AUP for FM2: 0.7854288560898584 0.10932689038649245
Average and SD of f1 Socre FM2: 0.8421909365266831 0.11013608697636111
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:52<02:52, 34.57s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8403985507246376 0.11723814149433623
Average and SD of AUP for FM2: 0.792581010259646 0.13244952467345947
Average and SD of f1 Socre FM2: 0.8376959637622994 0.12716109858221597
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:27<02:19, 34.76s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8271739130434782 0.10199597132412441
Average and SD of AUP for FM2: 0.768754773678908 0.10893955523794588
Average and SD of f1 Socre FM2: 0.8329038771100455 0.10514068546928532
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [04:02<01:44, 34.69s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8531702898550725 0.09710419535647573
Average and SD of AUP for FM2: 0.7981693648069466 0.10765989959133239
Average and SD of f1 Socre FM2: 0.8554869671547766 0.1026849386098731
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:37<01:09, 34.75s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8316123188405797 0.10175488550385453
Average and SD of AUP for FM2: 0.7758367994256061 0.10749130262882546
Average and SD of f1 Socre FM2: 0.8312025540221493 0.11161242600028931
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:12<00:34, 34.86s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8425724637681159 0.08600093906864584
Average and SD of AUP for FM2: 0.7845885971433455 0.09353301504893353
Average and SD of f1 Socre FM2: 0.8467018307880566 0.08954744677264352
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:47<00:00, 34.78s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8380434782608696 0.09573534071092868
Average and SD of AUP for FM2: 0.7816462730647532 0.10112259385704116
Average and SD of f1 Socre FM2: 0.8387883164140774 0.1051169247957342
------------------------------------------------------------------------------------------------------





In [50]:
## Basil
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,bdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:37<05:38, 37.65s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.4816666666666667 0.11936172846529251
Average and SD of AUP for FM2: 0.5307676767676768 0.08351559132575402
Average and SD of f1 Socre FM2: 0.24293650793650795 0.1471730975243799
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [01:16<05:03, 37.96s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.49 0.09345230512584124
Average and SD of AUP for FM2: 0.5201010101010101 0.05644232586784951
Average and SD of f1 Socre FM2: 0.20515873015873015 0.1445779297741456
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:51<04:20, 37.15s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.5083333333333333 0.10833333333333331
Average and SD of AUP for FM2: 0.5462121212121213 0.09169170707324362
Average and SD of f1 Socre FM2: 0.18214285714285713 0.19849561759414647
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [02:27<03:40, 36.78s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.49833333333333335 0.10552777412184484
Average and SD of AUP for FM2: 0.5378787878787878 0.08821773713945237
Average and SD of f1 Socre FM2: 0.21857142857142858 0.1972049591627131
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [03:03<03:02, 36.58s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.49 0.11333333333333334
Average and SD of AUP for FM2: 0.5323232323232323 0.07726447437278036
Average and SD of f1 Socre FM2: 0.16746031746031745 0.21124980525841514
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:40<02:26, 36.61s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.5016666666666667 0.10526421783092085
Average and SD of AUP for FM2: 0.5341414141414141 0.07068369021811702
Average and SD of f1 Socre FM2: 0.22380952380952382 0.20230468528704534
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [04:17<01:50, 36.73s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.525 0.12958266344950106
Average and SD of AUP for FM2: 0.5710101010101011 0.08612442504971436
Average and SD of f1 Socre FM2: 0.2563492063492063 0.21980115386666801
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:53<01:13, 36.61s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.495 0.10221165404307973
Average and SD of AUP for FM2: 0.531489898989899 0.06605493461738814
Average and SD of f1 Socre FM2: 0.24857142857142858 0.15387650678347264
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:30<00:36, 36.65s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.5166666666666667 0.1376388188137505
Average and SD of AUP for FM2: 0.5568434343434343 0.09554872498383284
Average and SD of f1 Socre FM2: 0.2715873015873016 0.20327310360332002
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [06:06<00:00, 36.66s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.5433333333333333 0.10780641085864151
Average and SD of AUP for FM2: 0.5654545454545454 0.0924138096446602
Average and SD of f1 Socre FM2: 0.2119047619047619 0.22219529315293723
------------------------------------------------------------------------------------------------------





In [51]:
## Apple
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,mdata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,adata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:33<05:03, 33.78s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8716316526610645 0.08332324751243583
Average and SD of AUP for FM2: 0.8290571366115325 0.08725293743708352
Average and SD of f1 Socre FM2: 0.8647800292345609 0.09466512687981132
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [01:07<04:30, 33.76s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8825490196078432 0.08361912865604601
Average and SD of AUP for FM2: 0.8435015369493066 0.08940250592556002
Average and SD of f1 Socre FM2: 0.8763434822084755 0.09351548876919932
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:40<03:54, 33.54s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.872906162464986 0.08670695283332476
Average and SD of AUP for FM2: 0.8334019330741754 0.09273686812067984
Average and SD of f1 Socre FM2: 0.8665206722342947 0.0958735039868127
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [02:14<03:21, 33.63s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8762149859943977 0.08063935153403616
Average and SD of AUP for FM2: 0.8363923618087257 0.08650364507920309
Average and SD of f1 Socre FM2: 0.8688155365892073 0.09278328461943997
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:48<02:48, 33.78s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8796113445378152 0.0793790566950622
Average and SD of AUP for FM2: 0.8385664981448504 0.08535264187102372
Average and SD of f1 Socre FM2: 0.8740581744602598 0.08874807831803216
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:22<02:15, 33.76s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8728886554621849 0.08809305880826389
Average and SD of AUP for FM2: 0.8326759250732352 0.0940808701975369
Average and SD of f1 Socre FM2: 0.8656398242597086 0.09964121744349397
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [03:56<01:42, 34.01s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8842436974789916 0.07939403341238677
Average and SD of AUP for FM2: 0.8444338084335026 0.08721380630547396
Average and SD of f1 Socre FM2: 0.8796223522366057 0.08704586580958891
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:30<01:07, 33.98s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8737464985994398 0.07963875621467287
Average and SD of AUP for FM2: 0.8341202608126007 0.08534630310767867
Average and SD of f1 Socre FM2: 0.8667724187349178 0.09008909052160964
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:05<00:34, 34.15s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8733333333333333 0.08454620188697526
Average and SD of AUP for FM2: 0.8335548532869065 0.08937300567801269
Average and SD of f1 Socre FM2: 0.865867266754454 0.09538624862956252
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:38<00:00, 33.81s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.8758193277310925 0.08175835567371313
Average and SD of AUP for FM2: 0.8362425081454596 0.08799810392459599
Average and SD of f1 Socre FM2: 0.8687339461134617 0.0928397213732703
------------------------------------------------------------------------------------------------------





In [52]:
## Milk
other_foods = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec])
get_cross_validation_unseen_food(other_foods,mdata_features_class_doc2vec)

 10%|████████▎                                                                          | 1/10 [00:37<05:37, 37.52s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6951923076923077 0.06720764848188167
Average and SD of AUP for FM2: 0.6666262626262627 0.06613956006254185
Average and SD of f1 Socre FM2: 0.6070869405941421 0.11803460003619005
------------------------------------------------------------------------------------------------------


 20%|████████████████▌                                                                  | 2/10 [01:12<04:54, 36.76s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6868589743589744 0.09091553922650854
Average and SD of AUP for FM2: 0.6669095071595071 0.10292411411638965
Average and SD of f1 Socre FM2: 0.5973082327029695 0.12595640120935414
------------------------------------------------------------------------------------------------------


 30%|████████████████████████▉                                                          | 3/10 [01:48<04:16, 36.61s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6830128205128205 0.06051584222138293
Average and SD of AUP for FM2: 0.6556007326007326 0.06062846370010365
Average and SD of f1 Socre FM2: 0.5949293853870513 0.10734822960782145
------------------------------------------------------------------------------------------------------


 40%|█████████████████████████████████▏                                                 | 4/10 [02:28<03:45, 37.59s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6881410256410256 0.08138817083827193
Average and SD of AUP for FM2: 0.6628538868538868 0.08733548218209901
Average and SD of f1 Socre FM2: 0.6087168844377082 0.10749760543058778
------------------------------------------------------------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 5/10 [03:05<03:07, 37.41s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6958333333333333 0.08637746182069224
Average and SD of AUP for FM2: 0.6653044363044364 0.08563771237950217
Average and SD of f1 Socre FM2: 0.6283649742817868 0.12299016624671698
------------------------------------------------------------------------------------------------------


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:41<02:27, 36.90s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.691025641025641 0.0929177667300285
Average and SD of AUP for FM2: 0.6702150488400489 0.09465296170399458
Average and SD of f1 Socre FM2: 0.587875341107943 0.16108703954789105
------------------------------------------------------------------------------------------------------


 70%|██████████████████████████████████████████████████████████                         | 7/10 [04:18<01:50, 36.88s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6602564102564102 0.08820801650011721
Average and SD of AUP for FM2: 0.6370816960816961 0.08825948666727536
Average and SD of f1 Socre FM2: 0.5649914670812504 0.12561848285671282
------------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:53<01:12, 36.45s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6669871794871796 0.09096524269214455
Average and SD of AUP for FM2: 0.6411207264957265 0.0943697744096545
Average and SD of f1 Socre FM2: 0.5704364914261939 0.1411932296944331
------------------------------------------------------------------------------------------------------


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:29<00:36, 36.21s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.6753205128205129 0.0785230245508627
Average and SD of AUP for FM2: 0.6462296592296592 0.08103460651629943
Average and SD of f1 Socre FM2: 0.60339447102605 0.09608318640702201
------------------------------------------------------------------------------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [06:07<00:00, 36.77s/it]

------------------------------------------------------------------------------------------------------
Average and SD of AUC for FM2:  0.691025641025641 0.08534552889923451
Average and SD of AUP for FM2: 0.6630000277500278 0.09304324116458235
Average and SD of f1 Socre FM2: 0.6206010530278265 0.121152508362762
------------------------------------------------------------------------------------------------------





# UMAP visualization 

In [53]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as clr
from collections import Counter

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap

from sklearn.cluster import KMeans, DBSCAN

import numpy as np
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, plot_mpl
init_notebook_mode(connected=True)

In [54]:
gtrain = pd.read_csv("data/garlic_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
gtrain['food'] = 'garlic'
ctrain = pd.read_csv("data/cocoa_scoring.csv", encoding='latin1')[['PMID', 'abstract', 'paper', 'mesh_terms', 'qual_terms', 'is_useful']]
ctrain['food'] = 'cocoa'
gtrain = gtrain[gtrain['is_useful'].notnull()]
ctrain = ctrain[ctrain['is_useful'].notnull()]

In [55]:
gdata_features_class_doc2vec['class'] = gtrain['is_useful'].tolist()
cdata_features_class_doc2vec['class'] = ctrain['is_useful'].tolist()

In [56]:
all_foods_doc2vec = pd.concat([gdata_features_class_doc2vec,cdata_features_class_doc2vec,bdata_features_class_doc2vec,adata_features_class_doc2vec,mdata_features_class_doc2vec])

food_name_list = []
for _ in range(len(gdata_features_class_doc2vec)):
    food_name_list.append('garlic')
for _ in range(len(cdata_features_class_doc2vec)):
    food_name_list.append('cocoa')
for _ in range(len(bdata_features_class_doc2vec)):
    food_name_list.append('basil')
for _ in range(len(adata_features_class_doc2vec)):
    food_name_list.append('apple')
for _ in range(len(mdata_features_class_doc2vec)):
    food_name_list.append('human milk')

all_foods_doc2vec['food_name'] = food_name_list

In [57]:
all_foods_doc2vec_features = all_foods_doc2vec.drop(columns=['class','food_name'])
all_foods_doc2vec['concatenated_embedding'] = all_foods_doc2vec_features.values.tolist()

In [58]:
embeddings = all_foods_doc2vec['concatenated_embedding'].tolist()

In [59]:
reducer = umap.UMAP(n_components=3)
umap_fit = reducer.fit_transform(np.array(embeddings))


Embedding a total of 2 separate connected components using meta-embedding (experimental)



In [60]:
viz_data_full = pd.DataFrame(umap_fit, columns = ['umap_0', 'umap_1', 'umap_2'])
viz_data_full['is_useful'] = all_foods_doc2vec['class'].tolist()
viz_data_full['food'] = all_foods_doc2vec['food_name'].tolist()
viz_data_full

Unnamed: 0,umap_0,umap_1,umap_2,is_useful,food
0,13.576961,2.384406,3.442932,1.0,garlic
1,13.777128,3.247021,4.353684,0.0,garlic
2,18.548460,2.746119,3.322711,0.0,garlic
3,15.005014,1.965498,3.574461,0.0,garlic
4,16.695442,2.086138,3.052322,2.0,garlic
...,...,...,...,...,...
2599,14.126314,3.582968,3.064538,1.0,human milk
2600,18.857845,3.333478,2.200412,1.0,human milk
2601,18.647823,3.648624,1.830497,1.0,human milk
2602,18.358011,3.815198,1.590334,0.0,human milk


In [61]:
# Garlic
viz_data = viz_data_full[viz_data_full['food'] == 'garlic']

In [62]:
def fill_na_w_list(v):
    if not isinstance(v, list):
        return []
    else:
        return v


def make_label(row, name='chem'):
    return #'<br>Abstract: ' + row['is_useful'] #\
        #+ '<br>InChiKey: ' + row['InChiKey'] \
        #+ '<br>cid: ' + row['cid']

traces = []
# class_col = 'chebi_class'
# class_col = 'Super Class'
class_col = 'is_useful'
# class_col = 'cluster'
color_list = ['#fa9fb5','#377eb8','#1b9e77']

for i, c in enumerate(set(viz_data[class_col].dropna().drop_duplicates().tolist()) - set(['NA', '-1'])):
    print(c)
    traces.append( go.Scatter3d(
        x=viz_data[viz_data[class_col] == c].umap_0.tolist(),
        y=viz_data[viz_data[class_col] == c].umap_1.tolist(),
        z=viz_data[viz_data[class_col] == c].umap_2.tolist(),
        mode='markers',
        marker=dict(
            opacity=0.7,
            size=3,
            color=  color_list[i] # sns.color_palette("hsv", 17,desat=.7).as_hex()[i]
        ),
        name=c,
        text=viz_data[viz_data[class_col] == c].apply(make_label, axis=1),
        hoverinfo='text'
#         customdata=viz_data[viz_data['Super Class'] == c].diseases.tolist()
    ))


#v2 = viz_data[viz_data[class_col].isnull()].copy()
#trace2 = go.Scatter3d(
#    x=v2.umap_0.tolist(),
#    y=v2.umap_1.tolist(),
#    z=v2.umap_2.tolist(),
#    mode='markers',
#    marker=dict(
#        opacity=0.05,
#        size=3,
#        color='lightgray'
#    ),
#    name='No Class Labels',
#    text=v2.apply(make_label, axis=1),
#    hoverinfo='text'
#)

#traces = [trace2] + traces

layout = go.Layout(
    title='FoodMine Space | Garlic'
)

fig = go.Figure(data=traces, layout=layout)

fn = 'FoodMine_Space_Garlic_2'
# fig.write_image(f"figs/{fn}.svg")
plot(fig, filename = f'{fn}.html', auto_open=False)

0.0
1.0
2.0


'FoodMine_Space_Garlic_2.html'

In [63]:
# Cocoa
viz_data = viz_data_full[viz_data_full['food'] == 'cocoa']

In [64]:
def fill_na_w_list(v):
    if not isinstance(v, list):
        return []
    else:
        return v


def make_label(row, name='chem'):
    return #'<br>Abstract: ' + row['is_useful'] #\
        #+ '<br>InChiKey: ' + row['InChiKey'] \
        #+ '<br>cid: ' + row['cid']

traces = []
# class_col = 'chebi_class'
# class_col = 'Super Class'
class_col = 'is_useful'
# class_col = 'cluster'
color_list = ['#fa9fb5','#377eb8','#1b9e77']

for i, c in enumerate(set(viz_data[class_col].dropna().drop_duplicates().tolist()) - set(['NA', '-1'])):
    print(c)
    traces.append( go.Scatter3d(
        x=viz_data[viz_data[class_col] == c].umap_0.tolist(),
        y=viz_data[viz_data[class_col] == c].umap_1.tolist(),
        z=viz_data[viz_data[class_col] == c].umap_2.tolist(),
        mode='markers',
        marker=dict(
            opacity=0.7,
            size=3,
            color=  color_list[i] # sns.color_palette("hsv", 17,desat=.7).as_hex()[i]
        ),
        name=c,
        text=viz_data[viz_data[class_col] == c].apply(make_label, axis=1),
        hoverinfo='text'
#         customdata=viz_data[viz_data['Super Class'] == c].diseases.tolist()
    ))


#v2 = viz_data[viz_data[class_col].isnull()].copy()
#trace2 = go.Scatter3d(
#    x=v2.umap_0.tolist(),
#    y=v2.umap_1.tolist(),
#    z=v2.umap_2.tolist(),
#    mode='markers',
#    marker=dict(
#        opacity=0.05,
#        size=3,
#        color='lightgray'
#    ),
#    name='No Class Labels',
#    text=v2.apply(make_label, axis=1),
#    hoverinfo='text'
#)

#traces = [trace2] + traces

layout = go.Layout(
    title='FoodMine Space | Cocoa'
)

fig = go.Figure(data=traces, layout=layout)

fn = 'FoodMine_Space_Cocoa_2'
# fig.write_image(f"figs/{fn}.svg")
plot(fig, filename = f'{fn}.html', auto_open=False)

0.0
1.0
2.0


'FoodMine_Space_Cocoa_2.html'