In [1]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2


# if cannot import the modules, add the parent directory to system path might help

import os, tqdm, sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/'
sys.path.append(parent_dir)

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, save
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from collections import Counter
import spacy
import copy
import re
random_seed = 2019

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.63 µs


In [2]:
dic = load_pickle('../big_data/recipe1M_cleaned.pickle')

time: 11 s


In [3]:
def reverse(text):
    '''
    Important data cleaning before NY times parser
    '''
    # replace things in brace
    text = re.sub(r'\([^)]*\)', '', text)

    # remove space before punct
    text = re.sub(r'\s([?.!,"](?:\s|$))', r'\1', text)

    # remove consecutive spaces
    text = re.sub(' +',' ',text).strip()
    return text

def reverse_list(listoftext):
    output=[]
    for text in listoftext:
        rev = reverse(text)
        if rev:
            output.append(rev)
    return output

def load_dir_data(filename):
    ls = []
    if os.path.isdir(filename):
        print('load', filename)
        # Directory
        for (dirpath, _, fnames) in os.walk(filename):
            for fname in fnames:
                path = os.path.join(dirpath, fname)
                with open(path, 'r') as fp:
                    raw_text = fp.read()
                    
                # if it contains instr
                if fname[-5] == 'd':
                    dic[int(fname[:-5])]['generated_instr'] = reverse_list(raw_text.split('.'))

                # if it contains ingred
                if fname[-5] == 'i':
                    dic[int(fname[:-5])]['generated_ingred'] = reverse_list(raw_text.split('$'))
                    
                # if it contains name
                if fname[-5] == 't':
                    dic[int(fname[:-5])]['generated_name'] = raw_text
                ls.append(int(fname[:-5]))# only interested in instr
                    
    return sorted(list(set(ls)))

time: 23.7 ms


In [6]:
filename = '../../to_gpt2/generation_20191010-1M_test/'
ls = load_dir_data(filename)

load ../../to_gpt2/generation_20191010-1M_test/
time: 265 ms


In [8]:
def add_space(line):
    # add space before punct
    line = re.sub('([.,!?()])', r' \1 ', line)
    line = re.sub('\s{2,}', ' ', line)
    return line

time: 41.7 ms


In [10]:
to_write = {'truth_t':'', 'truth_i':'', 'truth_d':'',
            'pred_t':'', 'pred_i':'', 'pred_d':''
           }
for i, v in enumerate(dic):
    if i in ls:
        to_write['truth_t'] += add_space(v['title']) + '\n'
        to_write['truth_i'] += add_space(' $ '.join(v['ingredients']))+ ' $ \n'
        to_write['truth_d'] += add_space(' '.join(v['instructions'])) + ' . \n'
        to_write['pred_t'] += add_space(v['generated_name']) + '\n'
        to_write['pred_i'] += add_space(' $ '.join(v['generated_ingred'])) + ' $ \n'
        to_write['pred_d'] += add_space(' . '.join(v['generated_instr'])) + ' . \n'
        
for k, v in to_write.items():
    save('../../to_gpt2/generation_overwrite_%s.txt'%(k), v ,overwrite = True)

saved ../../to_gpt2/generation_overwrite_truth_t.txt
saved ../../to_gpt2/generation_overwrite_truth_i.txt
saved ../../to_gpt2/generation_overwrite_truth_d.txt
saved ../../to_gpt2/generation_overwrite_pred_t.txt
saved ../../to_gpt2/generation_overwrite_pred_i.txt
saved ../../to_gpt2/generation_overwrite_pred_d.txt
time: 6.71 s


In [11]:
!perl multi-bleu.perl ../../to_gpt2/generation_overwrite_truth_t.txt < ../../to_gpt2/generation_overwrite_pred_t.txt

BLEU = 11.07, 44.4/16.7/7.2/3.9 (BP=0.922, ratio=0.925, hyp_len=1898, ref_len=2053)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 338 ms


In [12]:
!perl multi-bleu.perl ../../to_gpt2/generation_overwrite_truth_i.txt < ../../to_gpt2/generation_overwrite_pred_i.txt

BLEU = 22.32, 56.8/32.2/16.9/8.0 (BP=1.000, ratio=1.035, hyp_len=29418, ref_len=28430)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 639 ms


In [13]:
!perl multi-bleu.perl ../../to_gpt2/generation_overwrite_truth_d.txt < ../../to_gpt2/generation_overwrite_pred_d.txt

BLEU = 9.01, 55.9/20.3/7.8/3.3 (BP=0.690, ratio=0.729, hyp_len=48415, ref_len=66394)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 1.02 s


In [14]:
### STEP3 sent to the NYtimes
### assign indices to each ingredient <---> NYtimes
class ny_ingredients:
    def __init__(self, fields):
        # this function will take the global variable ls and dic
        # static & reuseable
        self.ny_ingred = '../../NYtime-parser2/ingred.txt'
        self.ny_result = '../../NYtime-parser2/result.json'
        
        # spacy
        self.spacy = spacy.load('en_core_web_lg')
        self.fields = fields #['ingredients', 'generated_ingred']

    def to_ny(self):
        '''
        using global variables dic and ls
        '''
        to_write = []
        #for i, v in dic.items():
        for i, v in enumerate(dic):    
            if i in ls:
                # assing index
                for field in self.fields:
                    line_ids = []
                    for line in v[field]:
                        reversed_line = reverse(line)
                        if line in to_write:
                            ny_id = to_write.index(reversed_line)
                        else:
                            ny_id = len(to_write)
                            to_write.append(reversed_line)
                        line_ids.append(ny_id)
                    dic[i]['ny_%s'%(field)] = line_ids

        # save the file to the folder under NYtime-parser2
        save(filename = self.ny_ingred, 
             to_write = '\n'.join(to_write),
             overwrite = True, 
             print_=True)

        self.to_write = to_write
        
    # step 3
    def to_ingred(self):
        '''
        using global variables dic and ls
        '''
        ny_result = pd.read_json(self.ny_result)
        to_write = []
        #for i, v in dic.items():
        for i, v in enumerate(dic):    
            if i in ls:
                # assing index
                for field in self.fields:
                    temp = [ny_result.loc[ny_id]['name'] for ny_id in v['ny_%s'%(field)]]
                    exact, root = self.extract(temp)
                    dic[i]['ny_%s'%(field)] = {'ny':temp, 'exact':exact, 'root':root}
                    
    def extract(self, ny_ingred):
        '''
        Args: ny_ingred: a list of ingredient names
        '''
        phrases_to_sentences = ' '.join(['Mix the %s and water.'%ingr for ingr in ny_ingred])
        doc = self.spacy(phrases_to_sentences)
        exact_match, root_match = [],[]
        for chunk in doc.noun_chunks:
            if chunk.text != 'water':
                root_lemma = [token.lemma_ for token in doc if token.text == chunk.root.text][0]
                exact_match.append(chunk.lemma_.replace('the ',''))
                root_match.append(root_lemma)
        return exact_match, root_match

time: 47.2 ms


In [15]:
ny_ingr = ny_ingredients(fields = ['generated_ingred','ingredients'])
### step 3-1 save it as ingred.txt
ny_ingr.to_ny()
### step 3-2 go to python2 and run NLG_notebooks/Control Nytimes

saved ../../NYtime-parser2/ingred.txt
time: 22.5 s


In [16]:
### step 3-3 load the result.json back to dic
ny_ingr.to_ingred()

time: 29.6 s


In [17]:
from utils.evaluation import metrics

time: 43.5 ms


In [18]:
#for i, v in tqdm.tqdm(dic.items()):
for i, v in tqdm.tqdm(enumerate(dic)):    
    if i in ls:
        score = metrics(v['ny_ingredients']['exact'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_recall(name='@recall_exact'))
        dic[i].update(score.all_ngram_precision(name='@precision_exact'))
        score = metrics(v['ny_ingredients']['root'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_recall(name='@recall_root'))
        dic[i].update(score.all_precision(name='@precision_root'))       
    
# df2 = pd.DataFrame.from_dict(dic, orient = 'index')
df2 = pd.DataFrame(dic)
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls]
print(str(temp.mean()))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
1029720it [00:07, 139782.96it/s]


precision_@precision_root                0.731566
precision_freq_@precision_root           0.678349
precision_ngram_@precision_exact         0.632323
precision_ngram_freq_@precision_exact    0.598198
recall_@recall_root                      0.711097
recall_freq_@recall_root                 0.699889
recall_ngram_@recall_exact               0.574229
recall_ngram_freq_@recall_exact          0.572469
dtype: float64
time: 11.4 s


In [19]:
for metric_name in ['','freq_']:#,'ngram','ngram_freq']:
    print('f1',metric_name)
    str_p = 'precision_%s@precision_root'%(metric_name)
    str_r = 'recall_%s@recall_root'%(metric_name)
    score = temp.apply(lambda x: 2*x[str_p]*x[str_r]/(x[str_p]+x[str_r]) , axis = 1).mean()
    print(round(score, 3))
for metric_name in ['ngram_','ngram_freq_']:
    print('f1',metric_name)
    str_p = 'precision_%s@precision_exact'%(metric_name)
    str_r = 'recall_%s@recall_exact'%(metric_name)
    score = temp.apply(lambda x: 2*x[str_p]*x[str_r]/(x[str_p]+x[str_r]) , axis = 1).mean()
    print(round(score, 3))

f1 
0.728
f1 freq_
0.693
f1 ngram_
0.582
f1 ngram_freq_
0.566
time: 253 ms


  """
  # This is added back by InteractiveShellApp.init_path()


In [20]:
from utils.tree import instr2tree, tree_distance, build_tree
from utils.evaluation import spacy_extension
treemaker = instr2tree()
sp = spacy_extension()
def stem(x):
    return [{'word':d['word'], 'ingredient':[]} for d in x]

def allinone(v_directions, v_generated_instr, tag = '', rev=False):
    true = treemaker.sents2tree(v_directions)
    pred = treemaker.sents2tree(v_generated_instr)
    if rev:
        pred = pred[::-1]
    tree_dist = tree_distance(build_tree(true), build_tree(pred))
    
    true_nodes = sum([len(line['ingredient']) +1 for line in true])
    pred_nodes = sum([len(line['ingredient']) +1 for line in pred])
    
    true, pred = stem(true), stem(pred)
    stem_dist = tree_distance(build_tree(true), build_tree(pred))
    true_stem = sum([len(line['ingredient']) +1 for line in true])
    pred_stem = sum([len(line['ingredient']) +1 for line in pred])
    
    return {'tree_dist_%s'%tag:  tree_dist,
            'true_nodes_%s'%tag: true_nodes,
            'pred_nodes_%s'%tag: pred_nodes,
            'stem_dist_%s'%tag:  stem_dist,
            'true_stem_%s'%tag:  true_stem,
            'pred_stem_%s'%tag:  pred_stem}

time: 20.5 s


In [25]:
# recipe1M
for i, v in tqdm.tqdm(enumerate(dic)):  
    if i in ls:
        dic[i].update(allinone(v['instructions'], v['generated_instr'], tag = '@'))
        dic[i].update(allinone(v['instructions'], v['generated_instr'], tag = '@rev', rev = True))
        
df2 = pd.DataFrame(dic)
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls]
#logger.report_text(str(temp))
print(str(temp.mean()))

precision_@precision_root                 0.731566
precision_freq_@precision_root            0.678349
precision_ngram_@precision_exact          0.632323
precision_ngram_freq_@precision_exact     0.598198
pred_nodes_@                             36.620000
pred_nodes_@rev                          36.620000
pred_stem_@                              15.484000
pred_stem_@rev                           15.484000
recall_@recall_root                       0.711097
recall_freq_@recall_root                  0.699889
recall_ngram_@recall_exact                0.574229
recall_ngram_freq_@recall_exact           0.572469
stem_dist_@                              17.569806
stem_dist_@rev                           19.230115
tree_dist_@                              44.439031
tree_dist_@rev                           48.258554
true_nodes_@                             48.600000
true_nodes_@rev                          48.600000
true_stem_@                              21.268000
true_stem_@rev                 

In [26]:
val = temp.apply(lambda x: x['tree_dist_@']/(x['true_nodes_@']+x['pred_nodes_@']), axis=1).mean()
print (val)

val = temp.apply(lambda x: x['stem_dist_@']/(x['true_stem_@']+x['pred_stem_@']), axis=1).mean()
print (val)

val = temp.apply(lambda x: x['tree_dist_@rev']/(x['true_nodes_@rev']+x['pred_nodes_@rev']), axis=1).mean()
print (val)

val = temp.apply(lambda x: x['stem_dist_@rev']/(x['true_stem_@rev']+x['pred_stem_@rev']), axis=1).mean()
print (val)

0.510792893873668
0.465785985623208
0.561067984836763
0.5156743513430984
time: 191 ms


In [27]:
filename = '../../to_gpt2/generation_20191010-s/'
ls = load_dir_data(filename)

load ../../to_gpt2/generation_20191010-s/
time: 985 ms


In [28]:
# recipe1M subset
for i, v in tqdm.tqdm(enumerate(dic)):  
    if i in ls:
        dic[i].update(allinone(v['instructions'], v['generated_instr'], tag = '@'))
        dic[i].update(allinone(v['instructions'], v['generated_instr'], tag = '@rev', rev = True))
        
df2 = pd.DataFrame(dic)
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls]
#logger.report_text(str(temp))
print(str(temp.mean()))

1029720it [06:46, 2532.29it/s]


precision_@precision_root                 0.731566
precision_freq_@precision_root            0.678349
precision_ngram_@precision_exact          0.632323
precision_ngram_freq_@precision_exact     0.598198
pred_nodes_@                             35.510000
pred_nodes_@rev                          35.510000
pred_stem_@                              15.378000
pred_stem_@rev                           15.378000
recall_@recall_root                       0.711097
recall_freq_@recall_root                  0.699889
recall_ngram_@recall_exact                0.574229
recall_ngram_freq_@recall_exact           0.572469
stem_dist_@                              19.339024
stem_dist_@rev                           20.251536
tree_dist_@                              48.110847
tree_dist_@rev                           50.140224
true_nodes_@                             48.600000
true_nodes_@rev                          48.600000
true_stem_@                              21.268000
true_stem_@rev                 

In [29]:
val = temp.apply(lambda x: x['tree_dist_@']/(x['true_nodes_@']+x['pred_nodes_@']), axis=1).mean()
print (val)

val = temp.apply(lambda x: x['stem_dist_@']/(x['true_stem_@']+x['pred_stem_@']), axis=1).mean()
print (val)

val = temp.apply(lambda x: x['tree_dist_@rev']/(x['true_nodes_@rev']+x['pred_nodes_@rev']), axis=1).mean()
print (val)

val = temp.apply(lambda x: x['stem_dist_@rev']/(x['true_stem_@rev']+x['pred_stem_@rev']), axis=1).mean()
print (val)

0.569779852598404
0.5226003813029513
0.5968378247543937
0.5520942707757207
time: 128 ms
