In [1]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2


# if cannot import the modules, add the parent directory to system path might help

import os, tqdm, sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/'
sys.path.append(parent_dir)

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, save
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from collections import Counter
import spacy
import copy
import re
random_seed = 2019

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


In [2]:
dic = load_pickle('../big_data/recipe1M_cleaned.pickle')

time: 11.1 s


In [3]:
def reverse(text):
    '''
    Important data cleaning before NY times parser
    '''
    # replace things in brace
    text = re.sub(r'\([^)]*\)', '', text)

    # remove space before punct
    text = re.sub(r'\s([?.!,"](?:\s|$))', r'\1', text)

    # remove consecutive spaces
    text = re.sub(' +',' ',text).strip()
    return text

def reverse_list(listoftext):
    output=[]
    for text in listoftext:
        rev = reverse(text)
        if rev:
            output.append(rev)
    return output

def load_dir_data(filename):
    ls = []
    if os.path.isdir(filename):
        print('load', filename)
        # Directory
        for (dirpath, _, fnames) in os.walk(filename):
            for fname in fnames:
                path = os.path.join(dirpath, fname)
                with open(path, 'r') as fp:
                    raw_text = fp.read()
                    
                # if it contains instr
                if fname[-5] == 'd':
                    dic[int(fname[:-5])]['generated_instr'] = reverse_list(raw_text.split('.'))

                # if it contains ingred
                if fname[-5] == 'i':
                    dic[int(fname[:-5])]['generated_ingred'] = reverse_list(raw_text.split('$'))
                    
                # if it contains name
                if fname[-5] == 't':
                    dic[int(fname[:-5])]['generated_name'] = raw_text
                ls.append(int(fname[:-5]))# only interested in instr
                    
    return sorted(list(set(ls)))

time: 54 ms


In [4]:
filename = '../../to_gpt2/generation_20191010-s/'
ls = load_dir_data(filename)

load ../../to_gpt2/generation_20191010-s/
time: 1.03 s


In [4]:
def add_space(line):
    # add space before punct
    line = re.sub('([.,!?()])', r' \1 ', line)
    line = re.sub('\s{2,}', ' ', line)
    return line

time: 28.7 ms


In [5]:
to_write = {'truth_t':'', 'truth_i':'', 'truth_d':'',
            'pred_t':'', 'pred_i':'', 'pred_d':''
           }
for i, v in enumerate(dic):
    if i in ls:
        to_write['truth_t'] += add_space(v['title']) + '\n'
        to_write['truth_i'] += add_space(' $ '.join(v['ingredients']))+ ' $ \n'
        to_write['truth_d'] += add_space(' '.join(v['instructions'])) + ' . \n'
        to_write['pred_t'] += add_space(v['generated_name']) + '\n'
        to_write['pred_i'] += add_space(' $ '.join(v['generated_ingred'])) + ' $ \n'
        to_write['pred_d'] += add_space(' . '.join(v['generated_instr'])) + ' . \n'
        
for k, v in to_write.items():
    save('../../to_gpt2/generation_20191010-s_%s.txt'%(k), v ,overwrite = True)

saved ../../to_gpt2/generation_20191010-s_truth_t.txt
saved ../../to_gpt2/generation_20191010-s_truth_i.txt
saved ../../to_gpt2/generation_20191010-s_truth_d.txt
saved ../../to_gpt2/generation_20191010-s_pred_t.txt
saved ../../to_gpt2/generation_20191010-s_pred_i.txt
saved ../../to_gpt2/generation_20191010-s_pred_d.txt
time: 6.16 s


In [6]:
!perl multi-bleu.perl ../../to_gpt2/generation_20191010-s_truth_t.txt < ../../to_gpt2/generation_20191010-s_pred_t.txt

BLEU = 5.49, 33.0/8.8/3.7/1.2 (BP=0.912, ratio=0.916, hyp_len=1880, ref_len=2053)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 1.12 s


In [7]:
!perl multi-bleu.perl ../../to_gpt2/generation_20191010-s_truth_i.txt < ../../to_gpt2/generation_20191010-s_pred_i.txt

BLEU = 12.87, 40.6/19.9/9.2/3.7 (BP=1.000, ratio=1.223, hyp_len=34780, ref_len=28430)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 642 ms


In [8]:
!perl multi-bleu.perl ../../to_gpt2/generation_20191010-s_truth_d.txt < ../../to_gpt2/generation_20191010-s_pred_d.txt

BLEU = 5.69, 47.7/14.4/4.6/1.7 (BP=0.661, ratio=0.708, hyp_len=46974, ref_len=66394)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 959 ms


### examine the training set

In [5]:
filename = '../../to_gpt2/generation_20191010-s_train/'
ls = load_dir_data(filename)

to_write = {'truth_t':'', 'truth_i':'', 'truth_d':'',
            'pred_t':'', 'pred_i':'', 'pred_d':''
           }
for i, v in enumerate(dic):
    if i in ls:
        to_write['truth_t'] += add_space(v['title']) + '\n'
        to_write['truth_i'] += add_space(' $ '.join(v['ingredients']))+ ' $ \n'
        to_write['truth_d'] += add_space(' '.join(v['instructions'])) + ' . \n'
        to_write['pred_t'] += add_space(v['generated_name']) + '\n'
        to_write['pred_i'] += add_space(' $ '.join(v['generated_ingred'])) + ' $ \n'
        to_write['pred_d'] += add_space(' . '.join(v['generated_instr'])) + ' . \n'
        
for k, v in to_write.items():
    save('../../to_gpt2/generation_20191010-s_%s.txt'%(k), v ,overwrite = True)

load ../../to_gpt2/generation_20191010-s_train/
saved ../../to_gpt2/generation_20191010-s_truth_t.txt
saved ../../to_gpt2/generation_20191010-s_truth_i.txt
saved ../../to_gpt2/generation_20191010-s_truth_d.txt
saved ../../to_gpt2/generation_20191010-s_pred_t.txt
saved ../../to_gpt2/generation_20191010-s_pred_i.txt
saved ../../to_gpt2/generation_20191010-s_pred_d.txt
time: 6.93 s


In [6]:
!perl multi-bleu.perl ../../to_gpt2/generation_20191010-s_truth_t.txt < ../../to_gpt2/generation_20191010-s_pred_t.txt

BLEU = 7.60, 38.3/12.3/4.7/2.3 (BP=0.903, ratio=0.907, hyp_len=1904, ref_len=2099)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 364 ms


In [7]:
!perl multi-bleu.perl ../../to_gpt2/generation_20191010-s_truth_i.txt < ../../to_gpt2/generation_20191010-s_pred_i.txt

BLEU = 14.53, 43.0/21.9/10.5/4.5 (BP=1.000, ratio=1.194, hyp_len=33855, ref_len=28365)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 664 ms


In [8]:
!perl multi-bleu.perl ../../to_gpt2/generation_20191010-s_truth_d.txt < ../../to_gpt2/generation_20191010-s_pred_d.txt

BLEU = 6.19, 48.6/15.0/4.9/1.8 (BP=0.691, ratio=0.730, hyp_len=47083, ref_len=64463)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
time: 1.01 s


In [20]:
filename = '../../to_gpt2/generation_20191010-s/'
ls = load_dir_data(filename)

load ../../to_gpt2/generation_20191010-s/
time: 868 ms


In [21]:
### STEP3 sent to the NYtimes
### assign indices to each ingredient <---> NYtimes
class ny_ingredients:
    def __init__(self, fields):
        # this function will take the global variable ls and dic
        # static & reuseable
        self.ny_ingred = '../../NYtime-parser2/ingred.txt'
        self.ny_result = '../../NYtime-parser2/result.json'
        
        # spacy
        self.spacy = spacy.load('en_core_web_lg')
        self.fields = fields #['ingredients', 'generated_ingred']

    def to_ny(self):
        '''
        using global variables dic and ls
        '''
        to_write = []
        #for i, v in dic.items():
        for i, v in enumerate(dic):    
            if i in ls:
                # assing index
                for field in self.fields:
                    line_ids = []
                    for line in v[field]:
                        reversed_line = reverse(line)
                        if line in to_write:
                            ny_id = to_write.index(reversed_line)
                        else:
                            ny_id = len(to_write)
                            to_write.append(reversed_line)
                        line_ids.append(ny_id)
                    dic[i]['ny_%s'%(field)] = line_ids

        # save the file to the folder under NYtime-parser2
        save(filename = self.ny_ingred, 
             to_write = '\n'.join(to_write),
             overwrite = True, 
             print_=True)

        self.to_write = to_write
        
    # step 3
    def to_ingred(self):
        '''
        using global variables dic and ls
        '''
        ny_result = pd.read_json(self.ny_result)
        to_write = []
        #for i, v in dic.items():
        for i, v in enumerate(dic):    
            if i in ls:
                # assing index
                for field in self.fields:
                    temp = [ny_result.loc[ny_id]['name'] for ny_id in v['ny_%s'%(field)]]
                    exact, root = self.extract(temp)
                    dic[i]['ny_%s'%(field)] = {'ny':temp, 'exact':exact, 'root':root}
                    
    def extract(self, ny_ingred):
        '''
        Args: ny_ingred: a list of ingredient names
        '''
        phrases_to_sentences = ' '.join(['Mix the %s and water.'%ingr for ingr in ny_ingred])
        doc = self.spacy(phrases_to_sentences)
        exact_match, root_match = [],[]
        for chunk in doc.noun_chunks:
            if chunk.text != 'water':
                root_lemma = [token.lemma_ for token in doc if token.text == chunk.root.text][0]
                exact_match.append(chunk.lemma_.replace('the ',''))
                root_match.append(root_lemma)
        return exact_match, root_match

time: 29.3 ms


In [22]:
ny_ingr = ny_ingredients(fields = ['generated_ingred','ingredients'])
### step 3-1 save it as ingred.txt
ny_ingr.to_ny()
### step 3-2 go to python2 and run NLG_notebooks/Control Nytimes

saved ../../NYtime-parser2/ingred.txt
time: 16.4 s


In [23]:
### step 3-3 load the result.json back to dic
ny_ingr.to_ingred()

time: 28 s


In [13]:
from utils.evaluation import metrics

time: 46 ms


In [24]:
#for i, v in tqdm.tqdm(dic.items()):
for i, v in tqdm.tqdm(enumerate(dic)):    
    if i in ls:
        score = metrics(v['ny_ingredients']['exact'], v['ny_generated_ingred']['exact'])
        dic[i].update(score.all_ngram_recall(name='@recall_exact'))
        dic[i].update(score.all_ngram_precision(name='@precision_exact'))
        score = metrics(v['ny_ingredients']['root'], v['ny_generated_ingred']['root'])
        dic[i].update(score.all_recall(name='@recall_root'))
        dic[i].update(score.all_precision(name='@precision_root'))       
    
# df2 = pd.DataFrame.from_dict(dic, orient = 'index')
df2 = pd.DataFrame(dic)
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls]
print(str(temp.mean()))

1029720it [00:06, 151372.60it/s]


precision_@precision_root                0.626067
precision_freq_@precision_root           0.457042
precision_ngram_@precision_exact         0.542799
precision_ngram_freq_@precision_exact    0.423401
recall_@recall_root                      0.558612
recall_freq_@recall_root                 0.551796
recall_ngram_@recall_exact               0.445695
recall_ngram_freq_@recall_exact          0.443512
dtype: float64
time: 11.8 s


In [41]:
for metric_name in ['','freq_']:#,'ngram','ngram_freq']:
    print('f1',metric_name)
    str_p = 'precision_%s@precision_root'%(metric_name)
    str_r = 'recall_%s@recall_root'%(metric_name)
    score = temp.apply(lambda x: 2*x[str_p]*x[str_r]/(x[str_p]+x[str_r]) , axis = 1).mean()
    print(round(score, 3))
for metric_name in ['ngram_','ngram_freq_']:
    print('f1',metric_name)
    str_p = 'precision_%s@precision_exact'%(metric_name)
    str_r = 'recall_%s@recall_exact'%(metric_name)
    score = temp.apply(lambda x: 2*x[str_p]*x[str_r]/(x[str_p]+x[str_r]) , axis = 1).mean()
    print(round(score, 3))

f1 
0.593
f1 freq_
0.499
f1 ngram_
0.468
f1 ngram_freq_
0.409
time: 208 ms


  """
  # This is added back by InteractiveShellApp.init_path()


In [42]:
from utils.tree import instr2tree, tree_distance, build_tree
from utils.evaluation import spacy_extension
treemaker = instr2tree()
sp = spacy_extension()
def stem(x):
    return [{'word':d['word'], 'ingredient':[]} for d in x]

def allinone(v_directions, v_generated_instr, tag = ''):
    true = treemaker.sents2tree(v_directions)
    pred = treemaker.sents2tree(v_generated_instr)
    tree_dist = tree_distance(build_tree(true), build_tree(pred))
    
    true_nodes = sum([len(line['ingredient']) +1 for line in true])
    pred_nodes = sum([len(line['ingredient']) +1 for line in pred])
    
    true, pred = stem(true), stem(pred)
    stem_dist = tree_distance(build_tree(true), build_tree(pred))
    true_stem = sum([len(line['ingredient']) +1 for line in true])
    pred_stem = sum([len(line['ingredient']) +1 for line in pred])
    
    return {'tree_dist_%s'%tag:  tree_dist,
            'true_nodes_%s'%tag: true_nodes,
            'pred_nodes_%s'%tag: pred_nodes,
            'stem_dist_%s'%tag:  stem_dist,
            'true_stem_%s'%tag:  true_stem,
            'pred_stem_%s'%tag:  pred_stem}

time: 23.3 s


In [43]:
for i, v in tqdm.tqdm(enumerate(dic)):    
    if i in ls:
        dic[i].update(allinone(v['instructions'], v['generated_instr'], tag = '@'))
        
df2 = pd.DataFrame(dic)
temp = df2[[col for col in df2.columns if '@' in col]].iloc[ls]
print(str(temp.mean()))

1029720it [03:45, 4570.67it/s]


precision_@precision_root                 0.626067
precision_freq_@precision_root            0.457042
precision_ngram_@precision_exact          0.542799
precision_ngram_freq_@precision_exact     0.423401
pred_nodes_@                             35.510000
pred_stem_@                              15.378000
recall_@recall_root                       0.558612
recall_freq_@recall_root                  0.551796
recall_ngram_@recall_exact                0.445695
recall_ngram_freq_@recall_exact           0.443512
stem_dist_@                              19.339024
tree_dist_@                              48.110847
true_nodes_@                             48.600000
true_stem_@                              21.268000
dtype: float64
time: 3min 51s


In [44]:
filename

'../../to_gpt2/generation_20191010-s/'

time: 45.1 ms
