In [1]:
import argparse
import os
import csv
import random
import logging
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OpenAIAdam

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Control tokens spliced into every line built by the prepare_* functions.
spl_tokens = {
    'pos': '<POS>',              # leading tag when type_token='pos'
    'neg': '<NEG>',              # leading tag when type_token='neg'
    'Con_start': '<CON_START>',  # placed before the content (non-attribute) tokens
    'start': '<START>',          # placed after the content tokens
    'end': '<END>',              # closes a training line
}
spl_tokens

{'pos': '<POS>',
 'neg': '<NEG>',
 'Con_start': '<CON_START>',
 'start': '<START>',
 'end': '<END>'}

In [6]:
# Attribute vocabulary plus raw/processed training files, all resolved against
# the current working directory.
VOCAB_PATH = os.path.join(os.getcwd(), 'amazon_attribute_vocab.txt')
POS_FILE_PATH, NEG_FILE_PATH = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('sentiment_train_1.txt', 'sentiment_train_0.txt')
)
POS_OUT_FILE_PATH, NEG_OUT_FILE_PATH = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('processed_sentiment_train_1.txt', 'processed_sentiment_train_0.txt')
)

In [7]:
# Load the attribute vocabulary: one entry per line, surrounding whitespace removed.
with open(VOCAB_PATH, 'r', encoding='utf-8') as vocab_file:
    attribute_vocab = [entry.strip() for entry in vocab_file]

In [8]:
def extract_attribure(attribute_vocab, line):
    """Partition the tokens of ``line`` into content and attribute tokens.

    Args:
        attribute_vocab: Container of attribute tokens; only ``in`` is used on it.
        line: Iterable of tokens.

    Returns:
        A ``(content, attribute)`` pair of lists, both in input order:
        ``attribute`` holds the tokens found in ``attribute_vocab`` and
        ``content`` holds all the others.
    """
    content, attribute = [], []
    for tok in line:
        bucket = attribute if tok in attribute_vocab else content
        bucket.append(tok)
    return content, attribute

In [9]:
def prepare_data_file(reference_file_path, output_file_path, type_token='pos'):
    """Write one training line for every sentence in ``reference_file_path``.

    Each input line is stripped and split on single spaces; tokens found in the
    module-level ``attribute_vocab`` are dropped to form the content part. The
    line written is::

        spl_tokens[type_token] spl_tokens['Con_start'] <content tokens>
        spl_tokens['start'] <stripped input line> spl_tokens['end']

    joined by single spaces and terminated by a newline. A running count is
    printed every 10000 input lines.

    Args:
        reference_file_path: UTF-8 text file with one sentence per line.
        output_file_path: Destination file; created or overwritten as UTF-8.
        type_token: Key into ``spl_tokens`` selecting the leading tag.

    Raises:
        KeyError: If ``type_token`` is not a key of ``spl_tokens``.
    """
    # Loop-invariant pieces of every output line.
    line_head = spl_tokens[type_token] + ' ' + spl_tokens['Con_start'] + ' '
    line_mid = ' ' + spl_tokens['start'] + ' '
    line_tail = ' ' + spl_tokens['end'] + "\n"

    # Both handles are owned by ``with`` so the output is flushed and closed even
    # if an exception interrupts the loop. The input is opened first so a missing
    # input file does not truncate an existing output file.
    with open(reference_file_path, 'r', encoding='utf-8') as f, \
            open(output_file_path, 'w', encoding='utf-8') as out_file:
        for count, line in enumerate(f, start=1):
            line = line.strip()
            cont, _ = extract_attribure(attribute_vocab, line.split(' '))
            out_file.write(line_head + ' '.join(cont) + line_mid + line + line_tail)
            if count % 10000 == 0:
                print(count)

In [22]:
# Scratch: an empty list, compared against [] in the following cell.
lt = []

In [24]:
# Scratch check: a list with no elements compares equal to the literal [].
lt == []

True

In [39]:
def prepare_test_data_file(reference_file_path, output_file_path, type_token='pos'):
    """Write a generation prompt for every sentence that contains an attribute token.

    Each input line is stripped and split on single spaces. Sentences with no
    token in the module-level ``attribute_vocab`` are skipped. For the others the
    line written is::

        spl_tokens[type_token] spl_tokens['Con_start'] <content tokens> spl_tokens['start']

    joined by single spaces and terminated by a newline. A running count of
    input lines (skipped ones included) is printed every 10000 lines.

    Args:
        reference_file_path: UTF-8 text file with one sentence per line.
        output_file_path: Destination file; created or overwritten as UTF-8.
        type_token: Key into ``spl_tokens`` selecting the leading tag.

    Raises:
        KeyError: If ``type_token`` is not a key of ``spl_tokens``.
    """
    # Loop-invariant pieces of every prompt line.
    prompt_head = spl_tokens[type_token] + ' ' + spl_tokens['Con_start'] + ' '
    prompt_tail = ' ' + spl_tokens['start'] + "\n"

    # ``with`` guarantees the output is flushed and closed even when the loop
    # raises; the input is opened first so a missing input file does not
    # truncate an existing output file.
    with open(reference_file_path, 'r', encoding='utf-8') as f, \
            open(output_file_path, 'w', encoding='utf-8') as out_file:
        for count, line in enumerate(f, start=1):
            cont, att = extract_attribure(attribute_vocab, line.strip().split(' '))
            if att:
                out_file.write(prompt_head + ' '.join(cont) + prompt_tail)
            if count % 10000 == 0:
                print(count)

In [38]:
def prepare_test_data_file_v1(reference_file_path, output_file_path, output_ref_file, type_token='pos'):
    """Write generation prompts and the matching reference sentences in lockstep.

    Each input line is stripped and split on single spaces. Sentences with no
    token in the module-level ``attribute_vocab`` are skipped. For every kept
    sentence two lines are written, so line *i* of both outputs refers to the
    same sentence:

    * to ``output_file_path``:
      ``spl_tokens[type_token] spl_tokens['Con_start'] <content tokens> spl_tokens['start']``
    * to ``output_ref_file``: the stripped input sentence.

    A running count of input lines (skipped ones included) is printed every
    10000 lines.

    Args:
        reference_file_path: UTF-8 text file with one sentence per line.
        output_file_path: Prompt file; created or overwritten as UTF-8.
        output_ref_file: Filtered reference file; created or overwritten as UTF-8.
        type_token: Key into ``spl_tokens`` selecting the leading tag.

    Raises:
        KeyError: If ``type_token`` is not a key of ``spl_tokens``.
    """
    # Loop-invariant pieces of every prompt line.
    prompt_head = spl_tokens[type_token] + ' ' + spl_tokens['Con_start'] + ' '
    prompt_tail = ' ' + spl_tokens['start'] + "\n"

    # All three handles are owned by ``with``. Previously the reference output
    # was never closed, so its buffered tail could still be unwritten when a
    # later cell read the file back.
    with open(reference_file_path, 'r', encoding='utf-8') as f, \
            open(output_file_path, 'w', encoding='utf-8') as out_file, \
            open(output_ref_file, 'w', encoding='utf-8') as out_ref_file:
        for count, line in enumerate(f, start=1):
            line = line.strip()
            cont, att = extract_attribure(attribute_vocab, line.split(' '))
            if att:
                out_file.write(prompt_head + ' '.join(cont) + prompt_tail)
                out_ref_file.write(line + "\n")
            if count % 10000 == 0:
                print(count)

In [None]:
# Build the processed positive training file.
prepare_data_file(
    reference_file_path=POS_FILE_PATH,
    output_file_path=POS_OUT_FILE_PATH,
    type_token='pos',
)

In [None]:
# Build the processed negative training file.
prepare_data_file(
    reference_file_path=NEG_FILE_PATH,
    output_file_path=NEG_OUT_FILE_PATH,
    type_token='neg',
)

In [40]:
# Validation (dev) split: raw inputs, then processed outputs.
POS_VAL_FILE_PATH, NEG_VAL_FILE_PATH = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('sentiment_dev_1.txt', 'sentiment_dev_0.txt')
)
POS_VAL_OUT_FILE_PATH, NEG_VAL_OUT_FILE_PATH = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('processed_sentiment_dev_1.txt', 'processed_sentiment_dev_0.txt')
)

# Test split: raw inputs and two generations of processed outputs.
POS_TEST_FILE_PATH, NEG_TEST_FILE_PATH = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('sentiment_test_1.txt', 'sentiment_test_0.txt')
)
POS_TEST_OUT_FILE_PATH, NEG_TEST_OUT_FILE_PATH = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('processed_sentiment_test_1.txt', 'processed_sentiment_test_0.txt')
)
POS_TEST_OUT_FILE_PATH1, NEG_TEST_OUT_FILE_PATH1 = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('processed_sentiment_test_1_1.txt', 'processed_sentiment_test_0_1.txt')
)

# Test sentences that survive the attribute filter.
POS_TEST_FILE_PATH_FILTER, NEG_TEST_FILE_PATH_FILTER = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('sentiment_test_filtered_1.txt', 'sentiment_test_filtered_0.txt')
)

In [None]:
# Build the processed positive validation file.
prepare_data_file(
    reference_file_path=POS_VAL_FILE_PATH,
    output_file_path=POS_VAL_OUT_FILE_PATH,
    type_token='pos',
)

In [None]:
# Build the processed negative validation file.
prepare_data_file(
    reference_file_path=NEG_VAL_FILE_PATH,
    output_file_path=NEG_VAL_OUT_FILE_PATH,
    type_token='neg',
)

In [27]:
# Positive test prompts. The same output path is written again by the
# prepare_test_data_file_v1 call further down.
prepare_test_data_file(
    reference_file_path=POS_TEST_FILE_PATH,
    output_file_path=POS_TEST_OUT_FILE_PATH1,
)

In [28]:
# Negative test prompts. The same output path is written again by the
# prepare_test_data_file_v1 call further down.
prepare_test_data_file(
    reference_file_path=NEG_TEST_FILE_PATH,
    output_file_path=NEG_TEST_OUT_FILE_PATH1,
    type_token='neg',
)

In [41]:
# Positive test split: prompts plus the matching filtered reference sentences.
prepare_test_data_file_v1(
    reference_file_path=POS_TEST_FILE_PATH,
    output_file_path=POS_TEST_OUT_FILE_PATH1,
    output_ref_file=POS_TEST_FILE_PATH_FILTER,
)

In [42]:
# Negative test split: prompts plus the matching filtered reference sentences.
prepare_test_data_file_v1(
    reference_file_path=NEG_TEST_FILE_PATH,
    output_file_path=NEG_TEST_OUT_FILE_PATH1,
    output_ref_file=NEG_TEST_FILE_PATH_FILTER,
    type_token='neg',
)

In [43]:
# Destinations for the merged (positive followed by negative) files.
(
    TRAIN_OUT_FILE_PATH,
    TEST_OUT_FILE_PATH,
    TEST_REF_FILE_PATH,
    VAL_OUT_FILE_PATH,
) = (
    os.path.join(os.getcwd(), file_name)
    for file_name in (
        'processed_sentiment_train.txt',
        'processed_sentiment_test_v1.txt',
        'sentiment_test_filtered.txt',
        'processed_sentiment_dev.txt',
    )
)

In [30]:
def concate_files(output_file_path, input_files):
    """Concatenate ``input_files``, in order, into ``output_file_path``.

    Args:
        output_file_path: Destination file; created or overwritten as UTF-8.
        input_files: Iterable of paths to UTF-8 text files, copied line by line
            in the order given. An empty iterable yields an empty output file.

    Raises:
        OSError: If the output or any input file cannot be opened.
    """
    with open(output_file_path, 'w', encoding='utf-8') as out_f:
        for file in input_files:
            # Decode with the same encoding used for the output; the platform
            # default may differ and mis-decode (or reject) non-ASCII text.
            with open(file, 'r', encoding='utf-8') as f1:
                out_f.writelines(f1)

In [None]:
# Merge the processed training files, positive first.
concate_files(
    output_file_path=TRAIN_OUT_FILE_PATH,
    input_files=[POS_OUT_FILE_PATH, NEG_OUT_FILE_PATH],
)

In [34]:
# Merge the test prompt files, positive first.
concate_files(
    output_file_path=TEST_OUT_FILE_PATH,
    input_files=[POS_TEST_OUT_FILE_PATH1, NEG_TEST_OUT_FILE_PATH1],
)

In [44]:
# Merge the filtered reference sentences in the same positive-then-negative
# order as the test prompts.
concate_files(
    output_file_path=TEST_REF_FILE_PATH,
    input_files=[POS_TEST_FILE_PATH_FILTER, NEG_TEST_FILE_PATH_FILTER],
)

In [None]:
# Merge the processed validation files, positive first.
concate_files(
    output_file_path=VAL_OUT_FILE_PATH,
    input_files=[POS_VAL_OUT_FILE_PATH, NEG_VAL_OUT_FILE_PATH],
)

In [None]:
# NOTE(review): np.zeros requires a `shape` argument, so this call raises
# TypeError as written. Looks like leftover scratch (or a deliberate stop for
# "Run All") — confirm and remove.
np.zeros()

In [35]:
# Show the merged test prompts (the file written to TEST_OUT_FILE_PATH by concate_files).
!cat processed_sentiment_test_v1.txt

<POS> <CON_START> very price but this one harder to use than my cusinart . <START>
<POS> <CON_START> replaced my pampered press with this one . <START>
<POS> <CON_START> and seems similar to apple products . <START>
<POS> <CON_START> i have many products ,  and i ve been pleased . <START>
<POS> <CON_START> it turned out just to replace what i had lost . <START>
<POS> <CON_START> the way these roll instead of sliding on the rod . <START>
<POS> <CON_START> they pile up in the closet and up in the . <START>
<POS> <CON_START> i had it a long time now and i still it . <START>
<POS> <CON_START> this offers really protection for the phone . <START>
<POS> <CON_START> this is one of the vacuums i ve ever come across . <START>
<POS> <CON_START> one minute per slice seems a rule of thumb so far . <START>
<POS> <CON_START> the metal is very and the non stick finish is . <START>
<POS> <CON_START> the flap also is and does not come loose at all . <START>
<POS> <CON_START> sharp cutting 

In [36]:
# Show amazon_test_result_v1.txt.
# NOTE(review): no visible cell writes this file — presumably the model's
# generations for the prompts above; confirm its origin.
!cat amazon_test_result_v1.txt

very good price but this one harder to use than my cusinart . <END> 
replaced my pampered chef press with this one . <END> 
quality and seems similar to apple products . <END> 
i have many oxo products , and i ve been pleased . <END> 
it turned out just to replace what i had lost . <END> 
love the way these roll instead of sliding on the rod . <END> 
they pile up in the closet and clean up in the dishwasher . <END> 
i had it a long time now and i still love it . <END> 
this offers really good protection for the phone . <END> 
this is one of the best vacuums i ve ever come across . <END> 
one minute per slice seems like a rule of thumb so far . <END> 
the metal is very sturdy and the non stick finish is great . <END> 
the flap also is sturdy and does not come loose at all . <END> 
sharp cutting edges make your knives look as good as they taste . <END> 
recently in the trash as the line wore out . <END> 
i have several of these , well worth it . <END> 
good charger , thou

In [45]:
from torchnlp.metrics import get_moses_multi_bleu

In [51]:
# Start from two independent empty lists; both are filled from files below.
hypotheses, reference = [], []

In [74]:
# Read the generated sentences and the reference sentences, one per line.
# Decode explicitly as UTF-8 instead of the platform default: the reference
# file is written as UTF-8 by concate_files.
# NOTE(review): assumes amazon_test_result_v1.txt is UTF-8 too — no visible
# cell writes it; confirm.
with open(os.path.join(os.getcwd(), './amazon_test_result_v1.txt'), encoding='utf-8') as fp1:
    hypotheses = fp1.readlines()
with open(TEST_REF_FILE_PATH, encoding='utf-8') as fp1:
    reference = fp1.readlines()

In [75]:
# Remove surrounding whitespace (trailing newline included) from every reference line.
reference = [ref_line.strip() for ref_line in reference]
reference

['very good price but this one harder to use than my cusinart .',
 'replaced my pampered chef garlic press with this one .',
 'durable and seems similar quality to apple products .',
 'i have many oxo products ,  and i ve always been pleased .',
 'it turned out just fine to replace what i had lost .',
 'love the way these roll instead of sliding on the rod .',
 'they pile up nice in the closet and clean up great in the dishwasher .',
 'i had it a long time now and i still love it .',
 'this offers really good protection for the phone .',
 'this is one of the best vacuums i ve ever come across .',
 'one minute per slice seems like a good rule of thumb so far .',
 'the metal is very sturdy and the non stick finish is great .',
 'the flap also is good and does not come loose at all .',
 'sharp cutting edges make your pies look as good as they taste .',
 'recently threw in the trash as the line wore out .',
 'i have several of these ,  well worth it .',
 'great charger ,  though there may 

In [78]:
# Remove the '<END>' marker first and strip afterwards. Stripping before the
# replacement left the space that preceded '<END>' at the end of every
# hypothesis.
hypotheses = [hyp_line.replace('<END>', '').strip() for hyp_line in hypotheses]
hypotheses

['very good price but this one harder to use than my cusinart . ',
 'replaced my pampered chef press with this one . ',
 'quality and seems similar to apple products . ',
 'i have many oxo products , and i ve been pleased . ',
 'it turned out just to replace what i had lost . ',
 'love the way these roll instead of sliding on the rod . ',
 'they pile up in the closet and clean up in the dishwasher . ',
 'i had it a long time now and i still love it . ',
 'this offers really good protection for the phone . ',
 'this is one of the best vacuums i ve ever come across . ',
 'one minute per slice seems like a rule of thumb so far . ',
 'the metal is very sturdy and the non stick finish is great . ',
 'the flap also is sturdy and does not come loose at all . ',
 'sharp cutting edges make your knives look as good as they taste . ',
 'recently in the trash as the line wore out . ',
 'i have several of these , well worth it . ',
 'good charger , though there may be cheaper options . ',
 'so ship

In [79]:
# BLEU score of the generated sentences against the references, computed by
# torchnlp's get_moses_multi_bleu with lowercasing enabled.
# NOTE(review): presumably hypotheses[i] is scored against reference[i] —
# confirm both lists have the same length and ordering.
get_moses_multi_bleu(hypotheses, reference, lowercase=True)

76.5

In [76]:
# Display the reference sentences again.
reference

['very good price but this one harder to use than my cusinart .',
 'replaced my pampered chef garlic press with this one .',
 'durable and seems similar quality to apple products .',
 'i have many oxo products ,  and i ve always been pleased .',
 'it turned out just fine to replace what i had lost .',
 'love the way these roll instead of sliding on the rod .',
 'they pile up nice in the closet and clean up great in the dishwasher .',
 'i had it a long time now and i still love it .',
 'this offers really good protection for the phone .',
 'this is one of the best vacuums i ve ever come across .',
 'one minute per slice seems like a good rule of thumb so far .',
 'the metal is very sturdy and the non stick finish is great .',
 'the flap also is good and does not come loose at all .',
 'sharp cutting edges make your pies look as good as they taste .',
 'recently threw in the trash as the line wore out .',
 'i have several of these ,  well worth it .',
 'great charger ,  though there may 

In [18]:
# Show processed_sentiment_test.txt.
# NOTE(review): no visible cell writes a file with this name — presumably left
# over from an earlier run; confirm.
!cat processed_sentiment_test.txt 

<POS> <CON_START> i ve had this thermometer for about num_num years . <START>
<POS> <CON_START> will update this review should it meet a similar fate to my previous wheel . <START>
<POS> <CON_START> this product does what it is suppose to do . <START>
<POS> <CON_START> i purchased this filter for a braun num_extend coffeemaker . <START>
<POS> <CON_START> very price but this one harder to use than my cusinart . <START>
<POS> <CON_START> so ,  to me this is pretty darn  instant  . <START>
<POS> <CON_START> replaced my pampered press with this one . <START>
<POS> <CON_START> i use this as a little office supply shelf . <START>
<POS> <CON_START> and seems similar to apple products . <START>
<POS> <CON_START> i have many products ,  and i ve been pleased . <START>
<POS> <CON_START> i also prefered the blade weight and thickness of the wustof . <START>
<POS> <CON_START> make sure the pins line up with the contacts . <START>
<POS> <CON_START> it turned out just to replace what i h

In [19]:
# Show amazon_test_result.txt.
# NOTE(review): no visible cell writes this file — presumably generations for
# processed_sentiment_test.txt; confirm its origin.
!cat amazon_test_result.txt

i ve had this thermometer for about num _ num years . <END> 
will update this review should it meet a similar fate to my previous wheel . <END> 
this product does what it is suppose to do . <END> 
i purchased this filter for a braun num _ extend coffeemaker . <END> 
very good price but this one harder to use than my cusinart . <END> 
so , to me this is pretty darn instant . <END> 
replaced my pampered chef press with this one . <END> 
i use this as a little office supply shelf . <END> 
quality and seems similar to apple products . <END> 
i have many oxo products , and i ve been pleased . <END> 
i also prefered the blade weight and thickness of the wustof . <END> 
make sure the pins line up with the contacts . <END> 
it turned out just to replace what i had lost . <END> 
love the way these roll instead of sliding on the rod . <END> 
product arrived on time and was wrapped well . <END> 
they pile up in the closet and clean up in the dishwasher . <END> 
i had it a long tim

In [20]:
# Count the lines in processed_sentiment_test.txt.
!cat processed_sentiment_test.txt | wc -l

1000


In [21]:
# Count the lines in amazon_test_result.txt for comparison with the prompt count.
!cat amazon_test_result.txt | wc -l

1000
