In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Mar 25 14:32:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
# Install ipykernel into the runtime with pip (run as a shell command via `!`).
!pip install ipykernel



In [4]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.3 MB/s 
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 85.1 MB/s 
Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 94.0 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-1.8.0-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 30.1 MB/s 
Collecting tokenizers
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 75.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 53.1 MB/s 
Collecting seqeval
  Downloadi

In [1]:
import logging

import pandas as pd
from simpletransformers.t5 import T5Model, T5Args
import transformers
import torch

In [2]:
import tensorflow as tf

# Pick the CUDA device when PyTorch can see a GPU, otherwise fall back to CPU.
if torch.cuda.is_available():
    device_name = torch.device('cuda')
else:
    device_name = torch.device('cpu')

In [3]:
# Read in train and test data

def read_in_data(data_dir="/content/drive/MyDrive/UC Berkeley MIDS/W266/t5_dataset"):
    """Load the sampled Yelp feature/label CSV files.

    Args:
        data_dir: Directory holding the four ``*_sampled_yelp_data.csv`` files.
            Defaults to the original Google Drive location, so a call with no
            arguments reads exactly the same files as before.

    Returns:
        Tuple ``(x_train, y_train, x_test1, y_test1)`` of DataFrames. The
        train pair comes from the ``stage_1`` files and the test pair from the
        ``stage_2`` files.

    Raises:
        FileNotFoundError: If any of the four files is missing in ``data_dir``.
    """
    import os  # local import so the cell does not depend on the import cell

    def _load(file_name):
        # Every file is a comma-separated CSV whose first row is the header.
        return pd.read_csv(os.path.join(data_dir, file_name), sep=",", header=0)

    x_train = _load("x_train_stage_1_sampled_yelp_data.csv")
    y_train = _load("y_train_stage_1_sampled_yelp_data.csv")
    x_test1 = _load("x_train_stage_2_sampled_yelp_data.csv")
    y_test1 = _load("y_train_stage_2_sampled_yelp_data.csv")
    return x_train, y_train, x_test1, y_test1
# Load the four frames once; later cells slice x_train/y_train for training
# and x_test1/y_test1 for the evaluation batches.
x_train, y_train, x_test1, y_test1 = read_in_data()

In [4]:
max_train_rows = 80000
# Build the (prefix, input_text, target_text) frame from the first
# max_train_rows reviews and their star ratings (ratings stored as strings).
train_data = pd.DataFrame(x_train[:max_train_rows]['text'])
train_data['target_text'] = y_train[:max_train_rows]['stars'].astype(str)
train_data.insert(0, 'prefix', 'predict sentiment')
train_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
train_data = train_data.apply(lambda x: x.str.slice(0, 512))
train_data

Unnamed: 0,prefix,input_text,target_text
0,predict sentiment,RIDE RIDE RIDE!!! This is for all the bad mf'e...,5.0
1,predict sentiment,Only been here once and the vegan horchata pea...,5.0
2,predict sentiment,My teen found this place. We like our coffee b...,5.0
3,predict sentiment,My husband and I came here for an impromptu di...,3.0
4,predict sentiment,Great workout and the staff is great! I've be...,5.0
...,...,...,...
79995,predict sentiment,"Loved this place. Friendly staff, and awesome ...",5.0
79996,predict sentiment,I gotta put in a plug for Dr. Sam Epstein at t...,5.0
79997,predict sentiment,Great place for some awesome and a unique coff...,4.0
79998,predict sentiment,After learning everything I knew about Boston ...,4.0


In [5]:
max_test_rows = 1000
# Build the (prefix, input_text, target_text) frame from the first
# max_test_rows reviews and their star ratings (ratings stored as strings).
test_data = pd.DataFrame(x_test1[:max_test_rows]['text'])
test_data['target_text'] = y_test1[:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
0,predict sentiment,This place has an amazing happy hour!!! The fo...,5.0
1,predict sentiment,Was a little nervous about this place as it do...,5.0
2,predict sentiment,I love this place Sunday nights! I just hate t...,5.0
3,predict sentiment,"Good, almost great burgers and very good fries...",4.0
4,predict sentiment,First time in since the remodel. I am impresse...,5.0
...,...,...,...
995,predict sentiment,Great customer service! I needed papers notari...,5.0
996,predict sentiment,After living with the installed windows for ab...,1.0
997,predict sentiment,The main attraction here is the location. Went...,3.0
998,predict sentiment,A very good prime rib dinner made excellent by...,4.0


In [6]:
# Switch torch.multiprocessing to the 'file_system' sharing strategy.
# NOTE(review): presumably a workaround for "too many open files" errors with
# the default strategy; the ulimit line below looks like the alternative that
# was considered. Confirm before removing either.
torch.multiprocessing.set_sharing_strategy('file_system')
# Shell alternative (not executed): ulimit -n 640000
# Author's note: the training loss is cross-entropy.

In [7]:
# Start from the library defaults and override only what this run needs.
model_args = T5Args()

# Training: a single epoch, nothing saved, output directory may be overwritten.
model_args.num_train_epochs = 1
model_args.overwrite_output_dir = True
model_args.no_save = True
model_args.use_multiprocessing = False

# Evaluation: generate text at eval time, but do not evaluate while training.
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = False
model_args.evaluate_during_training_verbose = True

# Options that were tried and left disabled:
#model_args.train_batch_size = 300
#model_args.tokens_per_batch=512
#model_args.max_source_len = 200

# Use the GPU only when PyTorch reports one.
cuda_available = torch.cuda.is_available()
model = T5Model(
    "t5",
    "t5-base",
    args=model_args,
    use_cuda=cuda_available,
    Truncate=True,
)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

In [9]:
def count_matches(labels, preds):
    """Count exact label/prediction matches and save the predictions to disk.

    Used in this notebook as the ``matches`` metric passed to
    ``model.eval_model``.

    Side effects:
        * Prints ``labels`` and ``preds`` in full, then the match count, the
          number of labels and the accuracy.
        * Overwrites ``outputs.csv`` in the working directory with ``preds``
          (one column, no index), so each call replaces the previous file.

    Args:
        labels: Sequence of expected target strings.
        preds: Sequence of predicted strings, compared pairwise with
            ``labels``; extra items in the longer sequence are ignored.

    Returns:
        int: Number of positions where the label equals the prediction.
    """
    print(labels)
    print(preds)
    #df_matches_dict = {'Actual':labels, "Predicted":preds}
    df_matches = pd.DataFrame(preds)
    df_matches.to_csv('outputs.csv', index=False)
    # Compute the count once and reuse it for both the report and the return.
    matches = sum(1 if label == pred else 0 for label, pred in zip(labels, preds))
    totalLabels = len(labels)
    print("Matches: ", matches)
    print("Total labels: ", totalLabels)
    if totalLabels:
        print("Val accuracy: ", matches/totalLabels)
    else:
        # No labels: accuracy is undefined, so avoid dividing by zero.
        print("Val accuracy: ", "n/a (no labels)")
    return matches

In [10]:
# Trial with 80000 train rows and 1000 test rows
# Fine-tune on train_data, then evaluate on the current test_data.
# count_matches is passed as the extra `matches` metric; it prints the labels
# and predictions and overwrites outputs.csv, so the next cell must read that
# file before any further evaluation.
model.train_model(train_data)
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/80000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/1000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/125 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1000 [00:00<?, ?it/s]

['5.0', '5.0', '5.0', '4.0', '5.0', '1.0', '5.0', '5.0', '4.0', '4.0', '4.0', '3.0', '5.0', '5.0', '2.0', '4.0', '4.0', '5.0', '3.0', '5.0', '5.0', '5.0', '4.0', '5.0', '3.0', '3.0', '5.0', '5.0', '5.0', '1.0', '4.0', '5.0', '1.0', '5.0', '5.0', '5.0', '4.0', '4.0', '4.0', '5.0', '5.0', '5.0', '4.0', '4.0', '4.0', '5.0', '5.0', '4.0', '1.0', '1.0', '4.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '3.0', '4.0', '4.0', '2.0', '5.0', '1.0', '5.0', '5.0', '2.0', '5.0', '5.0', '2.0', '5.0', '5.0', '5.0', '1.0', '2.0', '5.0', '4.0', '1.0', '4.0', '4.0', '4.0', '2.0', '4.0', '5.0', '4.0', '4.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '2.0', '5.0', '5.0', '4.0', '2.0', '1.0', '5.0', '3.0', '3.0', '4.0', '5.0', '5.0', '4.0', '3.0', '5.0', '3.0', '3.0', '2.0', '1.0', '3.0', '5.0', '4.0', '3.0', '5.0', '5.0', '5.0', '5.0', '5.0', '1.0', '5.0', '3.0', '2.0', '5.0', '1.0', '5.0', '4.0', '5.0', '2.0', '1.0', '5.0', '3.0', '1.0', '2.0', '1.0', '5.0', '3.0', '3.0', '4.0', '5.0', '3.0', '5.0'

In [12]:
# Snapshot the predictions for rows 0-999 from outputs.csv (the file written
# by count_matches); the file is overwritten by the next evaluation.
outputs1k = pd.read_csv('outputs.csv')

In [39]:
# Start the accumulated predictions frame from the first (1,000-row) batch.
# NOTE(review): the execution count shows this cell ran after the evaluation
# cells that follow it in the notebook; confirm the intended order.
finalOutput = pd.DataFrame(outputs1k)

In [14]:
min_test_rows = 1000
max_test_rows = 10000
# Build the (prefix, input_text, target_text) frame for rows
# [min_test_rows, max_test_rows) of the test split.
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
test_data['target_text'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
1000,predict sentiment,Stopped to get gas at Neil and Poplar food tru...,5.0
1001,predict sentiment,It's a Golds. It used to be a Western Wear st...,4.0
1002,predict sentiment,What an astounding experience! Mike is thorou...,5.0
1003,predict sentiment,I love Sweet Hut's bubble tea. It is delicious...,4.0
1004,predict sentiment,Great place! The food was really good! Definit...,5.0
...,...,...,...
9995,predict sentiment,"Delish!!! On the pricey side, but the food is...",4.0
9996,predict sentiment,We've lived on Rainey for nearly two decades a...,1.0
9997,predict sentiment,Our first experience here - it's stated as Veg...,4.0
9998,predict sentiment,"So yeah, another painful hipster ""impression"" ...",1.0


In [15]:
# Evaluate the fine-tuned model on the current test_data slice (rows
# 1000-9999). count_matches is passed as the extra `matches` metric and
# overwrites outputs.csv with this batch's predictions.
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/9000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1125 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1125 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/9000 [00:00<?, ?it/s]

['5.0', '4.0', '5.0', '4.0', '5.0', '3.0', '4.0', '4.0', '4.0', '5.0', '5.0', '5.0', '4.0', '5.0', '3.0', '1.0', '5.0', '5.0', '1.0', '5.0', '4.0', '1.0', '1.0', '4.0', '2.0', '5.0', '4.0', '4.0', '3.0', '4.0', '5.0', '5.0', '5.0', '1.0', '5.0', '3.0', '5.0', '5.0', '4.0', '3.0', '1.0', '5.0', '3.0', '4.0', '5.0', '4.0', '2.0', '1.0', '5.0', '5.0', '4.0', '1.0', '4.0', '4.0', '5.0', '4.0', '5.0', '3.0', '4.0', '5.0', '1.0', '5.0', '2.0', '3.0', '3.0', '5.0', '3.0', '5.0', '5.0', '4.0', '3.0', '4.0', '5.0', '5.0', '5.0', '5.0', '5.0', '5.0', '5.0', '4.0', '3.0', '5.0', '2.0', '5.0', '4.0', '1.0', '5.0', '4.0', '4.0', '1.0', '2.0', '5.0', '4.0', '2.0', '4.0', '5.0', '4.0', '5.0', '4.0', '4.0', '5.0', '3.0', '5.0', '1.0', '4.0', '3.0', '5.0', '5.0', '4.0', '3.0', '4.0', '4.0', '5.0', '5.0', '5.0', '2.0', '5.0', '5.0', '4.0', '1.0', '4.0', '5.0', '5.0', '4.0', '4.0', '5.0', '5.0', '5.0', '2.0', '3.0', '5.0', '5.0', '4.0', '1.0', '1.0', '2.0', '2.0', '4.0', '1.0', '5.0', '1.0', '3.0', '5.0'

In [16]:
# Snapshot the predictions for rows 1000-9999 from outputs.csv (the file
# written by count_matches); the file is overwritten by the next evaluation.
outputs10k = pd.read_csv('outputs.csv')

In [40]:
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. Each batch keeps its own row labels, as before.
# NOTE: not idempotent; re-running this cell adds the batch a second time.
finalOutput = pd.concat([finalOutput, outputs10k])

In [21]:
min_test_rows = 10000
max_test_rows = 20000
# Build the (prefix, input_text, target_text) frame for rows
# [min_test_rows, max_test_rows) of the test split.
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
test_data['target_text'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
10000,predict sentiment,Yummy French food and cozy atmosphere! From th...,5.0
10001,predict sentiment,Great owner and thrilled that they have gluten...,5.0
10002,predict sentiment,STOP THE PRESS!...I work at a local hotel and ...,5.0
10003,predict sentiment,"Most taquerias have some sort of veggie taco, ...",4.0
10004,predict sentiment,"First and foremost, commenting on the parking ...",4.0
...,...,...,...
19995,predict sentiment,I can not over emphasize what a positive exper...,5.0
19996,predict sentiment,I came into Atlas for the first time during Co...,5.0
19997,predict sentiment,I can't believe my meal had zero calories :-)....,5.0
19998,predict sentiment,"I've been here several times, but this was eas...",4.0


In [22]:
# Evaluate the fine-tuned model on the current test_data slice (rows
# 10000-19999). count_matches is passed as the extra `matches` metric and
# overwrites outputs.csv with this batch's predictions.
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['5.0', '5.0', '5.0', '4.0', '4.0', '3.0', '4.0', '5.0', '4.0', '2.0', '4.0', '1.0', '4.0', '4.0', '4.0', '5.0', '5.0', '5.0', '5.0', '2.0', '4.0', '2.0', '5.0', '3.0', '5.0', '1.0', '4.0', '5.0', '5.0', '5.0', '5.0', '1.0', '5.0', '3.0', '2.0', '5.0', '4.0', '2.0', '3.0', '1.0', '1.0', '2.0', '5.0', '4.0', '5.0', '2.0', '5.0', '5.0', '5.0', '1.0', '5.0', '1.0', '5.0', '5.0', '1.0', '1.0', '3.0', '5.0', '3.0', '4.0', '1.0', '1.0', '3.0', '5.0', '5.0', '5.0', '1.0', '5.0', '5.0', '4.0', '1.0', '4.0', '5.0', '5.0', '5.0', '2.0', '4.0', '4.0', '5.0', '4.0', '1.0', '1.0', '4.0', '3.0', '5.0', '5.0', '5.0', '2.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '5.0', '5.0', '3.0', '4.0', '5.0', '4.0', '1.0', '5.0', '1.0', '4.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '1.0', '3.0', '5.0', '5.0', '1.0', '5.0', '5.0', '2.0', '1.0', '4.0', '3.0', '5.0', '5.0', '1.0', '5.0', '5.0', '3.0', '4.0', '4.0', '5.0', '5.0', '5.0', '1.0', '5.0', '5.0', '4.0', '1.0', '2.0', '5.0', '4.0', '4.0', '5.0', '4.0'

In [None]:
# Snapshot the predictions for rows 10000-19999 from outputs.csv (the file
# written by count_matches); the file is overwritten by the next evaluation.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# outputs20k is used by a later cell. Confirm it was run at the right moment.
outputs20k = pd.read_csv('outputs.csv')


In [41]:
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. Each batch keeps its own row labels, as before.
# NOTE: not idempotent; re-running this cell adds the batch a second time.
finalOutput = pd.concat([finalOutput, outputs20k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,5.0
9996,4.0
9997,5.0
9998,4.0


In [28]:
min_test_rows = 20000
max_test_rows = 30000
# Build the (prefix, input_text, target_text) frame for rows
# [min_test_rows, max_test_rows) of the test split.
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
test_data['target_text'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
20000,predict sentiment,"Jeff is the best, super friendly and great wit...",5.0
20001,predict sentiment,Probably the worst dealership I have ever gone...,1.0
20002,predict sentiment,New Hong Kong- the bane of every college stude...,2.0
20003,predict sentiment,I don't understand why people are so angry in ...,5.0
20004,predict sentiment,"The place is very low key, the service was gre...",5.0
...,...,...,...
29995,predict sentiment,$1 Oysters always deserve a high rating! \n\nI...,4.0
29996,predict sentiment,Horrible Customer Service. I was home in Bost...,1.0
29997,predict sentiment,My son LOVES their beans in a bean and cheese ...,3.0
29998,predict sentiment,Friendly couple working this truck next to Reg...,4.0


In [29]:
# Evaluate the fine-tuned model on the current test_data slice (rows
# 20000-29999). count_matches is passed as the extra `matches` metric and
# overwrites outputs.csv with this batch's predictions.
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['5.0', '1.0', '2.0', '5.0', '5.0', '5.0', '5.0', '3.0', '3.0', '5.0', '1.0', '3.0', '5.0', '5.0', '1.0', '4.0', '5.0', '2.0', '1.0', '4.0', '4.0', '5.0', '1.0', '1.0', '4.0', '4.0', '4.0', '5.0', '4.0', '4.0', '3.0', '1.0', '5.0', '5.0', '5.0', '5.0', '1.0', '5.0', '3.0', '5.0', '1.0', '3.0', '4.0', '5.0', '1.0', '5.0', '4.0', '4.0', '5.0', '4.0', '3.0', '5.0', '4.0', '5.0', '5.0', '5.0', '1.0', '5.0', '5.0', '5.0', '5.0', '1.0', '5.0', '5.0', '1.0', '4.0', '3.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '5.0', '1.0', '5.0', '2.0', '1.0', '5.0', '2.0', '4.0', '5.0', '5.0', '4.0', '1.0', '5.0', '5.0', '5.0', '4.0', '4.0', '4.0', '5.0', '5.0', '4.0', '3.0', '5.0', '5.0', '5.0', '1.0', '5.0', '5.0', '5.0', '1.0', '1.0', '4.0', '1.0', '5.0', '5.0', '5.0', '5.0', '2.0', '1.0', '4.0', '4.0', '1.0', '1.0', '5.0', '2.0', '3.0', '4.0', '5.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '1.0', '1.0', '4.0', '4.0', '3.0', '2.0', '2.0', '2.0', '4.0', '1.0', '4.0', '5.0', '4.0', '1.0', '5.0'

In [30]:
# Snapshot the predictions for rows 20000-29999 from outputs.csv (the file
# written by count_matches); the file is overwritten by the next evaluation.
outputs30k = pd.read_csv('outputs.csv')


Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,4.0
9996,1.0
9997,4.0
9998,4.0


In [42]:
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. Each batch keeps its own row labels, as before.
# NOTE: not idempotent; re-running this cell adds the batch a second time.
finalOutput = pd.concat([finalOutput, outputs30k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,4.0
9996,1.0
9997,4.0
9998,4.0


In [31]:
min_test_rows = 30000
max_test_rows = 40000
# Build the (prefix, input_text, target_text) frame for rows
# [min_test_rows, max_test_rows) of the test split.
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
test_data['target_text'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
30000,predict sentiment,Since Violet Taco isn't open weekdays for brea...,2.0
30001,predict sentiment,All I have to say is ... GO!!! You will not be...,5.0
30002,predict sentiment,"New Taiwanese restaurant opened May 9th, next ...",4.0
30003,predict sentiment,YES. My go to crawfish place. I always get the...,5.0
30004,predict sentiment,I love this place. It's like a small sanctuary...,5.0
...,...,...,...
39995,predict sentiment,I just came for drinks and would highly recomm...,4.0
39996,predict sentiment,I've been going to AHY on and off for about 3 ...,3.0
39997,predict sentiment,Oh my God. Worse clam chowder I've ever had. S...,2.0
39998,predict sentiment,Went here three times in one week. Pizza slice...,1.0


In [32]:
# Evaluate the fine-tuned model on the current test_data slice (rows
# 30000-39999). count_matches is passed as the extra `matches` metric and
# overwrites outputs.csv with this batch's predictions.
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['2.0', '5.0', '4.0', '5.0', '5.0', '4.0', '4.0', '5.0', '2.0', '5.0', '2.0', '1.0', '5.0', '4.0', '3.0', '5.0', '1.0', '4.0', '3.0', '5.0', '2.0', '1.0', '5.0', '1.0', '5.0', '5.0', '4.0', '5.0', '2.0', '3.0', '4.0', '5.0', '5.0', '4.0', '5.0', '4.0', '1.0', '3.0', '1.0', '5.0', '5.0', '5.0', '1.0', '5.0', '2.0', '5.0', '3.0', '5.0', '4.0', '4.0', '5.0', '4.0', '4.0', '5.0', '5.0', '2.0', '5.0', '5.0', '2.0', '3.0', '5.0', '5.0', '3.0', '5.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '5.0', '2.0', '1.0', '3.0', '4.0', '3.0', '5.0', '5.0', '1.0', '3.0', '5.0', '2.0', '1.0', '5.0', '4.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '3.0', '2.0', '5.0', '3.0', '5.0', '5.0', '5.0', '4.0', '4.0', '4.0', '5.0', '5.0', '4.0', '5.0', '1.0', '5.0', '5.0', '4.0', '4.0', '4.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '3.0', '5.0', '5.0', '4.0', '1.0', '5.0', '4.0', '3.0', '5.0', '4.0', '5.0', '1.0', '5.0', '1.0', '2.0', '4.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '3.0', '2.0', '4.0', '4.0'

In [33]:
# Snapshot the predictions for rows 30000-39999 from outputs.csv (the file
# written by count_matches); the file is overwritten by the next evaluation.
outputs40k = pd.read_csv('outputs.csv')


Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,4.0
9996,1.0
9997,4.0
9998,4.0


In [43]:
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. Each batch keeps its own row labels, as before.
# NOTE: not idempotent; re-running this cell adds the batch a second time.
finalOutput = pd.concat([finalOutput, outputs40k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,5.0
9996,4.0
9997,1.0
9998,3.0


In [34]:
min_test_rows = 40000
max_test_rows = 50000
# Build the (prefix, input_text, target_text) frame for rows
# [min_test_rows, max_test_rows) of the test split.
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
test_data['target_text'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
40000,predict sentiment,Came towards the end of the night for a Pineap...,4.0
40001,predict sentiment,This has become one of my favorite food trucks...,5.0
40002,predict sentiment,"This place needs a good cleaning, some fresh p...",3.0
40003,predict sentiment,Went here for a late brunch on a beautiful Sun...,5.0
40004,predict sentiment,I've been here for dinner and Sunday morning b...,4.0
...,...,...,...
49995,predict sentiment,I've had Blaze Pizza in my hometown before but...,5.0
49996,predict sentiment,Best donuts I've ever had! The chai flight is ...,5.0
49997,predict sentiment,Delicious food and amazing service. Even thou...,4.0
49998,predict sentiment,This place is a disgrace. I know that they're ...,1.0


In [35]:
# Evaluate the fine-tuned model on the current test_data slice (rows
# 40000-49999). count_matches is passed as the extra `matches` metric and
# overwrites outputs.csv with this batch's predictions.
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['4.0', '5.0', '3.0', '5.0', '4.0', '5.0', '3.0', '5.0', '2.0', '5.0', '2.0', '5.0', '3.0', '5.0', '4.0', '4.0', '1.0', '5.0', '4.0', '2.0', '2.0', '5.0', '5.0', '4.0', '2.0', '3.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '2.0', '4.0', '5.0', '5.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '5.0', '4.0', '1.0', '4.0', '3.0', '1.0', '1.0', '4.0', '5.0', '5.0', '4.0', '3.0', '1.0', '5.0', '3.0', '3.0', '2.0', '2.0', '3.0', '3.0', '5.0', '2.0', '2.0', '5.0', '2.0', '4.0', '4.0', '1.0', '5.0', '5.0', '2.0', '4.0', '5.0', '2.0', '3.0', '3.0', '2.0', '5.0', '4.0', '5.0', '2.0', '5.0', '3.0', '5.0', '3.0', '5.0', '5.0', '4.0', '5.0', '1.0', '3.0', '3.0', '3.0', '3.0', '3.0', '3.0', '5.0', '4.0', '4.0', '1.0', '5.0', '4.0', '2.0', '4.0', '1.0', '1.0', '2.0', '5.0', '5.0', '5.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '5.0', '2.0', '2.0', '5.0', '1.0', '4.0', '1.0', '5.0', '5.0', '5.0', '5.0', '4.0', '4.0', '5.0', '4.0', '5.0', '4.0', '2.0', '5.0', '5.0', '4.0'

In [None]:
# Snapshot the predictions for rows 40000-49999 from outputs.csv (the file
# written by count_matches); the file is overwritten by the next evaluation.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# outputs50k is used by a later cell. Confirm it was run at the right moment.
outputs50k = pd.read_csv('outputs.csv')


In [44]:
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. Each batch keeps its own row labels, as before.
# NOTE: not idempotent; re-running this cell adds the batch a second time.
finalOutput = pd.concat([finalOutput, outputs50k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,5.0
9996,5.0
9997,5.0
9998,1.0


In [37]:
min_test_rows = 50000
max_test_rows = 60000
# Build the (prefix, input_text, target_text) frame for rows
# [min_test_rows, max_test_rows) of the test split.
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
test_data['target_text'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# DataFrame.apply returns a new frame rather than modifying in place, so the
# result must be assigned back for the 512-character cap to take effect.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
50000,predict sentiment,"Massive, massive restaurant in Wellesley. Stop...",4.0
50001,predict sentiment,I called first this time to make sure they wer...,3.0
50002,predict sentiment,I have been getting my hair done at Michael's ...,5.0
50003,predict sentiment,This was very good. The sauce was very rich an...,4.0
50004,predict sentiment,Doughnut and cider pairing was fabulous. The p...,5.0
...,...,...,...
59995,predict sentiment,A decent hot-pot place in town. Shabu-zen will...,4.0
59996,predict sentiment,I'm pretty sure Sway is my favorite restaurant...,5.0
59997,predict sentiment,Five stars for the beer selection. \n\nFood is...,5.0
59998,predict sentiment,The hop on hop off trolley driver recommended ...,5.0


In [38]:
# Evaluate the fine-tuned model on the current test_data slice (rows
# 50000-59999). count_matches is passed as the extra `matches` metric and
# overwrites outputs.csv with this batch's predictions.
print(model.eval_model(test_data, matches=count_matches))

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['4.0', '3.0', '5.0', '4.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '4.0', '5.0', '4.0', '5.0', '1.0', '4.0', '4.0', '5.0', '3.0', '1.0', '3.0', '2.0', '5.0', '2.0', '5.0', '5.0', '3.0', '3.0', '5.0', '5.0', '5.0', '1.0', '4.0', '1.0', '4.0', '5.0', '5.0', '4.0', '5.0', '1.0', '4.0', '3.0', '5.0', '5.0', '5.0', '2.0', '1.0', '4.0', '5.0', '5.0', '5.0', '3.0', '5.0', '4.0', '4.0', '4.0', '4.0', '3.0', '1.0', '5.0', '1.0', '2.0', '1.0', '5.0', '1.0', '4.0', '5.0', '5.0', '1.0', '5.0', '4.0', '5.0', '5.0', '5.0', '2.0', '4.0', '1.0', '1.0', '4.0', '4.0', '1.0', '2.0', '5.0', '5.0', '2.0', '3.0', '5.0', '1.0', '5.0', '4.0', '1.0', '5.0', '1.0', '4.0', '5.0', '5.0', '1.0', '2.0', '4.0', '5.0', '4.0', '4.0', '5.0', '4.0', '4.0', '4.0', '2.0', '1.0', '1.0', '1.0', '4.0', '5.0', '4.0', '2.0', '4.0', '5.0', '3.0', '5.0', '2.0', '2.0', '3.0', '5.0', '4.0', '3.0', '5.0', '3.0', '5.0', '5.0', '4.0', '5.0', '4.0', '4.0', '4.0', '3.0', '1.0', '1.0', '1.0', '3.0', '3.0', '4.0', '5.0', '1.0', '1.0'

In [45]:
# Load the predictions saved in 'outputs.csv' and stack them under the ones
# accumulated so far.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent (same row order, original index kept).
# Not idempotent: re-running this cell stacks the same chunk a second time.
outputs60k = pd.read_csv('outputs.csv')
finalOutput = pd.concat([finalOutput, outputs60k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,4.0
9996,5.0
9997,5.0
9998,5.0


In [46]:
# Build the evaluation frame for rows [60000, 70000) of test set 1 with the
# three columns prefix / input_text / target_text that is handed to
# `model.eval_model` in the next cell.
min_test_rows = 60000
max_test_rows = 70000
test_data = pd.DataFrame(x_test1[min_test_rows:max_test_rows]['text'])
# Labels are stored as strings so they can serve as text targets.
test_data['stars'] = y_test1[min_test_rows:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# `DataFrame.apply` returns a new frame; the result used to be discarded, so
# the 512-character cap was never applied. Keep the truncated frame.
# NOTE(review): confirm the training data was capped the same way.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
60000,predict sentiment,"Really bad service and food, meat is not fresh...",1.0
60001,predict sentiment,"After spending a few days in the mountains, we...",5.0
60002,predict sentiment,This place is great. I just started doing Zumb...,5.0
60003,predict sentiment,"On our way to a vacation in Disney World, my g...",5.0
60004,predict sentiment,I went there for a Brazilian breakfast and I l...,2.0
...,...,...,...
69995,predict sentiment,See. The food is bad. If you are fine with Ind...,1.0
69996,predict sentiment,This and ALL circuses with live animals are TH...,1.0
69997,predict sentiment,The food seems to be pretty decent. I like the...,4.0
69998,predict sentiment,Fantastic place for brunch when you desire a p...,5.0


In [47]:
# Evaluate the current `test_data` chunk; `count_matches` is supplied under the
# keyword `matches`.
# NOTE(review): the next cell reads 'outputs.csv', presumably written as a side
# effect of this call -- confirm where that file is produced.
_chunk_eval = model.eval_model(test_data, matches=count_matches)
print(_chunk_eval)

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['1.0', '5.0', '5.0', '5.0', '2.0', '3.0', '1.0', '5.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '4.0', '4.0', '5.0', '5.0', '5.0', '5.0', '5.0', '3.0', '1.0', '1.0', '4.0', '2.0', '5.0', '4.0', '4.0', '1.0', '5.0', '4.0', '4.0', '1.0', '5.0', '5.0', '4.0', '2.0', '4.0', '5.0', '5.0', '1.0', '5.0', '5.0', '2.0', '5.0', '4.0', '5.0', '5.0', '5.0', '4.0', '5.0', '4.0', '4.0', '4.0', '5.0', '1.0', '4.0', '5.0', '2.0', '4.0', '5.0', '4.0', '4.0', '5.0', '5.0', '4.0', '3.0', '5.0', '5.0', '2.0', '5.0', '5.0', '5.0', '1.0', '5.0', '3.0', '3.0', '5.0', '5.0', '2.0', '2.0', '2.0', '5.0', '5.0', '5.0', '5.0', '4.0', '4.0', '2.0', '4.0', '5.0', '4.0', '4.0', '1.0', '1.0', '4.0', '5.0', '5.0', '5.0', '4.0', '1.0', '4.0', '3.0', '1.0', '1.0', '4.0', '5.0', '3.0', '4.0', '5.0', '4.0', '4.0', '2.0', '4.0', '2.0', '5.0', '2.0', '4.0', '4.0', '2.0', '1.0', '4.0', '5.0', '1.0', '5.0', '5.0', '5.0', '3.0', '5.0', '2.0', '3.0', '3.0', '4.0', '5.0', '4.0', '3.0', '3.0', '5.0', '1.0', '5.0', '5.0', '5.0'

In [48]:
# Load the predictions saved in 'outputs.csv' and stack them under the ones
# accumulated so far.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent (same row order, original index kept).
# Not idempotent: re-running this cell stacks the same chunk a second time.
outputs70k = pd.read_csv('outputs.csv')
finalOutput = pd.concat([finalOutput, outputs70k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,4.0
9996,5.0
9997,4.0
9998,5.0


In [49]:
# Build the evaluation frame for rows 70000 onward of test set 1 with the
# three columns prefix / input_text / target_text that is handed to
# `model.eval_model` in the next cell.
min_test_rows = 70000
test_data = pd.DataFrame(x_test1[min_test_rows:]['text'])
# Labels are stored as strings so they can serve as text targets.
test_data['stars'] = y_test1[min_test_rows:]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# `DataFrame.apply` returns a new frame; the result used to be discarded, so
# the 512-character cap was never applied. Keep the truncated frame.
# NOTE(review): confirm the training data was capped the same way.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
70000,predict sentiment,I'm surprised about the bad reviews! This is m...,4.0
70001,predict sentiment,"Not a fan of Shrimp and Grits, but I would ord...",5.0
70002,predict sentiment,"Food delicious, staff great but... long... wai...",4.0
70003,predict sentiment,What a strange experience I had with Sixt Rent...,3.0
70004,predict sentiment,We were in town from Asheville to pick our son...,4.0
...,...,...,...
79995,predict sentiment,"It's delicious, most of the time. I don't like...",4.0
79996,predict sentiment,Cool little bar I stopped at while waiting on ...,5.0
79997,predict sentiment,I recently had a great stay at this hotel. The...,5.0
79998,predict sentiment,Tatsumi is located in the same plaza as Alamo ...,5.0


In [50]:
# Evaluate the current `test_data` chunk; `count_matches` is supplied under the
# keyword `matches`.
# NOTE(review): the next cell reads 'outputs.csv', presumably written as a side
# effect of this call -- confirm where that file is produced.
_chunk_eval = model.eval_model(test_data, matches=count_matches)
print(_chunk_eval)

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['4.0', '5.0', '4.0', '3.0', '4.0', '4.0', '5.0', '5.0', '5.0', '1.0', '5.0', '4.0', '3.0', '5.0', '4.0', '4.0', '4.0', '2.0', '4.0', '1.0', '4.0', '1.0', '5.0', '5.0', '5.0', '3.0', '5.0', '5.0', '3.0', '2.0', '1.0', '4.0', '3.0', '4.0', '5.0', '4.0', '5.0', '3.0', '5.0', '4.0', '4.0', '4.0', '5.0', '1.0', '5.0', '4.0', '1.0', '5.0', '5.0', '3.0', '3.0', '4.0', '4.0', '1.0', '5.0', '5.0', '2.0', '5.0', '2.0', '1.0', '3.0', '5.0', '2.0', '4.0', '5.0', '3.0', '5.0', '1.0', '5.0', '3.0', '4.0', '1.0', '5.0', '1.0', '4.0', '5.0', '5.0', '4.0', '5.0', '5.0', '4.0', '4.0', '5.0', '5.0', '2.0', '1.0', '5.0', '2.0', '5.0', '5.0', '5.0', '5.0', '4.0', '5.0', '3.0', '4.0', '5.0', '3.0', '5.0', '2.0', '5.0', '5.0', '4.0', '5.0', '5.0', '1.0', '1.0', '5.0', '5.0', '4.0', '4.0', '5.0', '5.0', '4.0', '5.0', '5.0', '4.0', '3.0', '4.0', '5.0', '4.0', '5.0', '5.0', '1.0', '3.0', '1.0', '5.0', '5.0', '5.0', '5.0', '3.0', '5.0', '1.0', '4.0', '4.0', '1.0', '4.0', '5.0', '4.0', '5.0', '1.0', '4.0', '3.0'

In [51]:
# Load the predictions saved in 'outputs.csv' and stack them under the ones
# accumulated so far.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent (same row order, original index kept).
# Not idempotent: re-running this cell stacks the same chunk a second time.
outputs80k = pd.read_csv('outputs.csv')
finalOutput = pd.concat([finalOutput, outputs80k])
finalOutput

Unnamed: 0,0
0,5.0
1,4.0
2,5.0
3,4.0
4,5.0
...,...
9995,5.0
9996,4.0
9997,5.0
9998,4.0


In [56]:
# Replace the stacked index with a fresh 0..n-1 range (old index dropped) and
# label the single column 'Predicted'. A frame with any other number of
# columns raises, exactly as direct assignment to `.columns` would.
finalOutput = finalOutput.reset_index(drop=True).set_axis(['Predicted'], axis=1)

In [58]:
# Attach the ground-truth ratings next to the predictions.
# Column assignment aligns on the index, so a row-count mismatch would be
# hidden (NaN-filled or silently dropped rows); fail loudly instead.
assert len(finalOutput) == len(y_test1), "prediction and label row counts differ"
# Select the 'stars' column explicitly (as the chunking cells do) rather than
# assigning a whole DataFrame to a single column.
finalOutput['Actual'] = y_test1['stars']

In [60]:
# Persist the combined frame to 'finalOutput.csv' in the working directory,
# leaving the row index out of the file.
finalOutput.to_csv(path_or_buf='finalOutput.csv', index=False)

In [61]:
# Read in test data 2

def read_in_data_test(data_dir="/content/drive/MyDrive/UC Berkeley MIDS/W266/t5_dataset"):
    """Load the second test set's feature and label CSV files.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``x_test_sampled_yelp_data_NEW.csv`` and
        ``y_test_sampled_yelp_data_NEW.csv``. Defaults to the Google Drive
        folder this notebook was originally run against, so calling the
        function without arguments behaves as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(x_test2, y_test2)`` -- each file parsed as comma-separated with the
        first row used as the header.
    """
    # Function-scope import keeps this cell self-contained.
    import os

    x_path = os.path.join(data_dir, "x_test_sampled_yelp_data_NEW.csv")
    y_path = os.path.join(data_dir, "y_test_sampled_yelp_data_NEW.csv")
    x_test2 = pd.read_csv(x_path, sep=",", header=0)
    y_test2 = pd.read_csv(y_path, sep=",", header=0)
    return x_test2, y_test2
# Load the second test set: features into x_test2, labels into y_test2.
(x_test2, y_test2) = read_in_data_test()

In [63]:
# Build the evaluation frame for the first 10000 rows of test set 2 with the
# three columns prefix / input_text / target_text that is handed to
# `model.eval_model` in the next cell.
max_test_rows = 10000
test_data = pd.DataFrame(x_test2[:max_test_rows]['text'])
# Labels are stored as strings so they can serve as text targets.
test_data['stars'] = y_test2[:max_test_rows]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# `DataFrame.apply` returns a new frame; the result used to be discarded, so
# the 512-character cap was never applied. Keep the truncated frame.
# NOTE(review): confirm the training data was capped the same way.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
0,predict sentiment,One of the best Avis First services I have got...,5.0
1,predict sentiment,"Pizza is excellent, crust can be a bit messy b...",3.0
2,predict sentiment,"Always fast, fresh and delicious. They've got ...",5.0
3,predict sentiment,Went in today to buy a gift and had the most p...,5.0
4,predict sentiment,Hmmmmm.....just looked over the reviews from o...,5.0
...,...,...,...
9995,predict sentiment,I LOVE this place. This is how dining and foo...,5.0
9996,predict sentiment,One of the best sushi places in Columbus. Serv...,5.0
9997,predict sentiment,"Overall, my experience was good but a young ch...",4.0
9998,predict sentiment,"The pizza is good, but the success has gone to...",2.0


In [64]:
# Evaluate the current `test_data` chunk; `count_matches` is supplied under the
# keyword `matches`.
# NOTE(review): the next cell reads 'outputs.csv', presumably written as a side
# effect of this call -- confirm where that file is produced.
_chunk_eval = model.eval_model(test_data, matches=count_matches)
print(_chunk_eval)

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['5.0', '3.0', '5.0', '5.0', '5.0', '5.0', '2.0', '4.0', '5.0', '5.0', '4.0', '2.0', '4.0', '5.0', '4.0', '5.0', '2.0', '5.0', '4.0', '5.0', '2.0', '5.0', '5.0', '5.0', '2.0', '3.0', '1.0', '4.0', '5.0', '2.0', '5.0', '2.0', '1.0', '5.0', '5.0', '5.0', '1.0', '2.0', '5.0', '1.0', '4.0', '2.0', '4.0', '2.0', '5.0', '4.0', '5.0', '2.0', '4.0', '4.0', '5.0', '2.0', '3.0', '3.0', '1.0', '5.0', '5.0', '2.0', '5.0', '4.0', '5.0', '1.0', '4.0', '3.0', '5.0', '5.0', '3.0', '5.0', '4.0', '3.0', '2.0', '2.0', '1.0', '1.0', '5.0', '5.0', '5.0', '1.0', '3.0', '5.0', '5.0', '3.0', '2.0', '1.0', '4.0', '5.0', '5.0', '3.0', '5.0', '4.0', '2.0', '5.0', '3.0', '4.0', '5.0', '5.0', '4.0', '1.0', '1.0', '5.0', '5.0', '5.0', '4.0', '5.0', '1.0', '5.0', '4.0', '4.0', '5.0', '5.0', '5.0', '5.0', '5.0', '1.0', '5.0', '1.0', '2.0', '5.0', '5.0', '5.0', '3.0', '3.0', '2.0', '1.0', '4.0', '5.0', '5.0', '3.0', '5.0', '4.0', '5.0', '5.0', '4.0', '1.0', '5.0', '3.0', '1.0', '2.0', '4.0', '3.0', '4.0', '5.0', '4.0'

In [65]:
# The first chunk of test set 2 seeds the accumulator: both names are bound to
# the same DataFrame read from 'outputs.csv' (no copy is made).
finalOutput2 = outputs10k = pd.read_csv('outputs.csv')
finalOutput2

Unnamed: 0,0
0,5.0
1,5.0
2,5.0
3,5.0
4,2.0
...,...
9995,5.0
9996,5.0
9997,4.0
9998,2.0


In [66]:
# Build the evaluation frame for rows 10000 onward of test set 2 with the
# three columns prefix / input_text / target_text that is handed to
# `model.eval_model` in the next cell.
min_test_rows = 10000
test_data = pd.DataFrame(x_test2[min_test_rows:]['text'])
# Labels are stored as strings so they can serve as text targets.
test_data['stars'] = y_test2[min_test_rows:]['stars'].astype(str)
test_data.insert(0, 'prefix', 'predict sentiment')
test_data.columns = ["prefix", "input_text", "target_text"]
# `DataFrame.apply` returns a new frame; the result used to be discarded, so
# the 512-character cap was never applied. Keep the truncated frame.
# NOTE(review): confirm the training data was capped the same way.
test_data = test_data.apply(lambda x: x.str.slice(0, 512))
test_data

Unnamed: 0,prefix,input_text,target_text
10000,predict sentiment,Great Food Truck!\nTheir street tacos are the ...,4.0
10001,predict sentiment,My fiancé and I frequent Punch Bowl Social bec...,4.0
10002,predict sentiment,"Pretty good tacos, however I'm not sure it's w...",3.0
10003,predict sentiment,We ate in the private room in the cellar- cool...,5.0
10004,predict sentiment,"Wow, I've never been pushed out the door of a ...",1.0
...,...,...,...
19995,predict sentiment,Very nice spot that overlooks the skyline espe...,3.0
19996,predict sentiment,If you are doing a kids birthday party this is...,1.0
19997,predict sentiment,I purchased 2 apple pies with a hint of blackb...,1.0
19998,predict sentiment,Everyone else has already said what a cool pla...,4.0


In [67]:
# Evaluate the current `test_data` chunk; `count_matches` is supplied under the
# keyword `matches`.
# NOTE(review): the next cell reads 'outputs.csv', presumably written as a side
# effect of this call -- confirm where that file is produced.
_chunk_eval = model.eval_model(test_data, matches=count_matches)
print(_chunk_eval)

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/10000 [00:00<?, ?it/s]

['4.0', '4.0', '3.0', '5.0', '1.0', '3.0', '5.0', '5.0', '4.0', '4.0', '5.0', '5.0', '2.0', '5.0', '5.0', '2.0', '5.0', '5.0', '4.0', '1.0', '4.0', '5.0', '3.0', '5.0', '1.0', '1.0', '2.0', '4.0', '1.0', '4.0', '5.0', '3.0', '2.0', '5.0', '5.0', '5.0', '1.0', '3.0', '5.0', '5.0', '3.0', '1.0', '5.0', '1.0', '1.0', '4.0', '4.0', '5.0', '5.0', '1.0', '5.0', '4.0', '1.0', '2.0', '1.0', '1.0', '5.0', '5.0', '5.0', '5.0', '5.0', '5.0', '3.0', '5.0', '4.0', '5.0', '5.0', '5.0', '5.0', '3.0', '1.0', '5.0', '1.0', '4.0', '1.0', '1.0', '4.0', '5.0', '5.0', '5.0', '5.0', '4.0', '1.0', '4.0', '5.0', '4.0', '4.0', '4.0', '5.0', '5.0', '5.0', '5.0', '4.0', '5.0', '4.0', '5.0', '5.0', '4.0', '2.0', '5.0', '5.0', '4.0', '5.0', '5.0', '5.0', '4.0', '1.0', '1.0', '3.0', '5.0', '5.0', '5.0', '1.0', '5.0', '2.0', '4.0', '4.0', '1.0', '4.0', '1.0', '4.0', '3.0', '4.0', '4.0', '4.0', '4.0', '5.0', '1.0', '5.0', '5.0', '1.0', '5.0', '4.0', '5.0', '4.0', '5.0', '1.0', '3.0', '5.0', '5.0', '3.0', '3.0', '4.0'

In [68]:
# Load the predictions saved in 'outputs.csv' and stack them under the first
# chunk of test set 2.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent (same row order, original index kept).
# Not idempotent: re-running this cell stacks the same chunk a second time.
outputs20k = pd.read_csv('outputs.csv')
finalOutput2 = pd.concat([finalOutput2, outputs20k])
finalOutput2

Unnamed: 0,0
0,5.0
1,5.0
2,5.0
3,5.0
4,2.0
...,...
9995,4.0
9996,1.0
9997,1.0
9998,5.0


In [69]:
# Finalise test set 2: renumber the stacked rows 0..n-1, name the prediction
# column, attach the true labels, and save to disk.
finalOutput2 = finalOutput2.reset_index(drop=True)
finalOutput2.columns = ['Predicted']
# Column assignment aligns on the index, so a row-count mismatch would be
# hidden (NaN-filled or silently dropped rows); fail loudly instead.
assert len(finalOutput2) == len(y_test2), "prediction and label row counts differ"
# Select the 'stars' column explicitly (as the chunking cells do) rather than
# assigning a whole DataFrame to a single column.
finalOutput2['Actual'] = y_test2['stars']
finalOutput2.to_csv('finalOutput_test20k.csv', index=False)

In [70]:
# Display the Predicted / Actual frame for test set 2 as the cell's output.
finalOutput2

Unnamed: 0,Predicted,Actual
0,5.0,5.0
1,5.0,3.0
2,5.0,5.0
3,5.0,5.0
4,2.0,5.0
...,...,...
19995,4.0,3.0
19996,1.0,1.0
19997,1.0,1.0
19998,5.0,4.0
