In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
  print("Using the GPU!")
  torch.backends.cudnn.enabled = True
  torch.backends.cudnn.benchmark = True
else:
  print("WARNING: Could not find GPU! Using CPU only. If you want to enable GPU, please to go Edit > Notebook Settings > Hardware Accelerator and select GPU.")


Using the GPU!


In [3]:
try:
  from transformers import AutoTokenizer, AutoModel
except:
  !pip install transformers
  from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import numpy as np
import json
from glob import glob
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import random
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import json

path = '/content/drive/Shareddrives/EECS595-Fall2020/Final_Project_Common/Conversational_Entailment'

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 14.0MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 50.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 54.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=504053e935fc

# Conversational Entailment


There are 520 samples in this dataset.

The binary label indicates if this story is plausible. If lable is 0, this story is implausible. In this case, the breakpoint will not be -1.

In [4]:
max_length = 0

with open(path + '/dev_set.json') as json_file:
  dataset = json.load(json_file)

dialogue_acts_ds = dict()
with open(path + "/act_tag.json") as tag_file:
  dialogue_acts = json.load(tag_file)

for meta in dialogue_acts:
  dialogue_acts_ds[meta["id"]] = meta["items"]

dataset[:3]
item = dataset[1]['items'][0]
item = item['items'][0]
"Speaker" + item['speaker'] + ": " + item['text']

dataset_df = pd.DataFrame(columns=['sentence1', 'sentence2', 'label', 'idx'])
for j_obj in dataset:
  meta_data = dialogue_acts_ds[j_obj["id"]]

  # sentence1 = "["
  sentence1 = ""
  items = j_obj['items']
  conv = items[0]['items']
  for i, dl in enumerate(meta_data):
    # for dl in items[0]['items']:
      # sentence1 = sentence1 + "Speaker" + dl['speaker'] + ": " + dl['text'] + " "
    dialoge = "Speaker" + conv[i]['speaker'] + ": "
    # dialoge = "Speaker" + conv[i]['speaker'] + ":[ "
    tags = dl["act_tag"]
    text = dl["text"]
    
    for j, tg in enumerate(tags):
      #  dialoge = dialoge + " [ " + text[j] + ", " + tg + " ]"
       dialoge = dialoge + text[j][:-1] + "{T: " + tg + "}" + "/ "
      #  if j != len(tags) -1:
      #    dialoge = dialoge + ", "
    
    # dialoge = dialoge + " ]}"
    # dialoge = dialoge + " ]"
    # if i != len(meta_data) -1:
    #   dialoge = dialoge + ", "
    sentence1 = sentence1 + dialoge

  # sentence1 = sentence1 + " ]"
  # print("sen1: ", sentence1)
  row = pd.DataFrame(
    {"sentence1": [sentence1],
      "sentence2": [items[-1]['text']],
      "label": int(j_obj['entailment']),
      "idx": [j_obj['id']]})
  dataset_df = pd.concat([dataset_df, row])
  if len(sentence1.strip().split()) > max_length:
    max_length = len(sentence1.strip().split())

max_length

# dataset_df.iloc[4]['sentence1']
# dataset_df.iloc[4]['sentence2']

415

In [5]:
dataset_df_1 = dataset_df[dataset_df["label"] == 1]
dataset_df_0 = dataset_df[dataset_df["label"] == 0]
print(dataset_df_1.shape)
print(dataset_df_0.shape)

(282, 4)
(238, 4)


Split the dataset for train, validation, test

In [6]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(dataset_df, test_size=0.2, random_state=42, shuffle=True)
# df_test, df_val = train_test_split(test, test_size=0.5, random_state=1, shuffle=True)
df_val.shape, df_train.shape

((104, 4), (416, 4))

We need to reindex the dataframes and save them!

In [7]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
# df_train
# df_test = df_test.reset_index(drop=True)

## Preparing the dataset and dataloader for CE

In [8]:
class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='roberta-large-mnli'):

        self.data = data  # pandas dataframe
        #Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  

        self.maxlen = maxlen
        self.with_labels = with_labels 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        
        sent1 = str(self.data.iloc[index]["sentence1"])
        sent2 = str(self.data.iloc[index]["sentence2"])
        # if xxx == 0:
        #   print("id: ", self.data.iloc[index]["idx"])
        #   print(sent1, sent2)
        #   print(self.data.iloc[index]["label"])
        #   xxx = 1

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        # encoded_pair = self.tokenizer(sent1, sent2, 
        #                               padding='max_length',  # Pad to max_length
        #                               truncation=True,  # Truncate to max_length
        #                               max_length=self.maxlen,  
        #                               return_tensors='pt')  # Return torch.Tensor objects

        encoded_pair = self.tokenizer.encode_plus(
            sent1, sent2,
            add_special_tokens=True,
            max_length=self.maxlen,
            # pad_to_max_length=True,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        
        token_ids = encoded_pair['input_ids']  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'] # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded_pair['token_type_ids']  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        # return {
        #     'ids': torch.tensor(token_ids, dtype=torch.long),
        #     'mask': torch.tensor(attn_masks, dtype=torch.long),
        #     'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        #     'targets': torch.tensor(self.data.iloc[index]["label"], dtype=torch.long)
        # }

        if self.with_labels:  # True if the dataset has labels
            return {
                'ids': torch.tensor(token_ids, dtype=torch.long),
                'mask': torch.tensor(attn_masks, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'targets': torch.tensor(self.data.iloc[index]["label"], dtype=torch.long)
            } 
        else:
            return {
                'ids': torch.tensor(token_ids, dtype=torch.long),
                'mask': torch.tensor(attn_masks, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
            }

In [9]:
# maxlen = 512
maxlen = 448
bert_model = "roberta-large-mnli"
bs = 8
# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
# print("data issue: ", train_set.__getitem__(20))
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=0)
# for idx, (token_ids, attn_masks, token_type_ids, label) in enumerate(train_loader):
#   print(idx)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=0)

Reading training data...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=688.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…


Reading validation data...


## Train model

### Create our model based on pre-trained model

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of Roberta to get the final output for the model.
# from
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = AutoModel.from_pretrained("roberta-large-mnli")
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(1024, 2)
        
        torch.nn.init.xavier_uniform_(self.pre_classifier.weight)
        nn.init.constant_(self.pre_classifier.bias, 0)

        torch.nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # print('op1',type(output_1),output_1.shape)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output     

In [11]:
model = RobertaClass()
for i, para in enumerate(model.l1.parameters()):
  if i < 261:
    para.requires_grad = False
# from transformers import RobertaForSequenceClassification
# model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2, output_attentions = False, output_hidden_states = False)

model.to(device)
end = 1


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425744429.0, style=ProgressStyle(descr…




### Setting

In [12]:
epochs = 10
# Creating the loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)
lr = 2e-5
# set up optimizer and scheduler
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=1e-2)
# set up scheduler
total_steps = len(train_loader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [13]:
# model evaluation
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

### Helper functions

In [14]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [15]:
import random
# Set the seed value all over the place to make this reproducible.
def set_seed(seed_val):
  # seed_val = 42
  random.seed(seed_val)
  np.random.seed(seed_val)
  torch.manual_seed(seed_val)
  torch.cuda.manual_seed_all(seed_val)

### Training loop

#### Train

In [16]:
set_seed(42)
# Store the average loss after each epoch so we can plot them.
loss_values = []

best_val_loss = np.Inf
best_val_acc = 0.0
best_val_ep = 1

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_loader):
        # Progress update every 40 batches.
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        # Unpack this training batch from our dataloader. 
        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        b_labels = batch['targets'].to(device)

        
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        
        outputs = model(b_input_ids, b_input_mask, token_type_ids)
        # print(outputs.shape)
        loss = loss_fn(outputs, b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_loader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    # eval_loss, eval_accuracy = 0, 0
    # nb_eval_steps, nb_eval_examples = 0, 0
    n_correct = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0

    # Evaluate data for one epoch
    with torch.no_grad(): 
        for _, batch in enumerate(val_loader): 
            
            b_input_ids = batch['ids'].to(device)
            b_input_mask = batch['mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            b_labels = batch['targets'].to(device)
            
            outputs = model(b_input_ids, b_input_mask, token_type_ids)
            loss = loss_fn(outputs, b_labels)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, b_labels)

            nb_tr_steps += 1
            nb_tr_examples+=b_labels.size(0)
            
            # if _%5000==0:
            #     loss_step = tr_loss/nb_tr_steps
            #     accu_step = (n_correct*100)/nb_tr_examples
            #     print(f"Validation Loss per 100 steps: {loss_step}")
            #     print(f"Validation Accuracy per 100 steps: {accu_step}")
    val_loss = tr_loss/nb_tr_steps
    val_acc = (n_correct*100)/nb_tr_examples

    # Report the final accuracy for this validation run.
    print("  Accuracy: {}, loss: {}".format(val_acc, val_loss))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    if val_acc > best_val_acc:
      print("Best validation loss improved from {} to {}".format(best_val_loss, val_loss))
      best_val_loss = val_loss
      model_copy = copy.deepcopy(model)  # save a copy of the model
      best_val_acc = val_acc
      best_val_ep = epoch_i

print("")
print("Training complete!")


Training...
  Batch    10  of     52.    Elapsed: 0:00:07.
  Batch    20  of     52.    Elapsed: 0:00:13.
  Batch    30  of     52.    Elapsed: 0:00:20.
  Batch    40  of     52.    Elapsed: 0:00:26.
  Batch    50  of     52.    Elapsed: 0:00:33.

  Average training loss: 0.70
  Training epcoh took: 0:00:34

Running Validation...
  Accuracy: 74.03846153846153, loss: 0.5289399853119483
  Validation took: 0:00:05
Best validation loss improved from inf to 0.5289399853119483

Training...
  Batch    10  of     52.    Elapsed: 0:00:07.
  Batch    20  of     52.    Elapsed: 0:00:13.
  Batch    30  of     52.    Elapsed: 0:00:20.
  Batch    40  of     52.    Elapsed: 0:00:26.
  Batch    50  of     52.    Elapsed: 0:00:33.

  Average training loss: 0.63
  Training epcoh took: 0:00:34

Running Validation...
  Accuracy: 75.96153846153847, loss: 0.5408253016380163
  Validation took: 0:00:05
Best validation loss improved from 0.5289399853119483 to 0.5408253016380163

Training...
  Batch    10  of 

In [17]:
print("Creation of the results' folder...")
!mkdir models

Creation of the results' folder...


In [18]:
# Saving the model
path_to_model='models/{}_lr_{}_val_acc_{}_ep_{}_bs_{}.pt'.format(bert_model, lr, round(best_val_acc, 5), best_val_ep, bs)
torch.save(model_copy.state_dict(), path_to_model)
print("The model has been saved in {}".format(path_to_model))

The model has been saved in models/roberta-large-mnli_lr_2e-05_val_acc_79.80769_ep_5_bs_8.pt


### Test Model

#### Data Preparation

In [19]:
test_max_length = 0

with open(path + '/test_set_unlabeled.json') as json_file:
  test_dataset = json.load(json_file)

dialogue_acts_ds = dict()
with open(path + "/act_tag_test.json") as tag_file:
  test_dialogue_acts = json.load(tag_file)

for meta in test_dialogue_acts:
  dialogue_acts_ds[meta["id"]] = meta["items"]

test_dataset_df = pd.DataFrame(columns=['sentence1', 'sentence2', 'idx'])
for j_obj in test_dataset:
  meta_data = dialogue_acts_ds[j_obj["id"]]

  # sentence1 = "["
  sentence1 = ""
  items = j_obj['items']
  conv = items[0]['items']
  for i, dl in enumerate(meta_data):
    # for dl in items[0]['items']:
      # sentence1 = sentence1 + "Speaker" + dl['speaker'] + ": " + dl['text'] + " "
    dialoge = "Speaker" + conv[i]['speaker'] + ": "
    # dialoge = "Speaker" + conv[i]['speaker'] + ":[ "
    tags = dl["act_tag"]
    text = dl["text"]
    
    for j, tg in enumerate(tags):
      #  dialoge = dialoge + " [ " + text[j] + ", " + tg + " ]"
       dialoge = dialoge + text[j][:-1] + "{T: " + tg + "}" + "/ "
      #  if j != len(tags) -1:
      #    dialoge = dialoge + ", "
    
    # dialoge = dialoge + " ]}"
    # dialoge = dialoge + " ]"
    # if i != len(meta_data) -1:
    #   dialoge = dialoge + ", "
    sentence1 = sentence1 + dialoge

  # sentence1 = sentence1 + " ]"
  # print("sen1: ", sentence1)
  row = pd.DataFrame(
    {"sentence1": [sentence1],
      "sentence2": [items[-1]['text']],
      "idx": [j_obj['id']]})
  test_dataset_df = pd.concat([test_dataset_df, row])
  if len(sentence1.strip().split()) > test_max_length:
    test_max_length = len(sentence1.strip().split())
test_max_length

361

In [20]:
test_dataset_df = test_dataset_df.reset_index(drop=True)
test_dataset_df.head(5)
# test_dataset_df.iloc[17]["sentence1"]

Unnamed: 0,sentence1,sentence2,idx
0,"SpeakerB: {F Um, } actually have been to the m...",SpeakerB thought Misery was very suspenful.,117
1,"SpeakerA: Yeah, {T: b}/ {C but } [ [ what, + ...",SpeakerB thinks that the first step is finding...,713
2,"SpeakerA: [ That's, + {F uh, } that's, ] {F uh...",SpeakerA believes drugs are a big influence on...,481
3,"SpeakerB: Yeah, {T: b}/ [ I, + I, ] - {T: %}/...",SpeakerB believes racial issues must be dealt ...,299
4,SpeakerA: I hear this movie F X part two comin...,SpeakerA liked the movie FX,258


In [None]:
Cathy Smith, Richard Simmons, and Jane Fonda workouts are oldies

In [21]:
# maxlen = test_max_length
maxlen = 448
bert_model = "roberta-large-mnli"
batch_size = 64
# Creating instances of test set
print("Reading test data...")
test_set = CustomDataset(test_dataset_df, maxlen, with_labels=False, bert_model=bert_model)
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=0)

Reading test data...


In [22]:
def test_prediction(model, device, dataloader, result_file=""):
  probs_all = []

  with torch.no_grad():
    for _, batch in enumerate(test_loader):
        seq = batch['ids'].to(device)
        attn_masks = batch['mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        # seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
        logits = model(seq, attn_masks, token_type_ids)
        _, probs = torch.max(logits.data, dim=1)
        probs_all += probs.tolist()
  return probs_all

In [23]:
print("Creation of the results' folder...")
!mkdir results

Creation of the results' folder...


In [24]:
# path_to_model = '/content/models/roberta-large-mnli_lr_2e-05_val_acc_79.80769_ep_7_bs_8.pt' 
# path_to_model = '/content/models/roberta-large-mnli_lr_2e-05_val_acc_80.76923_ep_8_bs_8.pt' 
# path_to_model = '/content/models/...'  # You can add here your trained model

# path_to_output_file = 'results/output.txt'


# saved_model = RobertaClass()
# if torch.cuda.device_count() > 1:  # if multiple GPUs
#     print("Let's use", torch.cuda.device_count(), "GPUs!")
#     saved_model = nn.DataParallel(saved_model)

# print()
# print("Loading the weights of the model...")
# saved_model.load_state_dict(torch.load(path_to_model))
# saved_model.to(device)

print("Predicting on test data...")
test_preds = test_prediction(model=model_copy, device=device, dataloader=test_loader)  # set the with_labels parameter to False if your want to get predictions on a dataset without labels
print("Predictions: ")
print(test_preds)
# len(test_preds)
test_set_ids = test_dataset_df["idx"].values.tolist()
# print(test_set_ids)
# print(len(test_set_ids))

list_of_predictions = []
for indx, prediction in zip(test_set_ids, test_preds):
  d = dict()
  d["id"] = indx
  d["pred"] = prediction
  list_of_predictions.append(d)

with open('results/ConvEnt_9_preds.json', 'w') as fout:
    json.dump(list_of_predictions , fout)


# print("Predictions are available in : {}".format(path_to_output_file))

Predicting on test data...
Predictions: 
[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0