In [None]:
! pip install torch
! pip install torchtext==0.6.0
! pip install scapy

In [1]:
import os
import sys
import errno
import glob
import random
import numpy as np
from argparse import ArgumentParser
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchtext import data
from torchtext import datasets
from classifier import NLIModel
import spacy

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
spacy_en = spacy.load('en_core_web_sm')

# Define how to preprocess the text data
TEXT = data.Field(lower=True, sequential=True, tokenize=lambda text: [token.text for token in spacy_en.tokenizer(text)])
# Define how to process the labels
LABEL = data.Field(sequential=False, use_vocab=False, unk_token=None)

fields = [('premise', TEXT), ('hypothesis', TEXT), ('label', LABEL)]

In [4]:
train_data, validation_data = data.TabularDataset.splits(
        path='./',  # Directory of your CSV files
        train='train.csv', validation='dev.csv',
        format='csv',
        fields=fields,
        skip_header=True  # If your CSV has a header
)

# Build the vocabulary only for the TEXT field from the training set
TEXT.build_vocab(train_data, vectors="glove.840B.300d")
LABEL.build_vocab(train_data)

In [6]:
fields = [('premise', TEXT), ('hypothesis', TEXT)]

test_data = data.TabularDataset(
        path='./test.csv',  # Directory of your CSV files
        format='csv',
        fields=fields,
        skip_header=True  # If your CSV has a header
)

for i in range(3):
        print(vars(test_data.examples[i]))  # Print the first training example

{'premise': ['boy', 'wearing', 'red', 'hat', ',', 'blue', 'jacket', 'pushing', 'plow', 'in', 'snow', '.'], 'hypothesis': ['the', 'boy', 'is', 'surrounded', 'by', 'snow']}
{'premise': ['a', 'blond', 'woman', 'in', 'a', 'black', 'shirt', 'is', 'standing', 'behind', 'a', 'counter', '.'], 'hypothesis': ['the', 'woman', 'is', 'standing', '.']}
{'premise': ['three', 'people', 'in', 'uniform', 'are', 'outdoors', 'and', 'are', 'observing', 'a', 'scene', 'which', 'is', 'out', 'of', 'the', 'picture', '.'], 'hypothesis': ['uniformed', 'people', 'are', 'outside']}


In [7]:
# Create the test iterator
test_iter = data.Iterator(
    dataset=test_data,
    batch_size=128, # config.batch_size
    device=device,
    sort=False,  # No need to sort test data
    sort_within_batch=False,
    shuffle=False  # Do not shuffle test data
)

for batch in test_iter:
    premise = batch.premise
    hypothesis = batch.hypothesis

    # Since batch_size=1, you directly access the first (and only) item in the batch
    premise_sentence = ' '.join([TEXT.vocab.itos[idx] for idx in premise[0]])
    hypothesis_sentence = ' '.join([TEXT.vocab.itos[idx] for idx in hypothesis[0]])
    print("Premise:", premise_sentence)
    print("Hypothesis:", hypothesis_sentence)
    break

Premise: boy a three a a a a a three man a a a family the a two a a a the a a two a an three woman a a a two there a a a an a a a a four a an two a a a a a female the a a a two a two a four a a a a a there a lot a a men a a a a female a a a a a two a a boy three two a men a a a a five a a a the a a in a men a people a motorcycle a the a two a an a along a two three rider young phone the a a a a a a
Hypothesis: the the uniformed the a the a a people two a a there a the group no nobody there the there a a two a the some a a the fans people there a the a an the a a there four a an the a a a the a <unk> the a cyclists a the a two a some the the the the a children the people a a men a the a the the a a a a the two the a a three people a guys a a a a there the a a a a a the a the the people a camel a the the two someone the the along a two the horse the phone a the a a a a the


In [8]:
# Evaluate the best dev model
test_model = torch.load("best_HBMP_600D_devacc_71.93_epoch_3.pt", map_location=torch.device('cpu'))

In [9]:
test_model.eval()  # Set the model to evaluation mode


test_predictions = []

with torch.no_grad():  # Do not calculate gradients
    for batch in test_iter:
        
        # Move batch data to the correct device
        batch.premise, batch.hypothesis = batch.premise.to(device), batch.hypothesis.to(device)
        
        # Predict
        predictions = test_model(batch)
        
        # Convert predictions to labels
        predicted_labels = predictions.argmax(1)
        
        test_predictions.extend(predicted_labels.cpu().numpy())
  
print(len(test_predictions))

3302


In [12]:
# Save the predictions to a CSV file
import pandas as pd

# Convert the predictions list to a DataFrame
predictions_df = pd.DataFrame(test_predictions, columns=['prediction'])

# Save to a new CSV file
predictions_df.to_csv('./Group_9_B.csv', index=False)

print("Predictions saved to './Group_9_B.csv'")

Predictions saved to './Group_9_B.csv'


In [13]:
predictions_df

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,1
...,...
3297,0
3298,1
3299,1
3300,0
