<a href="https://colab.research.google.com/github/AAP9002/COMP34812-NLU-NLI/blob/main/NLU_Method_C/NLI_Transformer_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer-based NLI Solution: Demo Notebook

**Instructions:**
1. Install dependencies (first cell).
2. Upload your `test.csv` file (with `premise` and `hypothesis` columns) to the working directory.
3. Run all cells top to bottom.
4. Predictions will be saved to `predictions.csv` for submission.

# Required Packages

In [1]:
!pip install pandas numpy tensorflow transformers huggingface_hub --quiet

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
from transformers import AutoTokenizer, TFRobertaModel, TFRobertaForSequenceClassification
from huggingface_hub import snapshot_download

In [3]:
# --- Model location on the Hugging Face Hub -------------------------------
MODEL_REPO = "aap9002/NLI-Transformer-Ensemble-Model"
# File name is kept exactly as published in the repo (including its spelling).
MODEL_FILE = "ensamble_model_weights_and_arch.h5"

# --- Input / output paths --------------------------------------------------
INPUT_FILE_PATH = "./test.csv"       # CSV read for inference
OUTPUT_CSV_FILE = "predictions.csv"  # CSV the predicted labels are written to

# --- Runtime settings ------------------------------------------------------
BATCH_SIZE = 16    # default batch size used when building the tf.data pipeline
RANDOM_STATE = 42  # seed value; not referenced by the cells visible here

# Download and Load Model

In [4]:
# Set HF_TOKEN in your environment.
# Only files matching the model file name are fetched from the repo, and they
# are placed in the current working directory. The call is left as the cell's
# last expression so its return value is displayed.
snapshot_download(
    repo_id=MODEL_REPO,
    local_dir="./",
    allow_patterns=f"*{MODEL_FILE}",
)

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

ensamble_model_weights_and_arch.h5:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

'/content'

In [5]:
# Restore the saved ensemble from the downloaded file. The Hugging Face
# RoBERTa classes are passed as custom objects so Keras can resolve them by
# name while deserializing the model.
ensemble_model = tf.keras.models.load_model(
    filepath=MODEL_FILE,
    custom_objects={
        "TFRobertaModel": TFRobertaModel,
        "TFRobertaForSequenceClassification": TFRobertaForSequenceClassification,
    },
)



# Prepare Test Data

In [6]:
# Read the evaluation CSV and preview its first rows.
test_set = pd.read_csv(filepath_or_buffer=INPUT_FILE_PATH)
test_set.head(n=5)

Unnamed: 0,premise,hypothesis
0,"Boy wearing red hat, blue jacket pushing plow ...",The boy is surrounded by snow
1,A blond woman in a black shirt is standing beh...,The woman is standing.
2,Three people in uniform are outdoors and are o...,Uniformed people are outside
3,"A person, in a striped blue shirt and pants, i...",The person is running
4,"A man, woman, and child get their picture take...",A family on vacation is posing.


In [7]:
# Two tokenizers are loaded: the roberta-large one and the roberta-base one.
# They are the default tokenizers of the dataset-building function below.
roberta_base_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
roberta_large_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
def create_ensemble_dataset(df, nli_tokenizer=roberta_large_tokenizer, sim_tokenizer=roberta_base_tokenizer, batch_size=BATCH_SIZE):
    """Tokenize premise/hypothesis pairs and wrap them in a batched tf.data.Dataset.

    Args:
        df: DataFrame with a 'premise' and a 'hypothesis' column.
        nli_tokenizer: Tokenizer applied to each (premise, hypothesis) pair
            jointly; produces the '*_nli' features.
        sim_tokenizer: Tokenizer applied to premises and hypotheses
            separately; produces the '*_a' (premise) and '*_b' (hypothesis)
            features.
        batch_size: Number of rows per batch in the returned dataset.

    Returns:
        A batched, prefetched tf.data.Dataset whose elements are
        (features, None) tuples, where features is a dict with keys
        'input_ids_nli', 'attention_mask_nli', 'input_ids_a',
        'attention_mask_a', 'input_ids_b' and 'attention_mask_b'.

    Raises:
        KeyError: If df lacks the 'premise' or 'hypothesis' column.
    """
    # Missing cells come out of pandas as float NaN, which the tokenizers
    # reject. Coerce every value to str (NaN -> '') so each row is still
    # tokenized and receives a prediction.
    premises = df['premise'].fillna('').astype(str).tolist()
    hypotheses = df['hypothesis'].fillna('').astype(str).tolist()

    # Use the tokenizer passed by the caller; previously this call ignored
    # `nli_tokenizer` and always used the global roberta-large tokenizer.
    inputs_nli = nli_tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        return_tensors="tf"
    )

    # Tokenize each set of sentences separately
    inputs_a = sim_tokenizer(
        premises,
        padding=True,
        truncation=True,
        return_tensors='np'
    )

    inputs_b = sim_tokenizer(
        hypotheses,
        padding=True,
        truncation=True,
        return_tensors='np'
    )

    features = {
        'input_ids_nli': inputs_nli['input_ids'],
        'attention_mask_nli': inputs_nli['attention_mask'],
        'input_ids_a': inputs_a['input_ids'],
        'attention_mask_a': inputs_a['attention_mask'],
        'input_ids_b': inputs_b['input_ids'],
        'attention_mask_b': inputs_b['attention_mask']
    }

    # The second tuple element is None: no labels are supplied here.
    dataset = tf.data.Dataset.from_tensor_slices((features, None))

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [13]:
# Build the inference dataset using the default tokenizers and batch size.
test_ensemble_dataset = create_ensemble_dataset(df=test_set)

# Run model predictions

In [None]:
# Run the ensemble over the whole test dataset, then take the index of the
# highest score along the last axis as the predicted label.
# NOTE(review): assumes the model emits one score per class -- confirm.
predictions = ensemble_model.predict(test_ensemble_dataset)
prediction_labels = np.argmax(predictions, axis=-1)



# Output Predictions

In [None]:
# Write the predicted labels to disk as a single 'prediction' column,
# without the DataFrame index.
columns = ["prediction"]
df = pd.DataFrame(data=prediction_labels, columns=columns)
df.to_csv(path_or_buf=OUTPUT_CSV_FILE, index=False)