# COMP34812 Natural Language Understanding Coursework: Transformer-based approach with light lemmatization and stemming

In [19]:
!pip install transformers datasets pandas nltk numpy scikit_learn --quiet
!pip install -U tensorflow --quiet

In [20]:
import pandas as pd
import nltk
import os
import regex as re
import numpy as np
# import keras
import tensorflow as tf
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

from nltk.corpus import stopwords
from transformers import TFBertModel, BertTokenizer
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Sequence-length limit for encoded inputs (presumably counted in word-piece tokens).
max_sequence_length = 512

# One checkpoint name drives both loads so the tokenizer vocabulary and the
# encoder weights come from the same pretrained model.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [4]:
# Look up the special tokens defined by the loaded tokenizer and display them.
special = tokenizer.special_tokens_map
special

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

## Load and Clean Data

In [5]:
from huggingface_hub import snapshot_download

# Download the coursework dataset repository into the current working directory.
# Set HF_TOKEN in your environment before running this cell.
# The call is the cell's last expression, so the returned local path is displayed.
snapshot_download(
    repo_id="aap9002/NLU-Coursework",
    repo_type="dataset",
    allow_patterns="*",  # match every file in the repository
    local_dir='./',
)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/4.26M [00:00<?, ?B/s]

dev.csv:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

'/content'

In [6]:
# Load the development split from the working directory and preview its first rows.
dev_set = pd.read_csv('dev.csv')
dev_set.head()

Unnamed: 0,premise,hypothesis,label
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0
4,I know that many of you are interested in addr...,The problems must be addressed,1


In [7]:
# Load the training split from the working directory and preview its first rows.
train_set = pd.read_csv('train.csv')
train_set.head()

Unnamed: 0,premise,hypothesis,label
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1


In [8]:
# Shared resources for the text-cleaning step: a WordNet lemmatizer and the
# NLTK English stop-word list.
lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words = stopwords.words('english')

def clean_text(text):
    """Normalise raw text into a list of lemmatized content words.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, and the result is tokenized with
    NLTK. Stop words are dropped, the remaining tokens are lemmatized, and
    tokens shorter than two characters are discarded.

    Args:
        text: The raw text. Non-string objects are converted with ``str``;
            ``None`` and float NaN are treated as missing values.

    Returns:
        list[str]: The cleaned tokens in their original order, or an empty
        list when the input is missing.
    """
    # Missing values would otherwise be stringified to "none"/"nan", pass all
    # the filters below and end up in the model input as a spurious word.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text)

    text = text.lower()

    # Replace punctuation/symbols with spaces so adjoining words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)

    text = nltk.word_tokenize(text)

    processed = []
    for word in text:
        # NOTE(review): the NLTK English stop-word list includes negations such
        # as "not" and "no"; confirm that dropping them is intended for this task.
        if word in stop_words:
            continue

        word = lemmatizer.lemmatize(word)

        word = word.strip()

        # Skip single-character leftovers (e.g. fragments of split contractions).
        if len(word) < 2:
            continue

        processed.append(word)

    return processed

In [9]:
# Add a '<column>_cleaned' token-list column for both text fields of each split
# (dev first, then train; premise before hypothesis).
for frame in (dev_set, train_set):
    for column in ('premise', 'hypothesis'):
        frame[f'{column}_cleaned'] = frame[column].apply(clean_text)

In [10]:
# Preview the dev split again to inspect the newly added cleaned-token columns.
dev_set.head()

Unnamed: 0,premise,hypothesis,label,premise_cleaned,hypothesis_cleaned
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0,"[starting, soft, underbelly, 16, 000, troop, g...","[general, nelson, mile, 30, 000, troop, attack]"
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1,"[class, broken, light, sweat, gasping, air]","[class, grew, tense, time, went]"
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1,"[samson, famous, haircut, would, find, hard, r...","[unknown, exactly, within, town, samson, recei..."
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0,"[man, black, shirt, hold, baby, blue, shirted,...","[darkly, dressed, man, pass, cry, baby, man, l..."
4,I know that many of you are interested in addr...,The problems must be addressed,1,"[know, many, interested, addressing, issue, le...","[problem, must, addressed]"


## Tokenize with the BERT Tokenizer

In [11]:
def create_input(premise_tokens, hypothesis_tokens, max_length=max_sequence_length):
    """Encode one premise/hypothesis pair as a sentence-pair tokenizer input.

    Each token list is joined with single spaces and the two strings are
    passed to the tokenizer as a text pair.

    Args:
        premise_tokens: Iterable of cleaned premise tokens (strings).
        hypothesis_tokens: Iterable of cleaned hypothesis tokens (strings).
        max_length: Length every encoding is padded or truncated to. Defaults
            to the notebook-level ``max_sequence_length`` so the limit is
            explicit instead of relying on the tokenizer's built-in default.

    Returns:
        The tokenizer output for the pair, holding TensorFlow tensors
        (``return_tensors='tf'``).
    """
    return tokenizer(
        ' '.join(premise_tokens),
        ' '.join(hypothesis_tokens),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )

def _encode_row(row):
    """Tokenize the cleaned premise and hypothesis of a single DataFrame row."""
    return create_input(row['premise_cleaned'], row['hypothesis_cleaned'])


# One tokenizer output per row, for each split.
dev_embeddings = dev_set.apply(_encode_row, axis=1)
train_embeddings = train_set.apply(_encode_row, axis=1)

In [12]:
# Preview the per-row tokenizer outputs for the dev split.
dev_embeddings.head()

Unnamed: 0,0
0,"[input_ids, token_type_ids, attention_mask]"
1,"[input_ids, token_type_ids, attention_mask]"
2,"[input_ids, token_type_ids, attention_mask]"
3,"[input_ids, token_type_ids, attention_mask]"
4,"[input_ids, token_type_ids, attention_mask]"


## Model Design

In [13]:

def _token_input(name):
    """Create a named, variable-length int32 Keras input."""
    return tf.keras.Input(shape=(None,), dtype=tf.int32, name=name)


# Three symbolic inputs, named after the keys of the tokenizer output.
input_ids = _token_input('input_ids')
attention_mask = _token_input('attention_mask')
token_type_ids = _token_input('token_type_ids')

# Run the pretrained encoder and keep its pooled output for classification.
bert_outputs = bert_model(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)
pooled_output = bert_outputs.pooler_output

# Classification head: dropout -> 128-unit ReLU layer -> dropout -> 2-way softmax.
x = tf.keras.layers.Dropout(0.3)(pooled_output)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
# Despite the variable name, the softmax activation makes these class
# probabilities (two classes); change the unit count if the label set changes.
logits = tf.keras.layers.Dense(2, activation='softmax')(x)

# Assemble the end-to-end model from the three inputs to the class probabilities.
model = tf.keras.Model(
    inputs=[input_ids, attention_mask, token_type_ids],
    outputs=logits,
)

# Integer labels + probability outputs -> sparse categorical cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the layer overview.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 token_type_ids (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                              

In [14]:
# Display the Series of per-row tokenizer outputs for the training split.
train_embeddings

Unnamed: 0,0
0,"[input_ids, token_type_ids, attention_mask]"
1,"[input_ids, token_type_ids, attention_mask]"
2,"[input_ids, token_type_ids, attention_mask]"
3,"[input_ids, token_type_ids, attention_mask]"
4,"[input_ids, token_type_ids, attention_mask]"
...,...
24427,"[input_ids, token_type_ids, attention_mask]"
24428,"[input_ids, token_type_ids, attention_mask]"
24429,"[input_ids, token_type_ids, attention_mask]"
24430,"[input_ids, token_type_ids, attention_mask]"


In [15]:
def _stack_encodings(encodings):
    """Stack per-row tokenizer outputs into one array per input name.

    Args:
        encodings: Iterable of tokenizer outputs whose ``.data`` attribute is
            a mapping from input name to a per-row array-like.

    Returns:
        dict: Input name -> array built by vertically stacking that entry of
        every row, in row order. Keys follow the first row's ordering.
    """
    per_row = [encoding.data for encoding in encodings]
    return {key: np.vstack([row[key] for row in per_row]) for key in per_row[0]}


# Features: one stacked array per model input, for each split.
train_x = _stack_encodings(train_embeddings)
dev_x = _stack_encodings(dev_embeddings)

# Labels: the column values are already one-dimensional, so convert them
# directly. (Reshaping to a column and squeezing it back without an axis would
# collapse a single-row split to a 0-d scalar.)
train_y = tf.convert_to_tensor(train_set['label'].values)
dev_y = tf.convert_to_tensor(dev_set['label'].values)

In [16]:
# Train for 3 epochs with batches of 32, evaluating on the dev split after each epoch.
model.fit(train_x, train_y, epochs=3, batch_size=32, validation_data=(dev_x, dev_y))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7bf39de35c50>

In [17]:
# Inspect the training label tensor (shape, dtype and a sample of values).
train_y

<tf.Tensor: shape=(24432,), dtype=int64, numpy=array([1, 1, 1, ..., 0, 1, 1])>

## Model Evaluation

In [21]:
# Score the dev split with the trained model.
predictions = model.predict(dev_x, batch_size=32)

# Each output row holds per-class scores (probabilities when the final layer
# uses softmax); the predicted label is the index of the largest score.
pred_labels = predictions.argmax(axis=1)

# Per-class precision / recall / F1 against the gold dev labels.
report = classification_report(dev_y, pred_labels)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.68      0.72      3258
           1       0.73      0.80      0.76      3478

    accuracy                           0.74      6736
   macro avg       0.75      0.74      0.74      6736
weighted avg       0.74      0.74      0.74      6736

