# COMP34812 Natural Language Understanding Coursework

Note: low-key lemmatisation and stemming.


## Install required packages

In [1]:
!pip install pandas nltk numpy matplotlib



In [4]:
import pandas as pd
import regex as re
import numpy as np
import nltk
import os
# NLTK resources used later in the notebook:
#   punkt / punkt_tab -> nltk.word_tokenize (recent NLTK releases look up punkt_tab)
#   stopwords         -> the English stop-word list
#   wordnet           -> nltk.stem.WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download the GloVe 6B archive and extract it into glove_embeddings/.
# The existence check skips both steps when the directory is already present,
# so re-running the cell does not fetch the archive again.
if not os.path.exists('glove_embeddings'):
  !wget https://nlp.stanford.edu/data/glove.6B.zip
  !unzip glove.6B.zip -d glove_embeddings

--2025-03-11 15:33:18--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-11 15:33:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


## Load dataset

In [None]:
# Read dev.csv into dev_set and preview its first rows.
dev_set = pd.read_csv('dev.csv')
dev_set.head()

In [None]:
# Read train.csv into train_set and preview its first rows.
train_set = pd.read_csv('train.csv')
train_set.head()

In [None]:
# Stored as a set so the per-token `word in stop_words` check in clean_text is
# a constant-time lookup instead of a scan over the whole stop-word list.
stop_words = set(nltk.corpus.stopwords.words('english'))
# Shared lemmatizer instance reused for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()

def clean_text(text):
    """Turn a raw sentence into a list of cleaned, lemmatised tokens.

    Steps: lower-case, replace every character that is neither a word
    character nor whitespace with a space, tokenise with
    ``nltk.word_tokenize``, drop stop words, lemmatise, and discard tokens
    shorter than two characters.

    Args:
        text: The raw value from a text column. Non-string values are
            converted with ``str``; ``None`` and float NaN are treated as
            missing.

    Returns:
        list[str]: The surviving tokens in their original order. Missing
        input yields an empty list.
    """
    # Missing cells arrive as None or float NaN. str(float('nan')) is 'nan',
    # which would otherwise survive every filter and become a real token.
    if text is None or (isinstance(text, float) and text != text):
        return []

    normalised = re.sub(r'[^\w\s]', ' ', str(text).lower())

    processed = []
    for word in nltk.word_tokenize(normalised):
        if word in stop_words:
            continue

        word = lemmatizer.lemmatize(word).strip()

        # Single characters carry little signal once punctuation is removed.
        if len(word) < 2:
            continue

        processed.append(word)

    return processed

In [None]:
# Tokenise both text columns of each split with clean_text, storing the
# result next to the source column (dev first, then train).
for frame in (dev_set, train_set):
    for source_col, token_col in (
        ('premise', 'premise_tokens'),
        ('hypothesis', 'hypothesis_tokens'),
    ):
        frame[token_col] = frame[source_col].apply(clean_text)

In [None]:
# Preview dev_set, which now includes the premise_tokens / hypothesis_tokens columns.
dev_set.head()

In [None]:
# Preview train_set, which now includes the premise_tokens / hypothesis_tokens columns.
train_set.head()

## Dataset analysis

In [None]:
# Labels = dev_set['label'].unique()
# Labels

# def get_word_frequency(data):
#     word_freq = {}
#     for row in data:
#         for word in row:
#             if word in word_freq:
#                 word_freq[word] += 1
#             else:
#                 word_freq[word] = 1
#     return word_freq

# word_freq = get_word_frequency(train_set['premise_tokens'] + train_set['hypothesis_tokens'])

# # nltk FreqDist
# from nltk import FreqDist

# fdist = FreqDist(word_freq)
# fdist

# Embeddings / vectorization

In [None]:
# Path to the 200-dimensional GloVe 6B vectors inside the glove_embeddings/ directory.
glove = "./glove_embeddings/glove.6B.200d.txt"
def load_glove(glove_file):
    """Load word vectors from a GloVe-style text file.

    Every non-empty line is split on whitespace; the first field is taken as
    the token and the remaining fields as its vector components.

    Args:
        glove_file: Path to the UTF-8 encoded vectors file.

    Returns:
        dict: Maps each token (str) to a 1-D ``np.float32`` array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a number.
    """
    embeddings_dict = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines have no token; indexing
            # values[0] on them would raise IndexError, so skip them.
            if not values:
                continue
            word, *components = values
            # float32 halves the memory footprint compared with float64.
            embeddings_dict[word] = np.asarray(components, dtype=np.float32)
    return embeddings_dict

# Vector length passed to the embedding helpers; it has to agree with the
# dimensionality of the file named in `glove` (the 200d file).
embedding_dim = 200
# Token -> vector dictionary built by load_glove from that file.
loaded_glove = load_glove(glove)



In [None]:
def sentence_embedding(tokens, embeddings_dict, embedding_dim):
    """Average the embeddings of the tokens that have a known vector.

    Args:
        tokens: Iterable of token strings.
        embeddings_dict: Mapping from token to its embedding vector.
        embedding_dim: Length of the zero vector returned when none of the
            tokens is present in ``embeddings_dict``.

    Returns:
        np.ndarray: Element-wise mean of the found vectors, or
        ``np.zeros(embedding_dim)`` when no token is found.
    """
    # Tokens without an embedding are skipped rather than zero-filled so
    # they do not drag the mean towards the origin.
    valid_embeddings = [embeddings_dict[token] for token in tokens if token in embeddings_dict]

    if not valid_embeddings:
        # Return zero-vector if no embeddings found
        return np.zeros(embedding_dim)

    return np.mean(valid_embeddings, axis=0)

def pairwise_embedding(premise_tokens, hypothesis_tokens, embeddings_dict,embedding_dim):
    """Build one feature vector for a premise/hypothesis pair.

    Both token lists are embedded with ``sentence_embedding`` and combined
    as ``[p, h, |p - h|, p * h]``.

    Args:
        premise_tokens: Tokens of the premise.
        hypothesis_tokens: Tokens of the hypothesis.
        embeddings_dict: Mapping from token to its embedding vector.
        embedding_dim: Embedding length, forwarded to ``sentence_embedding``.

    Returns:
        np.ndarray: 1-D ``np.float32`` array made of the four parts
        concatenated in the order above.
    """
    premise_emb = sentence_embedding(premise_tokens, embeddings_dict, embedding_dim)
    hypothesis_emb = sentence_embedding(hypothesis_tokens, embeddings_dict, embedding_dim)

    # Concatenate multiple useful features
    combined_emb = np.concatenate([
        premise_emb,
        hypothesis_emb,
        np.abs(premise_emb - hypothesis_emb),  # capture difference
        premise_emb * hypothesis_emb           # capture interactions
    ]).astype(np.float32)

    return combined_emb



In [None]:
from tqdm import tqdm
# Registers DataFrame.progress_apply, which behaves like apply but renders a
# progress bar while the rows are processed.
tqdm.pandas()


def _embed_pairs(frame):
    """Return one combined premise/hypothesis feature vector per row of `frame`."""
    return frame.progress_apply(
        lambda row: pairwise_embedding(
            row['premise_tokens'], row['hypothesis_tokens'], loaded_glove, embedding_dim
        ),
        axis=1,
    )


# Make sure premise_tokens and hypothesis_tokens columns contain lists of tokens
train_set['combined_embedding'] = _embed_pairs(train_set)
dev_set['combined_embedding'] = _embed_pairs(dev_set)


In [None]:
# Preview train_set to check the new combined_embedding column.
train_set.head()

# Traditional Approach

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Each row of the combined_embedding column holds one 1-D feature vector.
# scikit-learn estimators expect a 2-D (n_samples, n_features) matrix, so the
# per-row vectors are stacked before fitting / predicting; passing the Series
# of arrays directly is rejected.
X_train = np.stack(train_set['combined_embedding'].to_numpy())
X_dev = np.stack(dev_set['combined_embedding'].to_numpy())

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, train_set['label'])

# Evaluate on validation set
preds = clf.predict(X_dev)
# NOTE(review): target_names assumes exactly three label classes, listed in
# sorted-label order — confirm against the values in the label column.
print(classification_report(dev_set['label'], preds, target_names=['entailment', 'neutral', 'contradiction']))
