# COMP34812 Natural Language Understanding Coursework

Low-key lemmatising and stemming.


## Install required packages 

In [None]:
!pip install pandas nltk numpy matplotlib

In [None]:
import pandas as pd
import regex as re
import numpy as np
import nltk

# Tokeniser models and the stop-word list used by the text-cleaning step.
nltk.download('punkt')
nltk.download('stopwords')
# nltk.stem.WordNetLemmatizer (instantiated further down) looks words up in the
# WordNet corpus; without this download a fresh environment raises LookupError.
nltk.download('wordnet')
# Recent NLTK releases load the tokeniser tables from 'punkt_tab' instead of 'punkt'.
nltk.download('punkt_tab')

from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zaccu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zaccu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load dataset

In [None]:
# Read the development split; 'dev.csv' is resolved relative to the working directory.
dev_set = pd.read_csv('dev.csv')
# Preview the first rows as the cell output.
dev_set.head()

Unnamed: 0,premise,hypothesis,label
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0
4,I know that many of you are interested in addr...,The problems must be addressed,1


In [None]:
# Read the training split; 'train.csv' is resolved relative to the working directory.
train_set = pd.read_csv('train.csv')
# Preview the first rows as the cell output.
train_set.head()

Unnamed: 0,premise,hypothesis,label
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1


In [None]:
stop_words = nltk.corpus.stopwords.words('english')
# Frozen copy for O(1) membership tests; `stop_words` itself stays a list so any
# other code that uses it keeps working.
_stop_word_set = frozenset(stop_words)
# Requires the NLTK 'wordnet' corpus to be available locally.
lemmatizer = nltk.stem.WordNetLemmatizer()

def clean_text(text):
    """Turn one raw sentence into a list of lemmatised content tokens.

    Steps: lower-case, replace every character that is neither a word
    character nor whitespace with a space, tokenise with NLTK, drop English
    stop words, lemmatise, and drop tokens shorter than two characters.

    Args:
        text: The raw sentence. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents an empty CSV cell)
            are treated as "no text".

    Returns:
        list[str]: The cleaned tokens, in their original order. Empty when
        the input is missing or nothing survives the filtering.
    """
    # A missing cell would otherwise become the literal token "nan" via str().
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()

    # Punctuation becomes a space, so "don't" splits into "don" and "t".
    text = re.sub(r'[^\w\s]', ' ', text)

    processed = []
    for word in nltk.word_tokenize(text):
        # Stop words are filtered on the surface form, before lemmatisation.
        if word in _stop_word_set:
            continue

        word = lemmatizer.lemmatize(word).strip()

        # Single characters (stray letters, digits) carry little signal.
        if len(word) < 2:
            continue

        processed.append(word)

    return processed

In [None]:
# Add a '<column>_tokens' column holding the cleaned token list for each text
# column, first on the dev split and then on the train split.
for frame in (dev_set, train_set):
    for column in ('premise', 'hypothesis'):
        frame[column + '_tokens'] = frame[column].apply(clean_text)

In [None]:
dev_set.head()

Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0,"[starting, soft, underbelly, 16, 000, troop, g...","[general, nelson, mile, 30, 000, troop, attack]"
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1,"[class, broken, light, sweat, gasping, air]","[class, grew, tense, time, went]"
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1,"[samson, famous, haircut, would, find, hard, r...","[unknown, exactly, within, town, samson, recei..."
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0,"[man, black, shirt, hold, baby, blue, shirted,...","[darkly, dressed, man, pass, cry, baby, man, l..."
4,I know that many of you are interested in addr...,The problems must be addressed,1,"[know, many, interested, addressing, issue, le...","[problem, must, addressed]"


In [None]:
train_set.head()

Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1,"[yeah, know, cut, california, half, something]","[yeah, sure, make, fit, maybe, could, cut, cal..."
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1,"[actual, name, used]","[sake, privacy, actual, name, used]"
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1,"[film, directed, randall, wallace]","[film, directed, randall, wallace, star, mel, ..."
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1,"[know, sign, anse, studied, unkempt, clean, re...","[anse, looked, cracked, mirror]"
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1,"[light, candle, cheek, looked, even, hollow, t...","[drew, regarded, best, friend, noted, light, l..."


## Dataset analysis

In [None]:
# Labels = dev_set['label'].unique()
# Labels

# def get_word_frequency(data):
#     word_freq = {}
#     for row in data:
#         for word in row:
#             if word in word_freq:
#                 word_freq[word] += 1
#             else:
#                 word_freq[word] = 1
#     return word_freq

# word_freq = get_word_frequency(train_set['premise_tokens'] + train_set['hypothesis'])

# # nltk FreqDist
# from nltk import FreqDist

# fdist = FreqDist(word_freq)
# fdist

# Embeddings / vectorization

In [None]:
glove = "./glove_embeddings/glove.6B.300d.txt"
def load_glove(glove_file):
    """Load GloVe-style text embeddings into a ``{word: vector}`` dictionary.

    Each non-blank line of the file is expected to hold a word followed by
    its whitespace-separated vector components.

    Args:
        glove_file: Path to the embeddings text file.

    Returns:
        dict[str, numpy.ndarray]: Maps each word to a 1-D ``float32`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a number.
    """
    word_to_vec = {}
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) cannot
    # decode every token in the embedding vocabulary.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            word, values = fields[0], fields[1:]
            # Parse to numbers here; keeping the raw strings makes later
            # numeric reductions such as np.mean fail with a dtype TypeError.
            word_to_vec[word] = np.array([float(v) for v in values], dtype=np.float32)
    return word_to_vec
loaded_glove = load_glove(glove)
embedding_dim = 300


In [None]:
def sentence_embedding(tokens, embeddings_dict, embedding_dim=300):
    """Average the vectors of all in-vocabulary tokens into one sentence vector.

    Args:
        tokens: Iterable of token strings.
        embeddings_dict: Mapping from token to its vector. Vectors may be
            numeric arrays/lists or sequences of numeric strings.
        embedding_dim: Length of the zero vector returned when no token is
            found in ``embeddings_dict``.

    Returns:
        numpy.ndarray: 1-D float array; the element-wise mean of the found
        vectors, or ``np.zeros(embedding_dim)`` if none were found.
    """
    known_vectors = [embeddings_dict[token] for token in tokens if token in embeddings_dict]

    if not known_vectors:
        # Out-of-vocabulary sentence: fall back to a zero vector.
        return np.zeros(embedding_dim)

    # Cast explicitly before reducing: vectors read from a text file can still
    # be sequences of numeric strings, which np.mean cannot add together.
    stacked = np.asarray(known_vectors, dtype=np.float64)
    return stacked.mean(axis=0)

def pairwise_embedding(premise_tokens, hypothesis_tokens, embeddings_dict, embedding_dim=300):
    """Build one feature vector for a premise/hypothesis pair.

    Both sentences are embedded with ``sentence_embedding`` and combined as
    ``[p, h, |p - h|, p * h]``.

    Args:
        premise_tokens: Tokens of the premise.
        hypothesis_tokens: Tokens of the hypothesis.
        embeddings_dict: Mapping from token to its vector.
        embedding_dim: Dimensionality of the vectors in ``embeddings_dict``.
            It is forwarded so that the zero-vector fallback for a sentence
            with no known tokens has the same length as real embeddings;
            otherwise the element-wise operations below fail for
            embeddings that are not 300-dimensional.

    Returns:
        numpy.ndarray: 1-D array of length ``4 * embedding_dim``.
    """
    premise_emb = sentence_embedding(premise_tokens, embeddings_dict, embedding_dim)
    hypothesis_emb = sentence_embedding(hypothesis_tokens, embeddings_dict, embedding_dim)

    # Concatenate the raw embeddings with simple pair-interaction features.
    combined_emb = np.concatenate([
        premise_emb,
        hypothesis_emb,
        np.abs(premise_emb - hypothesis_emb),  # element-wise distance
        premise_emb * hypothesis_emb,          # element-wise interaction
    ])

    return combined_emb

    

In [None]:
from tqdm import tqdm

# Registers `.progress_apply` on pandas objects.
tqdm.pandas()

# Relies on `loaded_glove` having been populated by the GloVe loading cell.
# One pair-feature vector per row, for the train split and then the dev split.
for frame in (train_set, dev_set):
    frame['combined_embedding'] = frame.progress_apply(
        lambda row: pairwise_embedding(
            row['premise_tokens'], row['hypothesis_tokens'], loaded_glove
        ),
        axis=1,
    )


  0%|          | 1/24432 [00:00<34:21, 11.85it/s]


TypeError: the resolved dtypes are not compatible with add.reduce. Resolved (dtype('<U11'), dtype('<U11'), dtype('<U22'))

In [None]:
train_set.head()

# Traditional Approach

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# scikit-learn needs a 2-D (n_samples, n_features) matrix; the DataFrame column
# is a Series whose elements are 1-D arrays, so stack them row-wise first.
X_train = np.vstack(train_set['combined_embedding'].to_numpy())
y_train = train_set['label'].to_numpy()
X_dev = np.vstack(dev_set['combined_embedding'].to_numpy())
y_dev = dev_set['label'].to_numpy()

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate on validation set
preds = clf.predict(X_dev)

# classification_report raises when target_names does not match the number of
# classes present, so the names are only used when the counts agree.
# NOTE(review): confirm the label -> name mapping against the dataset spec.
class_names = ['entailment', 'neutral', 'contradiction']
n_classes = len(np.unique(np.concatenate([y_dev, preds])))
report_kwargs = {'target_names': class_names} if n_classes == len(class_names) else {}
print(classification_report(y_dev, preds, **report_kwargs))
