# Demo for Solution B – BiLSTM Model

 Instructions for Using This Script

1. Global Parameters  
   Do not modify any of the global configuration parameters unless you're sure of what you're doing. These are fixed for consistent model behavior.

2. Environment Setup  
   Install all required packages by running the installation cell below.  
   Note: You may need to restart your environment after installation due to underlying dependency changes.

3. User Data Input  
   In the 'Input file' section, update the `USER_PATH` variable with the path to your own CSV file.  
   The CSV must contain `premise` and `hypothesis` columns.

4. Run All Cells  
   Once the path is set and dependencies are installed, run all cells in order.  
   This will generate a `predictions.csv` file containing binary predictions (0 for "Not Entailed", 1 for "Entailed").



# Global Parameters

In [None]:
# Path of the GloVe vectors file that get_predictions() loads.
GLOVE_PATH = 'glove.6B.300d.txt'
# NOTE(review): MODEL_PATH and TEST_PATH are not referenced anywhere in the
# visible notebook — confirm whether they are still needed.
MODEL_PATH = 'bilstm_model.pt'
TEST_PATH = 'test.csv'
# Destination CSV for the binary predictions written by the last cell.
OUTPUT_PATH = 'predictions.csv'

# Dimensionality passed as `dim` to the embedding helpers.
EMBEDDING_DIM = 300
# Seed applied to random, NumPy and TensorFlow inside get_predictions().
seed_value = 42
# Number of token positions each premise/hypothesis is padded or truncated to.
max_len = 80



# Requirements

In [None]:
!pip install  tensorflow  pandas nltk numpy matplotlib scikit-learn sentencepiece tokenizers --quiet
!pip install -U spacy[cuda12x] --quiet
!python -m spacy download en_core_web_sm --quiet
!pip install -q gdown --quiet
#May need to restart run time in notebook/ goole collab due to underlying depencie chnages
#--quiet used to reduce output. can be removed for sanity checks

In [None]:
import pandas as pd
import regex as re
import numpy as np
import nltk
import os
import tensorflow as tf
import spacy
import gdown
import random
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import snapshot_download

# Fetch the NLTK tokenizer resources, in the same order as before.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
from nltk.tokenize import word_tokenize

# Input file

In [None]:
# CSV file passed to get_predictions(); it is read with pandas and must
# provide `premise` and `hypothesis` columns.
USER_PATH = 'dev.csv' # change this to the path of your own CSV file

# Download From Cloud

In [None]:
# GloVe Embeddings (300D)
glove_id = "1iVUBiXUgN__xN_x0usyXt_otb_RWAenZ"
# Reuse GLOVE_PATH so the downloaded file is exactly the one get_predictions() opens,
# instead of repeating the file name as a second literal that could drift.
glove_output = GLOVE_PATH
# Skip the download when the file is already present locally.
if not os.path.exists(glove_output):
    gdown.download(f"https://drive.google.com/uc?id={glove_id}", glove_output, quiet=False)

# Pull every file of the model repository into the current working directory.
snapshot_download(
    repo_id="aap9002/NLI-BILSTM",
    allow_patterns="*",
    local_dir='./',
)

# Cleaning


In [None]:
def clean_text(text):
    """Normalise a raw sentence and split it into tokens.

    The input is converted to a lower-cased string, every character other than
    ``a-z``, ``0-9``, ``,.!?'`` and whitespace is replaced by a space, runs of
    whitespace are collapsed, and the result is tokenized with NLTK.
    No stop-word removal or lemmatization is applied.

    Args:
        text: Value to clean; it is passed through ``str()`` first.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    lowered = str(text).lower()

    # Replace everything except basic punctuation, letters, digits and whitespace.
    filtered = re.sub(r"[^a-z0-9,.!?'\s]", ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', filtered).strip()

    return nltk.word_tokenize(collapsed)

# GloVe embeddings

In [None]:
# NOTE(review): this path differs from GLOVE_PATH, and the module-level `glove`
# is not read anywhere in the visible notebook (get_predictions() builds its own
# local `glove`) — confirm whether this assignment is still needed.
glove = f"./glove_embeddings/glove.6B.{EMBEDDING_DIM}d.txt"
def load_glove(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is expected to hold a token followed by its vector
    components. Blank or whitespace-only lines are skipped.

    Args:
        glove_file: Path of the text file; it is read as UTF-8.

    Returns:
        dict mapping each token to a 1-D ``np.float32`` array.

    Raises:
        ValueError: If a component cannot be converted to a float.
    """
    embeddings_dict = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line has no token; indexing values[0] would raise IndexError.
            if not values:
                continue
            word, *components = values
            # float32 halves the memory footprint compared with NumPy's default float64.
            embeddings_dict[word] = np.asarray(components, dtype=np.float32)
    return embeddings_dict

# Sequence embeddings

In [None]:
def tokens_to_embedding_sequence(tokens, glove, dim):
    """Map each token to its embedding vector, using zeros for unknown tokens.

    Args:
        tokens: Iterable of token strings.
        glove: Mapping from token to embedding vector.
        dim: Length of the zero vector substituted for tokens missing from ``glove``.

    Returns:
        list with one vector per token, in input order. All unknown tokens share
        the same zero array, so treat the returned vectors as read-only.
    """
    # Build the out-of-vocabulary vector once instead of once per token
    # (the default argument of dict.get is evaluated even when the key exists).
    unknown = np.zeros(dim, dtype=np.float32)
    return [glove.get(tok, unknown) for tok in tokens]
def create_sequence_embedding(row, glove, dim, max_len):
    """Build fixed-size embedding tensors for the premise and hypothesis tokens.

    Each token list is truncated to its first ``max_len`` tokens and padded with
    zero vectors at the end. Tokens missing from ``glove`` are also zero vectors.
    Empty token lists and an empty frame are handled: the former yields an
    all-zero ``(max_len, dim)`` slice, the latter arrays with a leading
    dimension of 0.

    Args:
        row: DataFrame with ``premise_tokens`` and ``hypothesis_tokens`` columns,
            each cell holding a list of tokens. (Despite its name, this is the
            whole frame, not a single row.)
        glove: Mapping from token to a vector of length ``dim``.
        dim: Embedding dimensionality.
        max_len: Number of token positions per sequence.

    Returns:
        Tuple ``(premise_seqs, hypothesis_seqs)`` of ``float32`` arrays, each of
        shape ``(len(row), max_len, dim)``.
    """
    def _embed_column(token_lists):
        # Pre-allocating zeros gives padding and unknown tokens for free and keeps
        # the shape fixed even when a token list is empty.
        out = np.zeros((len(token_lists), max_len, dim), dtype=np.float32)
        for seq_idx, tokens in enumerate(token_lists):
            for pos, tok in enumerate(tokens[:max_len]):
                vector = glove.get(tok)
                if vector is not None:
                    out[seq_idx, pos] = vector
        return out

    premise_seqs = _embed_column(row['premise_tokens'].tolist())
    hypothesis_seqs = _embed_column(row['hypothesis_tokens'].tolist())

    return premise_seqs, hypothesis_seqs

# Numerical Vectorization

In [None]:
def sentence_embedding(tokens, glove, dim):
    """Average the embeddings of the tokens that are present in ``glove``.

    Args:
        tokens: Iterable of token strings.
        glove: Mapping from token to embedding vector.
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the known token vectors, or ``np.zeros(dim)``
        when none of the tokens is in ``glove``.
    """
    known_vectors = [vec for vec in map(glove.get, tokens) if vec is not None]
    if len(known_vectors) == 0:
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    A small constant (1e-5) is added to the denominator, so zero vectors give
    0.0 instead of a division by zero and identical vectors score slightly
    below 1.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / (norm_product + 1e-5)



def extract_verbs(doc):
    """Return the set of lemmas of all tokens in ``doc`` whose ``pos_`` is ``"VERB"``."""
    return {tok.lemma_ for tok in doc if tok.pos_ == "VERB"}

def precompute_ner_and_verbs(df, nlp):
    """Compute entity-overlap and verb-overlap scores for every row of ``df``.

    Both text columns are run through ``nlp.pipe`` in batches of 256. For each
    premise/hypothesis pair the score is ``|A & B| / (|A | B| + 1e-5)``, where
    the sets hold lower-cased entity texts or verb lemmas respectively.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns.
        nlp: Object exposing ``pipe(texts, batch_size=...)`` that yields documents
            with an ``ents`` attribute and iterable tokens (e.g. a spaCy pipeline).

    Returns:
        Tuple ``(ner_features_list, verb_features_list)``: two lists of floats,
        one entry per row.
    """
    def _overlap(left, right):
        # The epsilon keeps the ratio defined when both sets are empty.
        return len(left & right) / (len(left | right) + 1e-5)

    premise_docs = list(nlp.pipe(df['premise'].tolist(), batch_size=256))
    hypothesis_docs = list(nlp.pipe(df['hypothesis'].tolist(), batch_size=256))

    ner_features_list = []
    verb_features_list = []

    for premise_doc, hypothesis_doc in zip(premise_docs, hypothesis_docs):
        premise_ents = {ent.text.lower() for ent in premise_doc.ents}
        hypothesis_ents = {ent.text.lower() for ent in hypothesis_doc.ents}
        ner_features_list.append(_overlap(premise_ents, hypothesis_ents))

        premise_verbs = extract_verbs(premise_doc)
        hypothesis_verbs = extract_verbs(hypothesis_doc)
        verb_features_list.append(_overlap(premise_verbs, hypothesis_verbs))

    return ner_features_list, verb_features_list

def prepare_numeric_features_optimized(df, glove, dim, nlp):
    """Assemble the per-row numeric feature matrix.

    Each row of the result is ``[cosine similarity of the mean sentence
    embeddings, entity overlap, verb overlap]``.

    Args:
        df: DataFrame with ``premise``, ``hypothesis``, ``premise_tokens`` and
            ``hypothesis_tokens`` columns.
        glove: Mapping from token to embedding vector.
        dim: Embedding dimensionality.
        nlp: Pipeline object forwarded to ``precompute_ner_and_verbs``.

    Returns:
        NumPy array built from one three-element feature list per row.
    """
    # NER and verb overlaps are computed for the whole frame in one pass.
    ner_scores, verb_scores = precompute_ner_and_verbs(df, nlp)

    # Mean-pooled sentence vectors for both sides of every pair.
    premise_vectors = [sentence_embedding(toks, glove, dim) for toks in df['premise_tokens']]
    hypothesis_vectors = [sentence_embedding(toks, glove, dim) for toks in df['hypothesis_tokens']]

    feature_rows = [
        [cosine_similarity(prem_vec, hyp_vec), ner_score, verb_score]
        for prem_vec, hyp_vec, ner_score, verb_score in zip(
            premise_vectors, hypothesis_vectors, ner_scores, verb_scores
        )
    ]

    return np.array(feature_rows)

# Run Function

In [None]:
def get_predictions(input_csv, model_path='Most_recent_best_esim_model.keras'):
    """Predict binary entailment labels for every usable row of a CSV file.

    Rows whose ``premise`` or ``hypothesis`` is missing or blank are dropped
    before any preprocessing, so the result can be shorter than the input file.
    The number of dropped rows is printed.

    Args:
        input_csv: Path to a CSV file with ``premise`` and ``hypothesis`` columns.
        model_path: Path of the saved Keras model to load.

    Returns:
        1-D integer NumPy array with one label per retained row: 1 where the
        model output is greater than 0.5, otherwise 0. Empty when no row is usable.
    """
    # Seed Python, NumPy and TensorFlow RNGs.
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    # Load the test data
    test_df = pd.read_csv(input_csv)

    # Remove unusable rows BEFORE tokenizing: clean_text() would otherwise turn a
    # missing value into the token "nan" and waste work on rows that are discarded.
    usable = pd.Series(True, index=test_df.index)
    for column in ('premise', 'hypothesis'):
        # astype(str) keeps the .str accessor valid even for non-string columns.
        usable &= test_df[column].notna() & test_df[column].astype(str).str.strip().ne('')
    dropped = len(test_df) - int(usable.sum())
    if dropped:
        print(f"Dropped {dropped} row(s) with a missing or empty premise/hypothesis")

    # Copy so the column assignments below do not write into a slice of the original frame.
    test_df = test_df[usable].copy()
    if test_df.empty:
        # Nothing to predict; skip loading GloVe, the model and spaCy.
        return np.array([], dtype=int)

    # Text columns are cast to str because they are later fed to the spaCy pipeline.
    for column in ('premise', 'hypothesis'):
        test_df[column] = test_df[column].astype(str)

    # Preprocess the text data
    test_df['premise_tokens'] = test_df['premise'].apply(clean_text)
    test_df['hypothesis_tokens'] = test_df['hypothesis'].apply(clean_text)

    # Load GloVe embeddings
    glove = load_glove(GLOVE_PATH)

    # Load the trained model
    # NOTE(review): the default file name differs from the MODEL_PATH global
    # ('bilstm_model.pt') — confirm which artefact is the intended one.
    model = tf.keras.models.load_model(model_path)

    # Create sequence embeddings
    premise_seqs, hypothesis_seqs = create_sequence_embedding(test_df, glove, EMBEDDING_DIM, max_len)

    # Prepare numeric features (cosine similarity, NER overlap, verb overlap)
    nlp = spacy.load("en_core_web_sm")
    numeric_feats = prepare_numeric_features_optimized(test_df, glove, EMBEDDING_DIM, nlp)

    # Make predictions
    predictions = model.predict([premise_seqs, hypothesis_seqs, numeric_feats])

    # Convert predictions to binary labels (0 or 1)
    binary_predictions = (predictions > 0.5).astype(int).flatten()

    return binary_predictions


In [None]:
# Run the inference pipeline on the user-supplied CSV; the resulting labels
# are stored in `prediction_labels`.
prediction_labels = get_predictions(USER_PATH)

# Output text

In [None]:
# Write the labels to OUTPUT_PATH as a single-column CSV without the index.
columns = ['prediction']
df = pd.DataFrame(data=prediction_labels, columns=columns)
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)