# Demo for Solution A – BiLSTM Model
# Run this notebook to generate predictions on the test set.

In [None]:
!pip install  tensorflow  pandas nltk numpy matplotlib scikit-learn sentencepiece tokenizers
!pip install -U spacy[cuda12x]
!python -m spacy download en_core_web_sm
!pip install -q gdown

In [None]:
import pandas as pd
import regex as re
import numpy as np
import nltk
import os
import tensorflow as tf
import spacy
import gdown
# Recent NLTK releases (3.9+) load the sentence tokenizer from 'punkt_tab' rather
# than the pickled 'punkt' models, so fetch both; word_tokenize then works on
# old and new NLTK versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

In [None]:
# Cell 3: Settings
# Dimensionality of the GloVe vectors used by this notebook.
EMBEDDING_DIM = 300

# Local file names for the embeddings and the trained model.
GLOVE_PATH = 'glove.6B.300d.txt'
MODEL_PATH = 'bilstm_model.pt'

# Input and output CSV file names.
TEST_PATH = 'test.csv'
OUTPUT_PATH = 'predictions.csv'

# Input file

In [None]:
# CSV file to load as the user data; change this to your user data path.
USER_PATH = 'dev.csv'
User_csv = pd.read_csv(filepath_or_buffer=USER_PATH)


# Download From Cloud

In [None]:
# GloVe Embeddings (300D)
glove_id = "1iVUBiXUgN__xN_x0usyXt_otb_RWAenZ"
glove_output = "glove.6B.300d.txt"

# Trained BiLSTM Model
model_id = "1-1So2oUrg6U0Hd1r_dl79lXxMs5K0vWZ"
model_output = "bilstm_model.pt"

# Fetch each asset from Google Drive, skipping any file that is already present.
for drive_id, destination in ((glove_id, glove_output), (model_id, model_output)):
    if os.path.exists(destination):
        continue
    gdown.download(f"https://drive.google.com/uc?id={drive_id}", destination, quiet=False)

# Cleaning 


In [None]:
def clean_text(text):
    """Normalize ``text`` and split it into tokens.

    The input is coerced with ``str()`` and lowercased. Every character other
    than a-z, 0-9, whitespace and the punctuation ``, . ! ? '`` is replaced by
    a space, runs of whitespace are collapsed to a single space, and the
    result is tokenized with ``nltk.word_tokenize``. No stopword removal or
    lemmatization is performed.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    lowered = str(text).lower()

    # Replace everything except letters, digits, whitespace and , . ! ? ' with a space.
    filtered = re.sub(r"[^a-z0-9,.!?'\s]", ' ', lowered)

    # Collapse whitespace runs and trim both ends.
    normalized = re.sub(r'\s+', ' ', filtered).strip()

    return nltk.word_tokenize(normalized)

In [None]:
# Replace each 'text' entry with its cleaned token list.
# NOTE(review): cleaning targets 'text' while the filter below inspects 'premise';
# confirm both columns exist in the input CSV.
User_csv['text'] = User_csv['text'].apply(clean_text)

# Keep only rows whose 'premise' is present and not blank after stripping whitespace.
premise_present = User_csv['premise'].notna()
premise_nonblank = User_csv['premise'].str.strip().ne('')
User_csv = User_csv[premise_present & premise_nonblank]


# GloVe embeddings

In [None]:
# Location of the GloVe vectors. The original sub-directory path is used when it
# exists; otherwise fall back to GLOVE_PATH, which is where the download cell
# saves the file (the working directory).
glove = f"./glove_embeddings/glove.6B.{EMBEDDING_DIM}d.txt"
if not os.path.exists(glove):
    glove = GLOVE_PATH
def load_glove(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is read as a token followed by its vector components.
    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of raising ``IndexError``.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token (str) to a 1-D ``np.float32`` array. If a
        token occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_dict = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line; indexing values[0] would fail.
                continue
            word = values[0]
            # float32 halves the memory footprint compared with the float64 default.
            embeddings_dict[word] = np.array(values[1:], dtype=np.float32)
    return embeddings_dict