### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc

tqdm.pandas()
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Divyanshu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Divyanshu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Divyanshu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Divyanshu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### importing data and splitting into train and validation

In [2]:
train=pd.read_csv(r"C:\Users\Divyanshu\Downloads\archive\train.csv", header=None, names=["label", "text"],quotechar='"',encoding='utf-8')
test=pd.read_csv(r"C:\Users\Divyanshu\Downloads\archive\test.csv", header=None, names=["label", "text"],quotechar='"',encoding='utf-8')
train['label'] = train['label'].map({1: 0, 2: 1})
test['label'] = test['label'].map({1: 0, 2: 1})
train_texts, val_texts, train_labels, val_labels = train_test_split(train['text'], train['label'], test_size=0.1, random_state=42)


### Text Preprocessing

In [3]:
# Helper function to convert POS tags to WordNet format
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess(text):
    #remove HTML tags
    text=re.sub('<[^>]*>','',text)
    #remove non-alphanumeric characters and convert into lowercase
    text=re.sub('[^a-zA-Z]',' ',text).lower()
    #tokenization
    words=word_tokenize(text)
    #Remove stopwords
    stop_words=set(stopwords.words('english'))
    words=[word for word in words if word not in stop_words]
    # Lemmatization with POS tagging
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    
    preprocessed_text=' '.join(words)
    return preprocessed_text


### Parameters

In [None]:
max_words = 20000
max_len = 200
embedding_dim = 300
BATCH_SIZE = 512

### Preprocessing and Tokenising

In [None]:
print("Preprocessing train texts...")
train_texts_clean = [preprocess(text) for text in train_texts]
print("Preprocessing validation texts...")
val_texts_clean = [preprocess(text) for text in val_texts]
print("Preprocessing test texts...")
test_texts_clean = [preprocess(text) for text in test['text']]

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test['label'])

#Tokenizer 
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts_clean)

### Word2Vec

In [None]:
print("Loading Word2Vec model...")
w2v_model = api.load("word2vec-google-news-300")

word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        continue
    if word in w2v_model:
        embedding_matrix[i] = w2v_model[word]

### Preprocessing and converting into one-hot vector

In [None]:
def data_generator(texts, labels, tokenizer, max_len):
    for text, label in zip(texts, labels):
        seq = tokenizer.texts_to_sequences([text])
        pad_seq = pad_sequences(seq, maxlen=max_len)
        yield pad_seq[0], label

output_signature = (
    tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
    tf.TensorSpec(shape=(), dtype=tf.int32)
)

train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_texts_clean, y_train, tokenizer, max_len),
    output_signature=output_signature
).shuffle(10000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(val_texts_clean, y_val, tokenizer, max_len),
    output_signature=output_signature
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_texts_clean, y_test, tokenizer, max_len),
    output_signature=output_signature
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

### LSTM Model

In [None]:
model = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    LSTM(128, activation='tanh'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=25,
    callbacks=[early_stop]
)

### Evaluation and Analysis

In [None]:
#Evaluate on test set 
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss:.4f} | Test Accuracy: {accuracy:.4f}")

#Gather predictions and true labels
true_labels = []
pred_probs = []

for x_batch, y_batch in test_dataset:
    preds = model.predict(x_batch).flatten()
    pred_probs.extend(preds)
    true_labels.extend(y_batch.numpy())

true_labels = np.array(true_labels)
pred_probs = np.array(pred_probs)
pred_labels = (pred_probs >= 0.5).astype(int)

#Confusion Matrix
cm = confusion_matrix(true_labels, pred_labels)
print("Confusion Matrix:")
print(cm)
plt.figure(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

#Classification Report 
report = classification_report(true_labels, pred_labels, target_names=['Negative', 'Positive'])
print("Classification Report:")
print(report)

#PR-AUC Curve 
precision, recall, thresholds = precision_recall_curve(true_labels, pred_probs)
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")

plt.figure(figsize=(6, 4))
plt.plot(recall, precision, label=f'PR-AUC = {pr_auc:.4f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()