# In [None]:
import pandas as pd
import numpy as np
import re
import string
import collections
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate

import pandas as pd

# Load the train and test splits (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

# Quick inspection of the training frame. `head()` only displays something
# when this runs as the last expression of a notebook cell; `info()` prints.
df_train.head()

df_train.info()

# Value counts of column '1' (used as the label column further down) for
# each split, printed one after the other.
print(
    *(frame['1'].value_counts() for frame in (df_train, df_test)),
    sep='\n',
)

# Preprocess text
# Download the NLTK stop-word corpus, then collect the English entries into
# a set so the membership checks in preprocess_text are constant-time.
nltk.download('stopwords')
stop_words = set()
stop_words.update(stopwords.words('english'))

# Built once at import time: preprocess_text runs once per DataFrame row, so
# rebuilding the pattern and the translation table on every call is wasted work.
_DIGIT_RUN = re.compile(r'\d+')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw document before tokenization.

    Steps, in order: remove runs of digits, delete every character in
    ``string.punctuation``, lowercase, split on whitespace, drop tokens that
    are in the module-level ``stop_words`` set, and join the remaining
    tokens with single spaces.

    Args:
        text: Raw document. Any non-string value (for example ``None`` or
            the float NaN pandas produces for a missing CSV cell) is treated
            as empty text instead of raising ``TypeError``.

    Returns:
        The cleaned, space-joined string; ``''`` when no token survives.
    """
    if not isinstance(text, str):
        # Regex substitution and str.translate both require a string; a
        # missing cell would otherwise abort the whole DataFrame.apply call.
        return ''

    cleaned = _DIGIT_RUN.sub('', text)
    # NOTE(review): punctuation (including apostrophes) is deleted before the
    # stop-word check, so any stop-word entries that themselves contain an
    # apostrophe cannot match here -- confirm whether that is intended.
    cleaned = cleaned.translate(_STRIP_PUNCTUATION).lower()
    return ' '.join(token for token in cleaned.split() if token not in stop_words)

# Clean the raw text column ('0') of both splits element by element and keep
# the result next to the original in a new 'processed_text' column.
df_train['processed_text'] = df_train['0'].map(preprocess_text)
df_test['processed_text'] = df_test['0'].map(preprocess_text)

# Train Word2Vec model
embedding_dim = 100

# Word2Vec expects an iterable of token lists. Passing the Series of strings
# directly makes it iterate each document character by character, which
# yields a vocabulary of single characters instead of words.
train_tokens = [text.split() for text in df_train['processed_text']]
word2vec_model = Word2Vec(
    sentences=train_tokens,
    vector_size=embedding_dim,  # must equal the width of embedding_matrix
    window=5,
    min_count=1,
    workers=4,
)
word2vec_model.save("word2vec.model")

# Tokenize and pad sequences
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['processed_text'])

train_seq = tokenizer.texts_to_sequences(df_train['processed_text'])
test_seq = tokenizer.texts_to_sequences(df_test['processed_text'])

# Pad/truncate every sequence to the mean training length plus a margin of
# 100 tokens.
average_len = np.mean([len(seq) for seq in train_seq])
max_len = int(average_len + 100)

train_pad = pad_sequences(train_seq, maxlen=max_len, padding='post')
test_pad = pad_sequences(test_seq, maxlen=max_len, padding='post')

# Create embedding matrix
# Rows must be addressed by the tokenizer's ids, because those ids are what
# the padded sequences contain and what the Embedding layer looks up. Using
# Word2Vec's own ordering would attach vectors to the wrong words.
word_index = tokenizer.word_index

# Row 0 stays all-zero: it is the value used for padding.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i >= vocab_size:
        # No row exists in the matrix for ids at or beyond vocab_size.
        continue
    try:
        embedding_vector = word2vec_model.wv[word]
    except KeyError:
        # Tokens Word2Vec never saw (e.g. the '<OOV>' placeholder) get a
        # random vector rather than sharing the all-zero padding row.
        embedding_vector = np.random.normal(0, np.sqrt(0.25), embedding_dim)
    embedding_matrix[i] = embedding_vector

from tensorflow.keras.utils import to_categorical

# Categorical (one column per class) encodings of label column '1' for each
# split. The visible training call below passes the raw label values
# instead, so these are kept only for code that may use them elsewhere.
train_label, test_label = (
    to_categorical(frame['1']) for frame in (df_train, df_test)
)

# Raw training labels as an array; the bare expression only displays the
# array when it is the last statement of a notebook cell.
y_train = df_train['1'].values
y_train

# Build the model
# Layers in order: frozen embedding initialised from embedding_matrix, two
# stacked bidirectional LSTMs (the first returns the full sequence so the
# second can consume it), then a dense head ending in one sigmoid unit.
layer_stack = [
    Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.3),
    Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
model = tf.keras.models.Sequential(layer_stack)

model.summary()

# Binary cross-entropy matches the single sigmoid output unit.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

# Train on the padded sequences; 10% of the training data is held out for
# validation by Keras.
history = model.fit(
    x=train_pad,
    y=df_train['1'].values,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

# Plot training history
def _plot_training_curve(history_keys, title, y_label, legend_loc):
    """Plot the training and validation series of one metric from ``history``.

    Args:
        history_keys: Pair of keys into ``history.history``, training key
            first and validation key second.
        title: Figure title.
        y_label: Label for the y axis.
        legend_loc: Matplotlib legend location string.
    """
    plt.style.use('dark_background')
    for key in history_keys:
        plt.plot(history.history[key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Training', 'Validation'], loc=legend_loc)
    plt.show()


_plot_training_curve(('loss', 'val_loss'), 'Model loss', 'Loss', 'upper right')

_plot_training_curve(('accuracy', 'val_accuracy'), 'Model Accuracy', 'Accuracy', 'lower right')



# Evaluate the model
# Ground-truth labels of the test split (column '1').
true_labels = np.array(df_test['1'])

# Pad the test sequences to the training length and run inference.
test_seq_padded = pad_sequences(test_seq, maxlen=max_len, padding='post')
test_seq_np = np.array(test_seq_padded)
predictions = model.predict(test_seq_np)

# The model ends in a sigmoid unit, so rounding turns each output into a
# 0/1 class decision.
predicted_labels = np.round(predictions)

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(true_labels, predicted_labels)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", annot_kws={"size": 16})
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Summary metrics, computed in the same order as they are reported below.
accuracy, precision, recall, f1 = (
    score(true_labels, predicted_labels)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

table = [
    [label, value]
    for label, value in zip(
        ("Accuracy", "Precision", "Recall", "F1-score"),
        (accuracy, precision, recall, f1),
    )
]

print(tabulate(table, headers=["Metric", "Value"], tablefmt="fancy_grid"))