In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
import ssl

# Workaround for SSL certificate errors during the NLTK downloads below.
# When this Python build exposes ssl._create_unverified_context, it is
# installed as the ssl module's default HTTPS context factory.
# NOTE: this skips certificate verification for any code in this kernel that
# relies on that default, not only for the NLTK downloads.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Corpora needed later: the stop-word list and the WordNet data used by the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the datasets (paths are relative to the notebook's working directory)
df_true = pd.read_csv('datasets/isot/True.csv')
df_fake = pd.read_csv('datasets/isot/Fake.csv')

# Add labels: 1 marks rows from True.csv, 0 marks rows from Fake.csv
df_true['label'] = 1
df_fake['label'] = 0

# Drop unnecessary columns. Reassigning (instead of inplace=True) keeps each
# statement a plain "new frame from old frame" step.
df_true = df_true.drop(columns=['subject', 'date'])
df_fake = df_fake.drop(columns=['subject', 'date'])

# Combine and shuffle the datasets. The shuffle is seeded so the row order,
# and therefore any later positional train/test split, is identical on every
# Restart & Run All.
df = (
    pd.concat([df_true, df_fake])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Dataset shape:", df.shape)

In [None]:
# Text preprocessing function
# Shared resources for preprocess_text. They are built lazily on the first call
# and reused afterwards, so the lemmatizer and the stop-word set are created
# once rather than once per document.
_PREPROCESS_RESOURCES = {}
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before storing either, so a failure (e.g. a
        # missing NLTK corpus) never leaves a half-filled cache behind.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. Coerce the input with ``str()`` and lowercase it.
      2. Delete every character that is not an ASCII letter or whitespace
         (characters are removed, not replaced by a space).
      3. Split on whitespace, drop English stop words, lemmatize the rest.

    Args:
        text: The raw document. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    lemmatizer, stop_words = _get_preprocess_resources()

    # Convert to lowercase
    text = str(text).lower()

    # Remove special characters and numbers
    text = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize, remove stopwords, and lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    return ' '.join(words)

# Apply preprocessing to text column
df['text'] = df['text'].apply(preprocess_text)

# Tokenization parameters
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 200

# Split row positions first, so the tokenizer vocabulary can be learned from
# the training rows only. Fitting on the full corpus would let test-set text
# influence which words make it into the vocabulary (data leakage).
train_rows, test_rows = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Initialize tokenizer and fit it on training text only
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'].iloc[train_rows])

# Convert every document to a sequence of word indices; words unseen during
# fitting (or outside the vocabulary limit) map to the '<OOV>' token.
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad / truncate at the end so every sequence has MAX_SEQUENCE_LENGTH entries
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Materialize the split using the row positions chosen above
X_train, X_test = padded_sequences[train_rows], padded_sequences[test_rows]
y_train, y_test = df['label'].iloc[train_rows], df['label'].iloc[test_rows]

# Print shapes to verify
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

In [None]:
# Model parameters
EMBEDDING_DIM = 100
# The tokenizer was created with num_words=MAX_VOCAB_SIZE, so it only ever
# emits indices below MAX_VOCAB_SIZE. Sizing the embedding by the full
# word_index would allocate rows that can never be looked up, so cap it.
vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer.word_index) + 1)

# Create the BiLSTM model
BILSTM_model = Sequential([
    # Embedding layer: word index -> EMBEDDING_DIM-dimensional vector
    Embedding(vocab_size, EMBEDDING_DIM),

    # First BiLSTM layer; return_sequences=True passes the full sequence on
    # so the next recurrent layer has per-timestep input
    Bidirectional(LSTM(64, return_sequences=True)),

    # Second BiLSTM layer; returns only its final output
    Bidirectional(LSTM(32)),

    # Dense layers with dropout
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit for the binary label
    Dense(1, activation='sigmoid')
])

# Build the model with explicit input shape so summary() can report layer shapes
BILSTM_model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))

# Compile the model
BILSTM_model.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# Print model summary
BILSTM_model.summary()

In [None]:
# Training parameters
EPOCHS = 10
BATCH_SIZE = 64

# Fit the model; Keras holds out 20% of the training data for validation
history = BILSTM_model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    verbose=1,
)

# Training curves: one panel per metric, training vs. validation in each.
# Each entry: (train key, validation key, train label, validation label, title, y-axis label)
curve_panels = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
]

plt.figure(figsize=(12, 4))
for panel_position, panel in enumerate(curve_panels, start=1):
    train_key, val_key, train_label, val_label, panel_title, y_label = panel
    plt.subplot(1, 2, panel_position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

# Score the trained model on the held-out test set
test_loss, test_accuracy = BILSTM_model.evaluate(X_test, y_test)
print(f'\nTest Accuracy: {test_accuracy:.4f}')

In [10]:
import os
import pickle

MODEL_PATH = 'models/ISOT/BiLSTM_ISOT_model.pkl'
TOKENIZER_PATH = 'models/ISOT/BiLSTM_ISOT_tokenizer.pkl'

# Create the output directory if needed; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(TOKENIZER_PATH), exist_ok=True)

# Save the model
# NOTE(review): whether a Keras model pickles cleanly depends on the installed
# Keras/TensorFlow version - confirm. The pickle format and path are kept
# unchanged so anything that already loads this .pkl keeps working.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(BILSTM_model, f)

# Save the tokenizer (must be reloaded together with the model so new text is
# mapped to the same word indices)
# Security: only unpickle these files from a trusted source; pickle.load can
# execute arbitrary code.
with open(TOKENIZER_PATH, 'wb') as f:
    pickle.dump(tokenizer, f)