In [None]:
# --- 1. Setup and Data Loading (Reusing initial steps) --

In [3]:
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
print("--- 1. Data Loading and Initial Cleaning ---")

--- 1. Data Loading and Initial Cleaning ---


In [8]:
# We define the filename here. If you move the file, change this variable
# to the correct path (e.g., file_name = 'data/fake reviews dataset.csv')
file_name = 'fake reviews dataset.csv'
try:
    if not os.path.exists(file_name):
        raise FileNotFoundError(f"File not found: {file_name}")
    df = pd.read_csv(file_name)
    df['label_encoded'] = df['label'].map({'OR': 1, 'CG': 0})
    df['text_'] = df['text_'].fillna('')


    # Define and apply the same cleaning function as in the SVM script
    def preprocess_text(text):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    df['cleaned_text'] = df['text_'].apply(preprocess_text)

    X_text = df['cleaned_text'].values
    Y = df['label_encoded'].values

    print(f"Data loaded and cleaned. Total samples: {len(df)}")

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Cannot proceed with CNN-RNN without data. Please check 'file_name' variable.")
    exit() # Stop execution if data is missing

Data loaded and cleaned. Total samples: 40432


In [10]:
# --- 2. Deep Learning Data Preparation (Tokenization & Embeddings) ---
print("\n--- 2. Data Tokenization and Sequence Padding ---")



--- 2. Data Tokenization and Sequence Padding ---


In [11]:
# Parameters for deep learning preprocessing
MAX_WORDS = 10000        # Max number of words to keep in the vocabulary (V)
MAX_SEQUENCE_LENGTH = 150 # Max length of a review sequence (T). Adjust this if reviews are much longer/shorter.
EMBEDDING_DIM = 100       # Dimension of the word embedding vector (D)

In [12]:
# 2.1 Tokenizer: Creates a dictionary mapping words to integer IDs
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>") # <unk> is for unknown words
tokenizer.fit_on_texts(X_text)

In [13]:
# 2.2 Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(X_text)

In [14]:
# 2.3 Pad Sequences: Ensures all reviews have the same length (T)
# Pre-padding is generally preferred for RNNs
X_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='pre', truncating='pre')
print(f"Vocabulary Size: {len(tokenizer.word_index)} unique tokens")
print(f"Padded Input Shape: {X_padded.shape}")

Vocabulary Size: 48455 unique tokens
Padded Input Shape: (40432, 150)


In [15]:
# --- 3. Train/Test Split ---

# Use the same 80/20 split as the SVM for fair comparison
X_train, X_test, Y_train, Y_test = train_test_split(
    X_padded, Y,
    test_size=0.2,
    random_state=42,
    stratify=Y
)
print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

Training set size: 32345, Testing set size: 8087


In [16]:
# --- 4. Hybrid CNN-RNN Model Architecture ---

print("\n--- 4. Building Hybrid CNN-RNN Model ---")


--- 4. Building Hybrid CNN-RNN Model ---


In [23]:
model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [24]:
# --- 5. Model Training ---
print("\n--- 5. Training Hybrid CNN-RNN Model ---")


--- 5. Training Hybrid CNN-RNN Model ---


In [25]:
# Use Early Stopping to prevent overfitting and save computation time
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, Y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1, # Use 10% of training data for validation during training
    callbacks=[early_stopping],
    verbose=1
)



Epoch 1/10
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 88ms/step - accuracy: 0.8881 - loss: 0.2776 - val_accuracy: 0.9258 - val_loss: 0.1822
Epoch 2/10
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 92ms/step - accuracy: 0.9387 - loss: 0.1674 - val_accuracy: 0.9277 - val_loss: 0.1795
Epoch 3/10
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 90ms/step - accuracy: 0.9650 - loss: 0.0985 - val_accuracy: 0.9379 - val_loss: 0.1577
Epoch 4/10
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 90ms/step - accuracy: 0.9752 - loss: 0.0674 - val_accuracy: 0.9419 - val_loss: 0.1673
Epoch 5/10
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 88ms/step - accuracy: 0.9838 - loss: 0.0438 - val_accuracy: 0.9382 - val_loss: 0.1997
Epoch 6/10
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 88ms/step - accuracy: 0.9903 - loss: 0.0288 - val_accuracy: 0.9425 - val_loss: 0.2161


In [26]:
# --- 6. Model Evaluation and Prediction ---
print("\n--- 6. Evaluating CNN-RNN Model ---")


--- 6. Evaluating CNN-RNN Model ---


In [27]:
# Evaluate on the dedicated test set
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"CNN-RNN Test Loss: {loss:.4f}")
print(f"CNN-RNN Test Accuracy: {accuracy:.4f}")

CNN-RNN Test Loss: 0.1437
CNN-RNN Test Accuracy: 0.9470


In [28]:
# Generate class predictions (0 or 1) and probability predictions
Y_pred_prob = model.predict(X_test).flatten()
Y_pred_class = (Y_pred_prob > 0.5).astype("int32")

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step


In [29]:
# Display the Classification Report
from sklearn.metrics import classification_report
print("\n--- Results: CNN-RNN Classification Report ---")
print(classification_report(Y_test, Y_pred_class, target_names=['Real (0)', 'Fake (1)']))


--- Results: CNN-RNN Classification Report ---
              precision    recall  f1-score   support

    Real (0)       0.95      0.94      0.95      4044
    Fake (1)       0.94      0.95      0.95      4043

    accuracy                           0.95      8087
   macro avg       0.95      0.95      0.95      8087
weighted avg       0.95      0.95      0.95      8087



In [30]:
# --- 7. Save Predictions for Ensemble Voting ---
# It is critical to save the PROBABILITIES, not just the classes (0 or 1),
# for the final Ensemble Voting model.
# Saving to a temporary DataFrame for easy access in the Ensemble script later.
cnn_rnn_predictions = pd.DataFrame({
    'true_label': Y_test,
    'cnn_rnn_prob': Y_pred_prob,
    'cnn_rnn_class': Y_pred_class
})

In [33]:
# In a real thesis, you would save this to a file for the next script:
cnn_rnn_predictions.to_csv('cnn_rnn_predictions_test.csv', index=False)
print("\nCNN-RNN predictions saved 'cnn_rnn_predictions_test.csv' for Ensemble stage.")


CNN-RNN predictions saved 'cnn_rnn_predictions_test.csv' for Ensemble stage.
