# Arabic Fake News Detection using LSTM

This notebook implements an LSTM model to detect Arabic fake news. The dataset is preprocessed, and various NLP techniques are applied before training the model.

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Step 1: Load the Arabic stopwords from NLTK

In [None]:
# Arabic stopwords are used later to filter out common words that carry
# little meaning on their own.
try:
    stop_words = set(stopwords.words('arabic'))
except LookupError:
    # The NLTK stopwords corpus is not installed in this environment: fetch it
    # once and retry. If the download does not succeed, the retry raises
    # LookupError again, exactly as the first attempt did.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('arabic'))

### Step 2: Specify the path to your dataset directory

In [None]:
# Root directory of the dataset: it holds one sub-folder per news source, and
# each sub-folder is looked up for that source's scraped_articles.json.
dataset_dir = "/kaggle/input/arabic-fake-news-dataset-afnd/AFND/Dataset"

### Step 3: Read sources.json

In [None]:
# sources.json maps every source name to its credibility label
# (credible, not credible, undecided).
sources_file_path = '/kaggle/input/arabic-fake-news-dataset-afnd/AFND/sources.json'
with open(sources_file_path, 'r', encoding='utf-8') as sources_file:
    sources_data = json.load(sources_file)

# One row per (source, label) pair, so the labels can be joined onto articles.
sources_df = pd.DataFrame(
    [(name, label) for name, label in sources_data.items()],
    columns=['source', 'label'],
)

### Step 4: Read scraped_articles.json for each source

In [None]:
# Collect the articles of every known source into one flat list.
articles_data = []
for source in sources_df['source']:
    scraped_articles_path = os.path.join(dataset_dir, source, 'scraped_articles.json')

    # Sources without a scraped_articles.json are skipped.
    if not os.path.exists(scraped_articles_path):
        continue

    with open(scraped_articles_path, 'r', encoding='utf-8') as articles_file:
        source_articles_dict = json.load(articles_file)

    # A file without an "articles" key contributes no articles.
    source_articles_list = source_articles_dict.get("articles", [])

    # Tag each article with its source so it can be matched to a label later.
    for article in source_articles_list:
        article['source'] = source

    articles_data.extend(source_articles_list)

# One row per article, ready for merging with the source labels.
articles_df = pd.DataFrame(articles_data)

### Step 5: Merge the articles with their sources (labels)

In [None]:
# Attach the credibility label of each article's source by joining the two
# frames on their shared 'source' column; articles without a matching source
# are dropped by the inner join.
merged_df = articles_df.merge(
    sources_df,
    how='inner',
    left_on='source',
    right_on='source',
)

# Optional: peek at the first rows and the overall size of the merged frame.
merged_df.head()
merged_df.shape

### Step 6: Plot the distribution of article labels

In [None]:
# One bar per credibility label: (bar caption, label value in the data, color).
label_bars = (
    ('Not Credible Articles', 'not credible', 'orange'),
    ('Credible Articles', 'credible', 'green'),
    ('Undecided Articles', 'undecided', 'gray'),
)

plt.figure(figsize=(10, 5))
for bar_name, label_value, bar_color in label_bars:
    # Height of the bar is the number of articles carrying this label.
    article_count = len(merged_df[merged_df['label'] == label_value])
    plt.bar(bar_name, article_count, color=bar_color)
plt.title('Distribution of Articles', size=15)
plt.xlabel('Articles Type', size=15)
plt.ylabel('# of Articles', size=15)

### Step 7: Data Preparation

In [None]:
# Keep only rows that actually have article text: a missing value (None/NaN)
# has no .lower()/.split() and would abort the preprocessing below. The same
# mask is applied to the labels so X and y stay aligned row for row.
has_text = merged_df['text'].notna()
X = merged_df.loc[has_text, 'text'].astype(str)  # Features: the text of the articles
y = merged_df.loc[has_text, 'label']  # Labels: credibility labels

# Stemming reduces words to their root form, which shrinks the vocabulary.
stemmer = ISRIStemmer()


def _preprocess_text(text):
    """Lowercase *text*, drop Arabic stopwords, and stem the remaining words."""
    # Lowercasing makes any cased characters in the text uniform.
    words = text.lower().split()
    # Stopwords are removed before stemming, so the lookup sees the words as
    # they appear in the text rather than their stems.
    kept_words = [word for word in words if word not in stop_words]
    return ' '.join([stemmer.stem(word) for word in kept_words])


# Single pass over the corpus instead of three separate apply() passes.
X = X.apply(_preprocess_text)

### Step 8: Convert labels to numerical format using LabelEncoder

In [None]:
# The model consumes integer class ids, so map each credibility label to a
# number; the fitted encoder is kept to translate ids back to label names.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

### Step 9: Split the data into training and testing sets

In [None]:
# Hold out 20% of the samples for evaluating the model; the remaining 80% is
# used for training. The fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

### Step 10: Tokenization and Padding

In [None]:
# Tokenization turns each text into a sequence of integer word ids; padding
# then appends zeros to shorter sequences so every sequence has equal length.
max_vocab = 10000  # Maximum number of words to keep in the vocabulary
max_sequence_length = 128  # Set an appropriate value for sequence length

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Apply identical padding settings to both splits.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    for sequences in (X_train_sequences, X_test_sequences)
)

### Step 11: Build the LSTM Model

In [None]:
# LSTM (Long Short-Term Memory) is a type of RNN (Recurrent Neural Network) suitable for sequential data like text.

# The tokenizer was created with num_words=max_vocab, so the sequences only
# contain word ids below max_vocab. Sizing the embedding table from the full
# word_index would allocate rows for words that never reach the model.
vocab_size = min(max_vocab, len(tokenizer.word_index) + 1)

# One output unit per distinct encoded label.
num_classes = len(np.unique(y_encoded))

model = Sequential()

# Embedding layer: converts word ids into dense vectors of fixed size
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length))

# SpatialDropout1D layer for regularization (helps prevent overfitting)
model.add(SpatialDropout1D(0.2))

# Two stacked LSTM layers with 64 units each
model.add(LSTM(units=64, return_sequences=True))  # First LSTM layer returns sequences
model.add(LSTM(units=64))  # Second LSTM layer does not return sequences

# Dense layer with softmax activation for multi-class classification
model.add(Dense(units=num_classes, activation='softmax'))

### Step 12: Compile the Model

In [None]:
# Configure training: Adam optimizer, a loss that works directly on integer
# class ids, and accuracy as the reported metric.
adam_optimizer = tf.keras.optimizers.Adam(1e-4)
model.compile(
    optimizer=adam_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the architecture of the model, including its layers and parameter counts.
model.summary()

### Step 13: Train the Model

In [None]:
# Stop training once the validation loss has not improved for 2 epochs, and
# roll the model back to the weights of its best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train for at most 10 epochs, holding out 20% of the training data for validation.
fit_options = dict(
    epochs=10,
    validation_split=0.2,
    batch_size=64,
    shuffle=True,
    callbacks=[early_stop],
)
model.fit(X_train_padded, y_train, **fit_options)

### Step 14: Evaluate the Model

In [None]:
# After training, evaluate the model's performance on the held-out test data.
y_pred_probs = model.predict(X_test_padded)
# The predicted class is the one with the highest probability in each row.
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate and display the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Display a detailed classification report. The rows are named after the
# original labels instead of their encoded integers; passing `labels`
# explicitly keeps the names aligned with the encoder's class order even if a
# class happens to be absent from the test split.
class_names = list(label_encoder.classes_)
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

### Step 15: Plot the Confusion Matrix

In [None]:
# The confusion matrix counts, for every true label, how often each label was predicted.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = label_encoder.classes_

# Draw the matrix as an annotated heatmap with the original label names on both axes.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('Predicted Labels')
heatmap_ax.set_ylabel('True Labels')
heatmap_ax.set_title('Confusion Matrix')
plt.show()