# DEEP LEARNING MODELS — TEXT CLASSIFICATION WITH TENSORFLOW/KERAS

### LIBRARY

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score

### LOAD CLEANED DATA

In [2]:
df = pd.read_csv("../DATA/DATA[C].csv")  # CLEANED TEXT DATASET
X = df['clean_text']
y = df['label']

### TOKENIZATION & PADDING

In [4]:
# MAKE SURE X IS ALL STRINGS AND NO NaNs
X = X.fillna("").astype(str)

# HYPERPARAMETERS
MAX_WORDS = 5000
MAX_LEN = 200

# TOKENIZATION
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X)

# SEQUENCES AND PADDING
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN)

# SPLIT
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42, stratify=y)

# INFO
print("VOCAB SIZE:", len(tokenizer.word_index))
print("X_train SHAPE:", X_train.shape)

VOCAB SIZE: 198633
X_train SHAPE: (30917, 200)


### DENSE NEURAL NETWORK

In [5]:
model_dense = Sequential([
    Embedding(MAX_WORDS, 64, input_length=MAX_LEN),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_dense.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# TRAIN
model_dense.fit(X_train, y_train, validation_split=0.1, epochs=5, batch_size=128, callbacks=[EarlyStopping(patience=2)])



Epoch 1/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7567 - loss: 0.4839 - val_accuracy: 0.9764 - val_loss: 0.0704
Epoch 2/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9823 - loss: 0.0604 - val_accuracy: 0.9822 - val_loss: 0.0498
Epoch 3/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9890 - loss: 0.0401 - val_accuracy: 0.9848 - val_loss: 0.0441
Epoch 4/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9911 - loss: 0.0299 - val_accuracy: 0.9854 - val_loss: 0.0404
Epoch 5/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9951 - loss: 0.0184 - val_accuracy: 0.9848 - val_loss: 0.0415


<keras.src.callbacks.history.History at 0x1d6d1bcb1d0>

### EVALUATE DENSE MODEL

In [6]:
y_pred_dense = model_dense.predict(X_test)
y_pred_dense = (y_pred_dense > 0.5).astype(int)

print("DENSE NN ACCURACY -->", accuracy_score(y_test, y_pred_dense))
print(classification_report(y_test, y_pred_dense, target_names=["Fake", "Real"]))

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
DENSE NN ACCURACY --> 0.98745148771022
              precision    recall  f1-score   support

        Fake       0.99      0.98      0.99      3491
        Real       0.99      0.99      0.99      4239

    accuracy                           0.99      7730
   macro avg       0.99      0.99      0.99      7730
weighted avg       0.99      0.99      0.99      7730



### LSTM NETWORK

In [7]:
model_lstm = Sequential([
    Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# TRAIN
model_lstm.fit(X_train, y_train, validation_split=0.1, epochs=5, batch_size=128, callbacks=[EarlyStopping(patience=2)])

Epoch 1/5




[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 355ms/step - accuracy: 0.8961 - loss: 0.2954 - val_accuracy: 0.9735 - val_loss: 0.0863
Epoch 2/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 354ms/step - accuracy: 0.9682 - loss: 0.0944 - val_accuracy: 0.9719 - val_loss: 0.0741
Epoch 3/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 340ms/step - accuracy: 0.9572 - loss: 0.1115 - val_accuracy: 0.9780 - val_loss: 0.0734
Epoch 4/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 265ms/step - accuracy: 0.9814 - loss: 0.0578 - val_accuracy: 0.9764 - val_loss: 0.0747
Epoch 5/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 272ms/step - accuracy: 0.9765 - loss: 0.0762 - val_accuracy: 0.9767 - val_loss: 0.0810


<keras.src.callbacks.history.History at 0x1d6e15910a0>

### EVALUATE LSTM

In [8]:
y_pred_lstm = model_lstm.predict(X_test)
y_pred_lstm = (y_pred_lstm > 0.5).astype(int)

print("LSTM ACCURACY -->", accuracy_score(y_test, y_pred_lstm))
print(classification_report(y_test, y_pred_lstm, target_names=["Fake", "Real"]))

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step
LSTM ACCURACY --> 0.9751617076326002
              precision    recall  f1-score   support

        Fake       0.99      0.96      0.97      3491
        Real       0.97      0.99      0.98      4239

    accuracy                           0.98      7730
   macro avg       0.98      0.97      0.97      7730
weighted avg       0.98      0.98      0.98      7730



### 1D CONVOLUTIONAL NEURAL NETWORK

In [9]:
model_cnn = Sequential([
    Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
    Conv1D(64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# TRAIN
model_cnn.fit(X_train, y_train, validation_split=0.1, epochs=5, batch_size=128, callbacks=[EarlyStopping(patience=2)])

Epoch 1/5




[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 46ms/step - accuracy: 0.7948 - loss: 0.3825 - val_accuracy: 0.9835 - val_loss: 0.0509
Epoch 2/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 45ms/step - accuracy: 0.9906 - loss: 0.0336 - val_accuracy: 0.9858 - val_loss: 0.0428
Epoch 3/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 47ms/step - accuracy: 0.9979 - loss: 0.0107 - val_accuracy: 0.9854 - val_loss: 0.0462
Epoch 4/5
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 58ms/step - accuracy: 0.9994 - loss: 0.0047 - val_accuracy: 0.9877 - val_loss: 0.0501


<keras.src.callbacks.history.History at 0x1d6ff62b050>

### EVALUATE CNN

In [10]:
y_pred_cnn = model_cnn.predict(X_test)
y_pred_cnn = (y_pred_cnn > 0.5).astype(int)

print("CNN ACCURACY:", accuracy_score(y_test, y_pred_cnn))
print(classification_report(y_test, y_pred_cnn, target_names=["Fake", "Real"]))

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
CNN ACCURACY: 0.9873221216041397
              precision    recall  f1-score   support

        Fake       0.99      0.98      0.99      3491
        Real       0.98      0.99      0.99      4239

    accuracy                           0.99      7730
   macro avg       0.99      0.99      0.99      7730
weighted avg       0.99      0.99      0.99      7730

