In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Input
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


In [24]:
# Load data
df = pd.read_csv('data/995,000_row_cleaned.csv')

# Remove rows with ivalid values
label_map = {"fake": 1, "satire":1, "conspiracy": 1, "bias": 1, "rumor": 1, "junksci": 1, "hate": 1,  "clickbait": 1,   "political": 1,
             "reliable": 0} # unreliable isn't kept because it's not conclusive data 
df = df[df["type"].isin(label_map.keys())]  # Keep only rows with valid labels
print(df["type"].value_counts())
df["type"] = df["type"].map(label_map)
print(df["type"].value_counts())

# Split data 
texts = df["content"]
y = df["type"]

# Create a instances of the TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=10000,  # make sure the number of features is 10000
)

# Split data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, y, 
    test_size=0.2,
    shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5,
    shuffle=False)

# Fit and transform the vectorizer on the training data
X_train = vectorizer.fit_transform(X_train)  
X_val = vectorizer.transform(X_val)       
X_test = vectorizer.transform(X_test)  

  df = pd.read_csv('data/995,000_row_cleaned.csv')


type
reliable      218564
political     194518
bias          133232
fake          104883
conspiracy     97314
rumor          56445
clickbait      27412
junksci        14040
satire         13160
hate            8779
Name: count, dtype: int64
type
1    649783
0    218564
Name: count, dtype: int64


In [25]:
# Show the shape of the data
print(f"X_train.shape: {X_train.shape} \ny_train.shape: {y_train.shape} \n")
print(f"X_val.shape: {X_val.shape} \ny_val.shape: {y_val.shape}\n")
print(f"X_test.shape: {X_test.shape} \ny_test.shape: {y_test.shape}\n")


X_train.shape: (694677, 10000) 
y_train.shape: (694677,) 

X_val.shape: (86835, 10000) 
y_val.shape: (86835,)

X_test.shape: (86835, 10000) 
y_test.shape: (86835,)



In [26]:
# Standard model
# Define the model
input_layer = Input(shape=(X_train.shape[1],), sparse=True)
x1 = layers.Dense(1000, activation="relu")(input_layer) # 1st hidden layer with 1000 neurons
x1 = layers.Dropout(0.4)(x1)                            # Dropout layer to prevent overfitting 
x2 = layers.Dense(500, activation="relu")(x1)           # 2nd hidden layer with 500 neurons
x2 = layers.Dropout(0.2)(x2)                            # Dropout layer to prevent overfitting
x3 = layers.Dense(100, activation="relu")(x2)           # 3rd hidden layer with 100 neurons
x4 = layers.Dense(25, activation="relu")(x3)            # 4th hidden layer with 25 neurons
output = layers.Dense(1, activation="sigmoid")(x4)      # Output layer with 1 neuron and sigmoid to force binary

# Create the model
NN = tf.keras.models.Model(inputs=input_layer, outputs=output)

#Initialize the NN
NN.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics=[
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.F1Score()],
    weighted_metrics=["f1_score"],  # Weight the F1 metric higher
)

# Define the callbacks for early stopping
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_f1_score',     # Use F1 for early stopping
        mode='max',                 # We want to "maximize" the F1 score  
        patience=4,                 # If theres no improvements after 4 epochs, stop
        restore_best_weights=True
    )
]

# Define the weights for the classes
weight_for_0 = len(y_train) / (2.0 * (y_train == 0).sum())
weight_for_1 = len(y_train) / (2.0 * (y_train == 1).sum())
class_weights = {0: weight_for_0, 1: weight_for_1}

#Train the NN
history = NN.fit(
    X_train,y_train,                    # Trainin data as input and expected output
    validation_data = (X_val,y_val),    # Data for validation
    epochs = 10,                        # Number of iterations over the entire dataset
    batch_size = 1024,                  # Number of samples per gradient update
    callbacks=callbacks,
    class_weight=class_weights
)

Epoch 1/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 141ms/step - f1_score: 0.8583 - loss: 0.2301 - precision_1: 0.9679 - recall_1: 0.8892 - weighted_f1_score: 0.6664 - val_f1_score: 0.8477 - val_loss: 0.1406 - val_precision_1: 0.9779 - val_recall_1: 0.9498 - val_weighted_f1_score: 0.8477
Epoch 2/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 141ms/step - f1_score: 0.8645 - loss: 0.0973 - precision_1: 0.9867 - recall_1: 0.9682 - weighted_f1_score: 0.6777 - val_f1_score: 0.8572 - val_loss: 0.1218 - val_precision_1: 0.9810 - val_recall_1: 0.9559 - val_weighted_f1_score: 0.8572
Epoch 3/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 143ms/step - f1_score: 0.8771 - loss: 0.0410 - precision_1: 0.9953 - recall_1: 0.9864 - weighted_f1_score: 0.7018 - val_f1_score: 0.8575 - val_loss: 0.1163 - val_precision_1: 0.9755 - val_recall_1: 0.9706 - val_weighted_f1_score: 0.8575
Epoch 4/10
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [27]:
# Save the model
output_path = "models/nn_classifier11_f1_val_0.9214.keras"
NN.save(output_path) 

# Models 
## nn_classifier1_f1_val_0.9328.keras - runtime: 16,6 min
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   5 epochs, 256 bach size
-   val_accuracy: 0.9695 - val_auc: 0.9820 - val_f1_score: 0.9328 - val_loss: 0.1359 - val_precision: 0.9732 - val_recall: 0.9870
### Model evaluation key data
* Test Loss: 0.0917
* Test Accuracy: 0.9685
* Test Precision: 0.9696
* Test Recall: 0.9895
* Test AUC: 0.9901
* Test F1 Score: 0.8937

## nn_classifier2_f1_val_0.9304.keras - runtime: 10,75 min
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   5 epochs, 512 bach size
-   val_accuracy: 0.9690 - val_auc_2: 0.9802 - val_f1_score: 0.9304 - val_loss: 0.1467 - val_precision_2: 0.9744 - val_recall_2: 0.9851
### Model evaluation key data
* Test Loss: 0.0903
* Test Accuracy: 0.9685
* Test Precision: 0.9751
* Test Recall: 0.9836
* Test AUC: 0.9908
* Test F1 Score: 0.8951

## nn_classifier3_f1_val_0.9228.keras - runtime: 10,5 min
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 50 (ReLu)
-   5 epochs, 512 bach size
-   val_accuracy: 0.9703 - val_auc_4: 0.9828 - val_f1_score: 0.9228 - val_loss: 0.1242 - val_precision_4: 0.9752 - val_recall_4: 0.9860
### Model evaluation key data
* Test Loss: 0.0919
* Test Accuracy: 0.9689
* Test Precision: 0.9729
* Test Recall: 0.9866
* Test AUC: 0.9897
* Test F1 Score: 0.8829

## nn_classifier4_f1_val_0.9292
-   5000 (ReLu)(0.4 dropoff),1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   5 epochs, 512 bach size
-   val_accuracy: 0.9697 - val_auc_5: 0.9818 - val_f1_score: 0.9292 - val_loss: 0.1331 - val_precision_5: 0.9750 - val_recall_5: 0.9854
### Model evaluation key data
* Test Loss: 0.0919
* Test Accuracy: 0.9689
* Test Precision: 0.9729
* Test Recall: 0.9866
* Test AUC: 0.9897
* Test F1 Score: 0.8829

## nn_classifier5_f1_val_0.9292 - runtime 10,75
-   Alternate model
-   5 epochs, 512 bach size
-   val_accuracy: 0.9692 - val_auc_6: 0.9815 - val_f1_score: 0.9271 - val_loss: 0.1377 - val_precision_6: 0.9742 - val_recall_6: 0.9856
### Model evaluation key data
* Test Loss: 0.0919
* Test Accuracy: 0.9684
* Test Precision: 0.9737
* Test Recall: 0.9849
* Test AUC: 0.9901
* Test F1 Score: 0.8983

## nn_classifier6_f1_val_0.916 - 16 min
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   5 epochs, 256 bach size

## models/nn_classifier7_f1_val_0.9266.keras - 10 min - NO unreliable
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   5 epochs, 512 bach size

### Model evaluation key data
* Test Loss: 0.0928
* Test Accuracy: 0.9678
* Test Precision: 0.9744
* Test Recall: 0.9827
* Test AUC: 0.9901
* Test F1 Score: 0.8861

## nn_classifier8_f1_val_0.6775 - 5 min - ONLY reliable and fake
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   7 epochs, 1024 bach size

### Model evaluation key data
* Test Loss: 17.5059
* Test Accuracy: 0.3997
* Test Precision: 0.7829
* Test Recall: 0.2741
* Test AUC: 0.5415
* Test F1 Score: 0.6991

## nn_classifier9_f1_val_0.9214 - 11 min - NO unreliable
-   1000 (ReLu)(0.4 dropoff), 500 (ReLu)(0.2 dropoff), 100 (ReLu)(0.2 dropoff), 50 (ReLu)
-   7 epochs, 1024 bach size

### Model evaluation key data
* Test Loss: 0.0991
* Test Accuracy: 0.9665
* Test Precision: 0.9716
* Test Recall: 0.9840
* Test AUC: 0.9896
* Test F1 Score: 0.8901

In [None]:
# Load the model
input_path = "models/nn_alt_classifier2_f1_test_0.9234.keras"
NN = tf.keras.models.load_model(input_path)

In [None]:
# Evaluate on test set
test_results = NN.evaluate(X_test, y_test, verbose=1)
print(f"* Test Loss: {test_results[0]:.4f}")
print(f"* Test Accuracy: {test_results[1]:.4f}")
print(f"* Test Precision: {test_results[2]:.4f}")
print(f"* Test Recall: {test_results[3]:.4f}")
print(f"* Test AUC: {test_results[4]:.4f}")
print(f"* Test F1 Score: {test_results[5]:.4f}")

[1m2714/2714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - f1_score: 0.9257 - loss: 0.1946 - precision: 0.9701 - recall: 0.9806 - weighted_f1_score: 0.9257
* Test Loss: 0.1939
* Test Accuracy: 0.9712
* Test Precision: 0.9794
* Test Recall: 0.9276
* Test AUC: 0.9276
* Test F1 Score: 0.9276
