In [1]:
# ❶ Imports
# ----------------------------------------------------------------------
from __future__ import annotations
import os, argparse, logging, pathlib, string
from datetime import datetime
from typing import List

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

from bokeh.plotting import figure, save as bokeh_save, output_file
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, ColorBar, LinearColorMapper

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# ----------------------------------------------------------------------
# ❷ Config  (edit the two CSV paths to suit your machine) 
# ----------------------------------------------------------------------
# Input CSVs. These are machine-specific absolute paths; edit them, or pass
# other paths to main(), when running elsewhere.
DATA_FAKE_PATH = "/Users/abhijitsinha/Desktop/Fake News Detection/Data/Fake.csv"   # <- change if needed
DATA_TRUE_PATH = "/Users/abhijitsinha/Desktop/Fake News Detection/Data/True.csv"   # <- change if needed

# Tokenizer `num_words` / embedding input size, and the padded sequence length.
VOCAB_SIZE, MAX_SEQUENCE_LENGTH  = 40_000, 300
# Embedding width, LSTM units per direction, and dropout rate.
EMBED_DIM,  LSTM_UNITS,  DROPOUT = 128, 128, 0.30
BATCH_SIZE, EPOCHS               = 256, 15
LEARNING_RATE                    = 2e-4     # Adam learning rate
# TEST_SPLIT is taken from the full dataset; VALID_SPLIT is then taken from
# the remaining (non-test) rows.
VALID_SPLIT, TEST_SPLIT          = 0.10, 0.15
# Every output (plots, TensorBoard logs, model, tokenizer) is written here.
ARTIFACT_DIR                     = pathlib.Path("artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)

# The stop-word corpus is needed at import time, i.e. before main() gets a
# chance to download it. On a machine without the corpus NLTK raises
# LookupError, so fetch it once and retry instead of failing on import.
try:
    STOPWORDS = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    STOPWORDS = set(stopwords.words("english"))

# ----------------------------------------------------------------------
# ❸ Logging & TensorFlow one-time setup
# ----------------------------------------------------------------------
# Root logger: timestamped INFO-level lines for the whole run.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Ask TensorFlow's C++ layer to hide its INFO/WARNING output.
# NOTE(review): TensorFlow is already imported at this point, so this setting
# probably comes too late to take effect -- confirm, and if so set it before
# `import tensorflow`.
os.environ.update(TF_CPP_MIN_LOG_LEVEL="2")

# Let each visible GPU grow its memory allocation on demand. This is
# best-effort: a failure is logged and the run continues.
for device in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(device, True)
    except Exception as err:
        logging.warning("Could not set GPU memory-growth: %s", err)

# ----------------------------------------------------------------------
# ❹ Data utilities
# ----------------------------------------------------------------------
def load_and_combine(fake_csv: str, true_csv: str) -> pd.DataFrame:
    """Read both CSV files, label them and return one shuffled frame.

    Args:
        fake_csv: Path of the CSV whose rows get ``target = 0``.
        true_csv: Path of the CSV whose rows get ``target = 1``.

    Returns:
        The concatenation of both files with an added ``target`` column,
        shuffled with a fixed seed (42) and re-indexed from 0.
    """
    labelled = []
    for csv_path, label in ((fake_csv, 0), (true_csv, 1)):
        frame = pd.read_csv(csv_path)
        frame["target"] = label
        labelled.append(frame)
    fake_frame, true_frame = labelled

    combined = pd.concat(labelled, ignore_index=True)
    shuffled = combined.sample(frac=1.0, random_state=42)
    shuffled = shuffled.reset_index(drop=True)

    logging.info("Dataset loaded → %d rows (fake=%d  real=%d)",
                 len(shuffled), len(fake_frame), len(true_frame))
    return shuffled

# Translation table that deletes every ASCII punctuation character. It is
# built once here because _clean() runs once per article.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def _clean(text: str) -> str:
    """Lower-case ``text``, strip ASCII punctuation and drop stop-words.

    Args:
        text: Raw article text.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    # NOTE(review): punctuation is removed *before* the stop-word check, so
    # any STOPWORDS entry that itself contains an apostrophe can never match
    # here -- confirm whether that is intended.
    tokens = text.lower().translate(_PUNCT_TABLE).split()
    return " ".join(tok for tok in tokens if tok not in STOPWORDS)

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``text`` column has been cleaned.

    Args:
        df: Frame containing a ``text`` column.

    Returns:
        A new frame; the input is not modified. Every ``text`` value is passed
        through ``_clean``. Missing values become empty strings.

    Raises:
        ValueError: If ``df`` has no ``text`` column.
    """
    if "text" not in df.columns:
        raise ValueError("Column 'text' not found in dataset.")
    cleaned = df.copy()
    raw_text = cleaned["text"]
    # astype(str) would turn a missing value into the literal word "nan",
    # which would then be learned as a real token. Remember where the gaps
    # are and blank them out after cleaning instead.
    missing = raw_text.isna()
    cleaned["text"] = raw_text.astype(str).apply(_clean).mask(missing, "")
    return cleaned

# ----------------------------------------------------------------------
# ❺ Tokenisation helpers
# ----------------------------------------------------------------------
def build_tokenizer(texts: pd.Series) -> tf.keras.preprocessing.text.Tokenizer:
    """Create a Keras ``Tokenizer`` and fit it on ``texts``.

    The tokenizer is configured with ``num_words=VOCAB_SIZE`` and uses
    ``"<UNK>"`` as its out-of-vocabulary token.

    Args:
        texts: The (already cleaned) documents to fit on.

    Returns:
        The fitted tokenizer.
    """
    tokenizer_cls = tf.keras.preprocessing.text.Tokenizer
    tokenizer = tokenizer_cls(oov_token="<UNK>", num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(texts)
    return tokenizer

def vectorise(tok, texts: pd.Series) -> np.ndarray:
    """Turn ``texts`` into fixed-length integer sequences.

    Args:
        tok: A fitted tokenizer exposing ``texts_to_sequences``.
        texts: Documents to convert.

    Returns:
        The result of ``pad_sequences``: every sequence brought to
        ``MAX_SEQUENCE_LENGTH`` entries, with padding added at the end.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    token_ids = tok.texts_to_sequences(texts)
    return pad_sequences(token_ids, padding="post", maxlen=MAX_SEQUENCE_LENGTH)

# ----------------------------------------------------------------------
# ❻ Model
# ----------------------------------------------------------------------
def make_model(vocab_size: int) -> tf.keras.Model:
    """Build and compile the bidirectional-LSTM binary classifier.

    Layer stack: Embedding (index 0 masked) -> Dropout -> Bidirectional LSTM
    -> LayerNormalization -> Dropout -> single sigmoid unit.

    Args:
        vocab_size: Input dimension of the embedding layer.

    Returns:
        A model compiled with Adam (``LEARNING_RATE``), binary cross-entropy
        loss, and accuracy / precision / recall metrics.
    """
    token_ids = layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # Layers are created in the same order in which they are applied.
    stack = [
        layers.Embedding(vocab_size, EMBED_DIM, mask_zero=True),
        layers.Dropout(DROPOUT),
        layers.Bidirectional(layers.LSTM(LSTM_UNITS)),
        layers.LayerNormalization(),
        layers.Dropout(DROPOUT),
        layers.Dense(1, activation="sigmoid"),
    ]
    tensor = token_ids
    for layer in stack:
        tensor = layer(tensor)

    classifier = models.Model(token_ids, tensor)
    tracked_metrics = [
        "accuracy",
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
    ]
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
        metrics=tracked_metrics,
    )
    return classifier

# ----------------------------------------------------------------------
# ❼ Custom callback for F1
# ----------------------------------------------------------------------
class F1Callback(callbacks.Callback):
    """Compute validation precision / recall / F1 at the end of every epoch.

    The per-epoch F1 values are collected in ``self.f1``. When Keras supplies
    a ``logs`` dict, the scores are also written into it under
    ``val_precision``, ``val_recall`` and ``val_f1`` (replacing any values
    already stored under those keys).
    """

    def __init__(self, val_xy):
        """Store the validation data.

        Args:
            val_xy: ``(features, labels)`` pair used for scoring.
        """
        super().__init__()
        self.val_x, self.val_y = val_xy
        self.f1 = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data and record the results.

        Args:
            epoch: Zero-based epoch index.
            logs: Metrics dict to extend in place; may be ``None``.
        """
        scores = self.model.predict(self.val_x, verbose=0)
        # Threshold at 0.5 and flatten to 1-D label predictions.
        preds = (scores > .5).astype(int).ravel()
        p, r, f1, _ = precision_recall_fscore_support(
            self.val_y, preds, average="binary", zero_division=0)
        self.f1.append(f1)
        # `logs` defaults to None; only an existing dict can be updated in
        # place (the previous `logs |= {}` raised TypeError on None).
        if logs is not None:
            logs.update(val_precision=p, val_recall=r, val_f1=f1)
        logging.info("Epoch %-2d  val_F1=%.4f", epoch+1, f1)

# ----------------------------------------------------------------------
# ❽ Bokeh dashboards
# ----------------------------------------------------------------------
def save_training_dashboard(hist: callbacks.History, f1: List[float], out: pathlib.Path):
    """Write a three-panel Bokeh HTML page with the training curves.

    Panels, left to right: loss (train / val), accuracy (train / val) and F1.

    Args:
        hist: History object; its ``history`` dict must hold ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy``.
        f1: One F1 value per epoch.
        out: Destination HTML file.
    """
    history = hist.history
    n_epochs = len(history["loss"])
    src = ColumnDataSource(dict(
        epoch=list(range(1, n_epochs + 1)),
        loss=history["loss"],
        val_loss=history["val_loss"],
        acc=history["accuracy"],
        val_acc=history["val_accuracy"],
        f1=f1,
    ))

    # (title, y-axis label, [(data column, extra line keyword arguments), ...])
    panel_specs = [
        ("Loss", "BCE", [("loss", dict(legend_label="train")),
                         ("val_loss", dict(color="green", legend_label="val"))]),
        ("Accuracy", "Acc", [("acc", dict(legend_label="train")),
                             ("val_acc", dict(color="green", legend_label="val"))]),
        ("F1", "F1", [("f1", dict(color="red"))]),
    ]

    panels = []
    for title, y_label, curves in panel_specs:
        fig = figure(title=title, x_axis_label="Epoch",
                     y_axis_label=y_label, width=400, height=300)
        for column, line_kwargs in curves:
            fig.line("epoch", column, source=src, **line_kwargs)
        panels.append(fig)

    dashboard = gridplot([panels])
    output_file(out)
    bokeh_save(dashboard)
    logging.info("Dashboard → %s", out)

def save_confusion(cm: np.ndarray, out: pathlib.Path):
    """Write a 2x2 confusion-matrix heat-map as a Bokeh HTML page.

    Args:
        cm: 2x2 count matrix. Assumes rows are actual classes and columns are
            predicted classes, both ordered (Fake, Real) -- the layout
            scikit-learn's ``confusion_matrix`` produces for labels 0 and 1.
        out: Destination HTML file.

    Raises:
        ValueError: If ``cm`` is not 2x2.
    """
    cats = ["Fake", "Real"]
    cm = np.asarray(cm)
    if cm.shape != (2, 2):
        raise ValueError(f"Expected a 2x2 confusion matrix, got shape {cm.shape}.")

    mapper = LinearColorMapper(palette="Viridis256", low=0, high=int(cm.max()))
    # Both axes are categorical, so each cell must be addressed by category
    # name. Plain integer coordinates are offsets into the categorical range
    # and would draw every cell half a cell off and on the wrong label.
    # Cell order matches cm.ravel(): row-major, i = actual, j = predicted.
    cells = [(i, j) for i in range(2) for j in range(2)]
    src = ColumnDataSource(dict(
        x=[cats[j] for _, j in cells],
        y=[cats[i] for i, _ in cells],
        val=cm.ravel(),
    ))
    p = figure(x_range=cats, y_range=list(reversed(cats)), width=350, height=350,
               toolbar_location=None, title="Confusion Matrix",
               x_axis_label="Predicted", y_axis_label="Actual")
    p.rect(x="x", y="y", width=1, height=1, source=src,
           fill_color={'field': 'val', 'transform': mapper}, line_color=None)
    p.add_layout(ColorBar(color_mapper=mapper), "right")
    p.xaxis.major_label_orientation = "vertical"
    output_file(out)
    bokeh_save(p)
    logging.info("Confusion matrix → %s", out)

# ----------------------------------------------------------------------
# ❾ Word-clouds
# ----------------------------------------------------------------------
def wordcloud(text: str, title: str, out: pathlib.Path):
    """Render a word-cloud of ``text`` and save it as an image.

    Args:
        text: Space-separated words to visualise.
        title: Title drawn above the cloud.
        out: Destination image file.
    """
    cloud = WordCloud(
        collocations=False,
        max_font_size=140,
        width=1600,
        height=900,
    ).generate(text)

    plt.figure(figsize=(14, 8))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out, dpi=140)
    plt.close()

    logging.info("Word-cloud → %s", out)

# ----------------------------------------------------------------------
# ❿ Main pipeline
# ----------------------------------------------------------------------
def main(fake_csv=DATA_FAKE_PATH, true_csv=DATA_TRUE_PATH):
    """Run the whole pipeline: load, clean, split, train, evaluate, save.

    Args:
        fake_csv: CSV of fake articles (labelled 0).
        true_csv: CSV of real articles (labelled 1).

    Side effects:
        Writes TensorBoard logs, the Bokeh dashboards, two word-cloud images,
        the trained model and the tokenizer JSON under ``ARTIFACT_DIR``.
    """
    nltk.download("stopwords", quiet=True)

    # Stratified split: first carve out the test set, then take the
    # validation set from what remains.
    df    = preprocess(load_and_combine(fake_csv, true_csv))
    train_val_x, X_test, train_val_y, y_test = train_test_split(
        df["text"], df["target"], test_size=TEST_SPLIT,
        stratify=df["target"], random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        train_val_x, train_val_y, test_size=VALID_SPLIT,
        stratify=train_val_y, random_state=42)

    # The tokenizer is fitted on the training texts only.
    tok = build_tokenizer(X_train)
    X_train_pad, X_val_pad, X_test_pad = map(
        lambda x: vectorise(tok, x), [X_train, X_val, X_test])

    train_ds = (tf.data.Dataset.from_tensor_slices((X_train_pad, y_train))
                  .shuffle(1024).batch(BATCH_SIZE).prefetch(2))
    val_ds   = (tf.data.Dataset.from_tensor_slices((X_val_pad, y_val))
                  .batch(BATCH_SIZE).cache().prefetch(2))
    test_ds  = (tf.data.Dataset.from_tensor_slices((X_test_pad, y_test))
                  .batch(BATCH_SIZE))

    model = make_model(VOCAB_SIZE)
    cb_early = callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    cb_tensor= callbacks.TensorBoard(log_dir=ARTIFACT_DIR/"logs"/
                                     datetime.now().strftime("%Y%m%d-%H%M%S"))
    cb_f1    = F1Callback((X_val_pad, y_val))

    # F1Callback goes first so the val_f1 it adds to the epoch logs is
    # already present when the later callbacks (TensorBoard) process them.
    hist = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS,
                     callbacks=[cb_f1, cb_early, cb_tensor], verbose=2)

    # Evaluation
    loss, acc, prec, rec = model.evaluate(test_ds, verbose=0)
    f1 = 2*(prec*rec)/(prec+rec+1e-12)      # epsilon guards against 0/0
    logging.info("TEST  loss=%.4f  acc=%.4f  prec=%.4f  rec=%.4f  f1=%.4f",
                 loss, acc, prec, rec, f1)

    # Predictions & confusion matrix
    y_pred = (model.predict(test_ds, verbose=0) > .5).astype(int).ravel()
    # Fixing the label set keeps the matrix 2x2 even if one class never
    # occurs, which is the shape save_confusion() draws.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # ------------------------------------------------------------------
    # Save artifacts
    # ------------------------------------------------------------------
    p_dir = ARTIFACT_DIR / "plots"
    p_dir.mkdir(parents=True, exist_ok=True)
    save_training_dashboard(hist, cb_f1.f1, p_dir/"training_dashboard.html")
    save_confusion(cm, p_dir/"confusion_matrix.html")
    wordcloud(" ".join(df[df.target==0].text), "Fake News", p_dir/"wc_fake.png")
    wordcloud(" ".join(df[df.target==1].text), "Real News", p_dir/"wc_real.png")

    model.save(ARTIFACT_DIR/"fakenews_lstm.keras", include_optimizer=False)
    # Explicit encoding so the file does not depend on the platform default.
    with open(ARTIFACT_DIR/"tokenizer.json", "w", encoding="utf-8") as fp:
        fp.write(tok.to_json())
    logging.info("All artifacts saved under %s", ARTIFACT_DIR.resolve())

# ----------------------------------------------------------------------
# ⓫ Run immediately when the cell executes
# ----------------------------------------------------------------------
# Run only when executed directly (a notebook cell or `python file.py` both
# have __name__ == "__main__"); importing the module must not start training.
if __name__ == "__main__":
    main()

2025-04-23 02:40:06.251200: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-23 02:40:12 [INFO] Dataset loaded → 44898 rows (fake=23481  real=21417)


Epoch 1/15


2025-04-23 02:44:30 [INFO] Epoch 1   val_F1=0.9900


135/135 - 238s - 2s/step - accuracy: 0.9464 - loss: 0.1425 - precision: 0.9549 - recall: 0.9316 - val_accuracy: 0.9906 - val_loss: 0.0395 - val_precision: 0.9967 - val_recall: 0.9835 - val_f1: 0.9900
Epoch 2/15


2025-04-23 02:48:27 [INFO] Epoch 2   val_F1=0.9956


135/135 - 237s - 2s/step - accuracy: 0.9970 - loss: 0.0177 - precision: 0.9968 - recall: 0.9969 - val_accuracy: 0.9958 - val_loss: 0.0234 - val_precision: 0.9972 - val_recall: 0.9940 - val_f1: 0.9956
Epoch 3/15


2025-04-23 02:52:09 [INFO] Epoch 3   val_F1=0.9928


135/135 - 222s - 2s/step - accuracy: 0.9994 - loss: 0.0091 - precision: 0.9995 - recall: 0.9992 - val_accuracy: 0.9932 - val_loss: 0.0327 - val_precision: 0.9983 - val_recall: 0.9874 - val_f1: 0.9928
Epoch 4/15


2025-04-23 02:55:47 [INFO] Epoch 4   val_F1=0.9956


135/135 - 219s - 2s/step - accuracy: 0.9993 - loss: 0.0077 - precision: 0.9995 - recall: 0.9990 - val_accuracy: 0.9958 - val_loss: 0.0254 - val_precision: 0.9972 - val_recall: 0.9940 - val_f1: 0.9956
Epoch 5/15


2025-04-23 02:59:27 [INFO] Epoch 5   val_F1=0.9962


135/135 - 219s - 2s/step - accuracy: 0.9999 - loss: 0.0045 - precision: 0.9999 - recall: 0.9999 - val_accuracy: 0.9963 - val_loss: 0.0228 - val_precision: 0.9972 - val_recall: 0.9951 - val_f1: 0.9962
Epoch 6/15


2025-04-23 03:03:07 [INFO] Epoch 6   val_F1=0.9964


135/135 - 221s - 2s/step - accuracy: 1.0000 - loss: 0.0034 - precision: 1.0000 - recall: 0.9999 - val_accuracy: 0.9966 - val_loss: 0.0211 - val_precision: 0.9972 - val_recall: 0.9956 - val_f1: 0.9964
Epoch 7/15


2025-04-23 03:06:47 [INFO] Epoch 7   val_F1=0.9959


135/135 - 220s - 2s/step - accuracy: 1.0000 - loss: 0.0026 - precision: 1.0000 - recall: 0.9999 - val_accuracy: 0.9961 - val_loss: 0.0243 - val_precision: 0.9972 - val_recall: 0.9945 - val_f1: 0.9959
Epoch 8/15


2025-04-23 03:10:41 [INFO] Epoch 8   val_F1=0.9956


135/135 - 234s - 2s/step - accuracy: 1.0000 - loss: 0.0021 - precision: 1.0000 - recall: 0.9999 - val_accuracy: 0.9958 - val_loss: 0.0239 - val_precision: 0.9972 - val_recall: 0.9940 - val_f1: 0.9956
Epoch 9/15


2025-04-23 03:14:22 [INFO] Epoch 9   val_F1=0.9953


135/135 - 221s - 2s/step - accuracy: 1.0000 - loss: 0.0017 - precision: 1.0000 - recall: 0.9999 - val_accuracy: 0.9955 - val_loss: 0.0249 - val_precision: 0.9967 - val_recall: 0.9940 - val_f1: 0.9953


2025-04-23 03:14:33 [INFO] TEST  loss=0.0167  acc=0.9963  prec=0.9972  rec=0.9950  f1=0.9961
2025-04-23 03:14:44 [INFO] Dashboard → artifacts/plots/training_dashboard.html
2025-04-23 03:14:45 [INFO] Confusion matrix → artifacts/plots/confusion_matrix.html
2025-04-23 03:14:52 [INFO] Word-cloud → artifacts/plots/wc_fake.png
2025-04-23 03:14:58 [INFO] Word-cloud → artifacts/plots/wc_real.png
2025-04-23 03:14:59 [INFO] All artifacts saved under /Users/abhijitsinha/Desktop/Fake News Detection/Notebook/artifacts
