# models.get_models.py

In [6]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
    GRU,
    Input,
    GlobalAveragePooling1D,
    MultiHeadAttention,
    LayerNormalization,
    Add,
)


def get_machine_learning_models():
    return {
        "Logistic Regression": LogisticRegression(
            max_iter=1000, multi_class="multinomial", solver="lbfgs"
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=200, class_weight="balanced"
        ),
        "Naive Bayes": MultinomialNB(),
        "XGBoost": XGBClassifier(objective="multi:softmax", use_label_encoder=False),
    }


def get_deep_learning_models(
    vocab_size=20000, max_len=500, embed_dim=128, num_heads=4, num_classes=None
):
    """
    Returns a dictionary of deep learning models with their names as keys.
    """

    if num_classes is None:
        raise ValueError("num_classes must be specified for deep learning models")


    models = {}

    # 1. Simple Feedforward
    models["Simple Feedforward"] = Sequential(
        [
            Dense(256, activation="relu", input_shape=(max_len,)),
            Dropout(0.3),
            Dense(128, activation="relu"),
            Dropout(0.3),
            Dense(num_classes, activation="softmax"),
        ]
    )

    # 2. LSTM Model
    lstm_input = Input(shape=(max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(lstm_input)
    x = LSTM(128, return_sequences=True)(x)
    x = LSTM(64)(x)
    x = Dropout(0.3)(x)
    output = Dense(num_classes, activation="softmax")(x)
    models["Stacked LSTM"] = Model(inputs=lstm_input, outputs=output)

    # 3. GRU Model
    gru_input = Input(shape=(max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(gru_input)
    x = GRU(64)(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation="sigmoid")(x)
    models["GRU"] = Model(inputs=gru_input, outputs=output)

    # 4. Transformer-like Model (simple attention block)
    trans_input = Input(shape=(max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(trans_input)
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
    x = Add()([x, attn_output])
    x = LayerNormalization()(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.3)(x)
    output = Dense(num_classes, activation="softmax")(x)
    models["Transformer"] = Model(inputs=trans_input, outputs=output)

    return models


# tests/evaluate_models.py

In [7]:
import numpy as np
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Model


def evaluate_model(model, X_seq_test, y_test):
    """
    Evaluates the given model on the test data and returns the accuracy.
    """
    model_name = model.name if isinstance(model, Model) else model.__class__.__name__
    print(f"\nEvaluating model: {model_name}")
    predictions = model.predict(X_seq_test)
    if len(predictions.shape) > 1 and predictions.shape[1] > 1:
        # For multi-class classification, take the class with the highest probability
        predictions = np.argmax(predictions, axis=1)
    else:
        # For binary classification, predictions are already in the correct format
        predictions = (predictions > 0.5).astype(int)

    test_labels = np.argmax(y_test, axis=1) if len(y_test.shape) > 1 else y_test

    accuracy = accuracy_score(test_labels, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    return accuracy


# src.train_models.py

In [8]:
import tensorflow as tf


def train_and_evaluate_models(
    X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test, y_train, y_test, num_classes
):
    """
    Trains and returns a dictionary of machine learning and deep learning models.
    """

    ml_models = get_machine_learning_models()
    dl_models = get_deep_learning_models(num_classes=num_classes)
    accuracy_results = {}

    # Train machine learning models on TF-IDF features
    for name, model in ml_models.items():
        print(f"\nTraining {name} model...")
        model.fit(X_tfidf_train, y_train)
        print(f"{name} model training completed.")
        print(f"Training Accuracy: {model.score(X_tfidf_train, y_train):.4f}")

        # Evaluate training accuracy
        accuracy = model.score(X_tfidf_test, y_test)
        print(f"{name} Evaluation Accuracy: {accuracy:.4f}")
        accuracy_results[name] = accuracy

    y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    # Train deep learning models on sequence data
    for name, model in dl_models.items():
        print(f"\nTraining {name} model...")
        model.compile(
            optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
        )
        # if the saved_models directory does not exist, create it
        tf.io.gfile.makedirs("./saved_models")

        # Use ModelCheckpoint to save the best model during training
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=f"./saved_models/{name}_best_model.h5",
            save_best_only=True,
            monitor="val_accuracy",
            mode="max",
        )

        # Add verbose output for deep learning models
        history = model.fit(
            X_seq_train,
            y_train_onehot,
            validation_split=0.1,
            epochs=10,
            callbacks=[checkpoint_callback],
            batch_size=32,
            verbose=1,  # Set verbose=1 to print progress for each epoch
        )
        print(f"{name} model training completed.")
        # Print training accuracy for each epoch
        for epoch, acc in enumerate(history.history["accuracy"], 1):
            print(f"Epoch {epoch}: Training Accuracy = {acc:.4f}")

        # Evaluate training accuracy
        accuracy = evaluate_model(model, X_seq_test, y_test)
        print(f"{name} Evaluation Accuracy: {accuracy:.4f}")
        accuracy_results[name] = accuracy

    print("Evaluation Accuracy Results:", accuracy_results)

    return {**ml_models, **dl_models}

# src.preprocess_data.py

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd


def preprocess_data(df):
    df = df.dropna().drop_duplicates()

    feature_cols = ["link", "headline", "short_description", "authors", "date"]
    X = df[feature_cols].astype(str).agg(" ".join, axis=1)
    X = clean_text(X)

    category_to_idx = {k: i for i, k in enumerate(df["category"].unique())}
    y = df["category"].map(category_to_idx).astype("float32")

    # Get both representations
    X_tfidf = get_embeddings(X, method="tfidf")
    X_seq = get_embeddings(X, method="sequence")

    # Split both
    X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(
        X_tfidf, y, test_size=0.2, random_state=42
    )
    X_seq_train, X_seq_test, _, _ = train_test_split(
        X_seq, y, test_size=0.2, random_state=42
    )

    print("Data split into training and testing sets successfully.")
    return X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test, y_train, y_test


def clean_text(X):
    X = X.str.lower()
    X = X.str.replace(r"http\S+|www\S+|https\S+", "", regex=True)
    X = X.str.replace(r"\@\w+|\#", "", regex=True)
    # Keep numbers and some punctuation
    X = X.str.replace(r"[^a-zA-Z0-9\s\.\?\!]", "", regex=True)
    X = X.str.replace(r"\s+", " ", regex=True)
    return X.str.strip()


def get_embeddings(X, method="tfidf", max_features=50000, max_len=500):
    if method == "tfidf":
        vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1, 2),  # Use unigrams and bigrams
            stop_words="english",
            min_df=5,
            max_df=0.7,
        )
        return vectorizer.fit_transform(X)

    elif method == "sequence":
        tokenizer = Tokenizer(
            num_words=max_features,
            oov_token="<OOV>",
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        )
        tokenizer.fit_on_texts(X)
        sequences = tokenizer.texts_to_sequences(X)
        return pad_sequences(
            sequences, maxlen=max_len, padding="post", truncating="post"
        )

# src.main

In [10]:
import json
import pandas as pd


def main():
    print("*" * 20, "Main", "*" * 20)
    print("\nStep 1: Loading dataset...")
    with open("/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json", "r") as file:
        data = [json.loads(line) for line in file]

    # convert to pandas DataFrame
    data = pd.DataFrame(data)

    # data = data.head(100) # For testing

    print("Data loaded successfully.")

    # Step 2: Preprocess text
    print("\nStep 2: Preprocessing data...")
    X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test, y_train, y_test = (
        preprocess_data(data)
    )
    print("Data preprocessed successfully.")

    # Step 3: Train models
    print("\nStep 3: Training and Evaluating models...")
    models = train_and_evaluate_models(
        X_tfidf_train, X_tfidf_test, X_seq_train, X_seq_test, y_train, y_test, num_classes=data["category"].nunique()
    )
    print("Models Trained and Evaluated successfully.")
    print("*" * 20, "Return", "*" * 20)


if __name__ == "__main__":
    main()

******************** Main ********************

Step 1: Loading dataset...
Data loaded successfully.

Step 2: Preprocessing data...
Data split into training and testing sets successfully.
Data preprocessed successfully.

Step 3: Training and Evaluating models...

Training Logistic Regression model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Logistic Regression model training completed.
Training Accuracy: 0.8272
Logistic Regression Evaluation Accuracy: 0.7379

Training Random Forest model...
Random Forest model training completed.
Training Accuracy: 1.0000
Random Forest Evaluation Accuracy: 0.6759

Training Naive Bayes model...
Naive Bayes model training completed.
Training Accuracy: 0.6296
Naive Bayes Evaluation Accuracy: 0.5903

Training XGBoost model...
XGBoost model training completed.
Training Accuracy: 0.8021
XGBoost Evaluation Accuracy: 0.6960

Training Simple Feedforward model...
Epoch 1/10
[1m4715/4715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.1533 - loss: 159.1739 - val_accuracy: 0.1714 - val_loss: 3.2882
Epoch 2/10
[1m4715/4715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.1711 - loss: 3.3031 - val_accuracy: 0.1714 - val_loss: 3.2875
Epoch 3/10
[1m4715/4715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.1711 - l