In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the dataset
df = pd.read_json('../../dataset/News_Category_Dataset_v3.json', lines=True)

In [4]:
# Combine 'headline' and 'short_description' into a single text field
df['text'] = df['headline'].astype(str) + ' ' + df['short_description'].astype(str)

In [5]:
# Encode the categories
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
num_classes = len(label_encoder.classes_)

In [6]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)  # Limit features to manage memory
X = vectorizer.fit_transform(df['text']).toarray()

In [7]:
# Prepare labels for classification
y = to_categorical(df['label'], num_classes=num_classes)

In [8]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
# Build the neural network model
model = Sequential([
    Dense(256, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])