In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import pandas as pd


In [4]:

df = pd.read_json('../dataset/News_Category_Dataset_v3.json', lines = True)
df.head()
X = df['short_description']
y = df['category']



In [5]:
# Tokenize and pad sequences
max_words = 10000  # Maximum number of words in the vocabulary
max_sequence_length = 200  # Maximum length of input sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)



In [6]:
# One-hot encode labels
label_binarizer = LabelBinarizer()
y_encoded = label_binarizer.fit_transform(y)

# Split the dataset into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)



In [7]:
# Build the CNN model
model = Sequential()
model.add(Embedding(max_words, 100, input_length=max_sequence_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_encoded.shape[1], activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])



In [8]:
# Train the model
batch_size = 32
epochs = 10

history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Evaluate the model
accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy:", accuracy[1])

Test Accuracy: 0.3755309581756592
