In [19]:
# Step 1: Load the necessary libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import mixed_precision

In [20]:
# Step 2: Check for GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [21]:
# Enable mixed precision training (if supported by your GPU)
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [22]:
# Step 3: Load the dataset (assuming it's saved as a JSON file)
df = pd.read_json('../../dataset/News_Category_Dataset_v3.json',lines=True)

In [23]:
# Step 4: Keep both 'headline' and 'short_description' columns
df = df[['headline', 'short_description', 'category']]

In [24]:
# Step 5: Find the top 10 most frequent categories
top_categories = df['category'].value_counts().head(10).index

In [25]:
# Step 6: Filter the dataset to include only the top 10 frequent categories
df_filtered = df[df['category'].isin(top_categories)]

In [26]:
# Step 7: Combine 'headline' and 'short_description' into one text column
df_filtered['text'] = df_filtered['headline'] + " " + df_filtered['short_description']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['text'] = df_filtered['headline'] + " " + df_filtered['short_description']


In [27]:
# Step 8: Initialize the Sentence-Transformer model for sentence embeddings and move to GPU
model = SentenceTransformer('all-MiniLM-L6-v2')  # Use GPU if available ( device='cuda' )

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [28]:
# Step 9: Generate sentence embeddings for each row in the 'text' column
X = model.encode(df_filtered['text'].tolist(), convert_to_tensor=False)  # List of sentence embeddings

In [29]:
# Step 10: Encode the target labels ('category') using LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(df_filtered['category'])
y = to_categorical(y)  # Convert labels to one-hot encoded format

In [30]:
# Step 11: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Step 12: Build the Keras neural network model
model_nn = Sequential()

In [32]:
# Add input layer (Dense layer with 512 units)
model_nn.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))

# Add dropout layer for regularization
model_nn.add(Dropout(0.5))

# Add hidden layer
model_nn.add(Dense(256, activation='relu'))

# Add dropout layer for regularization
model_nn.add(Dropout(0.5))

# Add output layer with softmax activation (for multi-class classification)
model_nn.add(Dense(y_train.shape[1], activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [33]:
# Step 13: Compile the model
model_nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [34]:
# Step 14: Train the model
history = model_nn.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.7049 - loss: 0.9097 - val_accuracy: 0.7992 - val_loss: 0.5907
Epoch 2/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.7990 - loss: 0.6114 - val_accuracy: 0.8092 - val_loss: 0.5662
Epoch 3/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.8114 - loss: 0.5724 - val_accuracy: 0.8127 - val_loss: 0.5526
Epoch 4/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.8217 - loss: 0.5397 - val_accuracy: 0.8189 - val_loss: 0.5343
Epoch 5/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.8293 - loss: 0.5144 - val_accuracy: 0.8207 - val_loss: 0.5300
Epoch 6/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.8373 - loss: 0.4927 - val_accuracy: 0.8217 - val_loss: 0.5278
Epoch

In [35]:
# Step 15: Evaluate the model
loss, accuracy = model_nn.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m780/780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8210 - loss: 0.5451
Test Loss: 0.5313742160797119
Test Accuracy: 0.825707197189331
