In [10]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import mixed_precision

In [11]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [12]:
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [15]:
df = pd.read_json('../../dataset/News_Category_Dataset_v3.json', lines=True)
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [16]:
df = df[['headline', 'short_description', 'category']]

In [17]:
top_categories = df['category'].value_counts().head(10).index

In [18]:
df_filtered = df[df['category'].isin(top_categories)]

In [20]:
df_filtered['text'] = df_filtered['headline'] + " " + df_filtered['short_description']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['text'] = df_filtered['headline'] + " " + df_filtered['short_description']


In [22]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [25]:
X = model.encode(df_filtered['text'].tolist(), convert_to_tensor=False)
encoder = LabelEncoder()
y = encoder.fit_transform(df_filtered['category'])
y = to_categorical(y)  # Convert labels to one-hot encoded format

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
model_nn = Sequential()

In [29]:
# Add input layer (Dense layer with 512 units)
model_nn.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))

# Add dropout layer for regularization
model_nn.add(Dropout(0.5))

# Add hidden layer
model_nn.add(Dense(256, activation='relu'))

# Add dropout layer for regularization
model_nn.add(Dropout(0.5))

# Add output layer with softmax activation (for multi-class classification)
model_nn.add(Dense(y_train.shape[1], activation='softmax'))

In [30]:
model_nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
history = model_nn.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step - accuracy: 0.7120 - loss: 0.8924 - val_accuracy: 0.8049 - val_loss: 0.5825
Epoch 2/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8159 - loss: 0.5740 - val_accuracy: 0.8154 - val_loss: 0.5584
Epoch 3/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 27ms/step - accuracy: 0.8401 - loss: 0.4913 - val_accuracy: 0.8199 - val_loss: 0.5433
Epoch 4/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 30ms/step - accuracy: 0.8598 - loss: 0.4238 - val_accuracy: 0.8170 - val_loss: 0.5614
Epoch 5/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 27ms/step - accuracy: 0.8804 - loss: 0.3537 - val_accuracy: 0.8182 - val_loss: 0.5941
Epoch 6/10
[1m1560/1560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 31ms/step - accuracy: 0.8988 - loss: 0.2922 - val_accuracy: 0.8145 - val_loss: 0.6503
Epoc

In [32]:
loss, accuracy = model_nn.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m780/780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7994 - loss: 0.8818
Test Loss: 0.8719780445098877
Test Accuracy: 0.8037502765655518
