### Reuters Dense TF V1


The code in this file processes the Reuters data (single-label, multi-class classification). The setup is:

1. The data comes from 10,000 articles shared by Reuters.
2. Each article contains [0...1000] words
3. The Kaggle folks pre-processed the articles so that each article is a 1D array that contains numbers in the
    range(0...10000) corresponding to the words most frequently found in the articles. 
4. The labels are also a 1D array that contains numbers corresponding to the 46 categories of the articles.
5. The pre-processing here is similar to the pre-processing done to the imdb data
6. The model here illustrates the case in which the final layer has more than one output class;
   in this case the number of classes (topics) is 46.
7. As was the case with the imdb setup, the model here also overfits.

    after 20 epochs
                TRAINING                    VALIDATION
        loss: 0.1097 - acc: 0.9582 - val_loss: 1.0746 - val_acc: 0.8010

In [1]:
# get APIs
from logging import logProcesses
import os, shutil
from platform import python_branch
from syslog import LOG_SYSLOG

import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
import keras
from keras import layers
from keras import models
from keras import optimizers
from keras.layers import Dropout
from keras.datasets import mnist
# from keras.utils import to_categorical
# from keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.platform import build_info as tf_build_info



2025-02-20 02:48:11.478953: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740019691.523905     686 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740019691.536696     686 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-20 02:48:11.634889: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Report the library versions, the TensorFlow install path and the GPU count
for report_label, report_value in (
    ("TF Version   ", tf.__version__),
    ("TF Path      ", tf.__path__[0]),
    ("Keras version ", keras.__version__),
    ("numpy version ", np.__version__),
    ("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))),
):
    print(report_label, report_value)


# print(tf_build_info.cuda_version_number)
# print(tf_build_info.cudnn_version_number)


TF Version    2.18.0
TF Path       /usr/local/lib/python3.11/site-packages/keras/api/_v2
Keras version  3.8.0
numpy version  2.0.2
Num GPUs Available:  1


In [3]:
# enumerate is the reason why this function is so short, enumerate is a very "pythonic" function
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer sequences.

    Args:
        sequences: Sized collection whose items are lists of column indices.
        dimension: Number of columns in the encoded matrix.

    Returns:
        A float array of shape ``(len(sequences), dimension)`` that holds 1.0
        at every column named by the matching sequence and 0.0 elsewhere.
    """
    encoded = np.zeros(shape=(len(sequences), dimension))
    for row_index, word_ids in enumerate(sequences):
        # Fancy indexing switches on all listed columns at once; repeated
        # indices simply set the same cell again.
        encoded[row_index, word_ids] = 1.0
    return encoded

def vectorize_labels(labels, dimension=46):
    """One-hot encode integer class labels.

    Args:
        labels: Sized collection of integer class ids.
        dimension: Number of classes, i.e. columns in the encoded matrix.

    Returns:
        A float array of shape ``(len(labels), dimension)`` with 1.0 in the
        column given by each label and 0.0 elsewhere.
    """
    one_hot = np.zeros(shape=(len(labels), dimension))
    for row_index, class_id in enumerate(labels):
        one_hot[row_index, class_id] = 1.0
    return one_hot

def print_review(item):
    """Print every element of ``item`` on its own line."""
    for element in item:
        print(element)


def decode_to_text(sequence=None):
    """Print the words of an encoded Reuters article.

    Relies on the notebook-level ``reuters`` dataset module (and on
    ``train_data`` when no sequence is passed), so the data-loading cell must
    have been run first.

    Args:
        sequence: Iterable of integer word indices. Defaults to the first
            training article, ``train_data[0]``.
    """
    if sequence is None:
        sequence = train_data[0]
    word_index = reuters.get_word_index()
    # Invert the word -> index mapping so that indices can be looked up.
    reverse_word_index = {index: word for word, index in word_index.items()}
    # load_data's default index_from=3 shifts real word indices by 3 (the low
    # indices are reserved), hence the -3; anything unmapped is shown as '?'.
    decoded_text = ' '.join(reverse_word_index.get(i - 3, '?') for i in sequence)
    print(decoded_text)

In [4]:
# Load the Reuters newswire dataset, keeping only the 10,000 most frequent words
reuters = tf.keras.datasets.reuters
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

# Peek at the first encoded article and its topic label in each split
print(train_data[0], train_labels[0], test_data[0], test_labels[0], sep="\n")

[1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
3
[1, 4, 1378, 2025, 9, 697, 4622, 111, 8, 25, 109, 29, 3650, 11, 150, 244, 364, 33, 30, 30, 1398, 333, 6, 2, 159, 9, 1084, 363, 13, 2, 71, 9, 2, 71, 117, 4, 225, 78, 206, 10, 9, 1214, 8, 4, 270, 5, 2, 7, 748, 48, 9, 2, 7, 207, 1451, 966, 1864, 793, 97, 133, 336, 7, 4, 493, 98, 273, 104, 284, 25, 39, 338, 22, 905, 220, 3465, 644, 59, 20, 6, 119, 61, 11, 15, 58, 579, 26, 10, 67, 7, 4, 738, 98, 43, 88, 333, 722, 12, 20, 6, 19, 746, 35, 15, 10, 9, 1214, 855, 129, 783, 21, 4, 2280, 244, 364, 51, 16, 299, 452, 16, 515, 4, 99, 29, 5, 4, 364, 281, 48, 10, 9, 1214, 23, 644, 47, 20, 324, 27, 56, 2, 2, 5, 192, 510, 17, 12]
3


In [None]:
# Multi-hot encode the articles of both splits
x_train, x_test = vectorize_sequences(train_data), vectorize_sequences(test_data)

# Integer topic ids as float32 arrays
y_train = np.array(train_labels, dtype='float32')
y_test = np.array(test_labels, dtype='float32')

# One-hot encoded topic ids
v_train_labels, v_test_labels = vectorize_labels(train_labels), vectorize_labels(test_labels)

# Show the first sample of every encoded array, one per line
print(
    x_train[0],
    x_test[0],
    y_train[0],
    y_test[0],
    v_train_labels[0],
    v_test_labels[0],
    sep="\n",
)

In [None]:
# Define the model and print its parameter counts.
# The input shape is declared with an explicit Input object: Keras 3 warns when
# `input_shape` is passed to a layer inside a Sequential model.
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(10000,)))  # width of the multi-hot vectors built by vectorize_sequences
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# 46 softmax outputs, one per topic
model.add(tf.keras.layers.Dense(46, activation='softmax'))

model.summary()


In [6]:
# Hold out the first 1,000 training samples for validation and train on the rest.
# The targets come from the one-hot labels (v_train_labels), not from y_train.
x_val, partial_x_train = x_train[:1000], x_train[1000:]
y_val, partial_y_train = v_train_labels[:1000], v_train_labels[1000:]

In [None]:
# Compile the model and train it for 20 epochs, validating on the hold-out set
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

# Keep the per-epoch loss curves and a 1-based epoch axis for plotting
history_dict = history.history
loss_values, val_loss_values = history_dict['loss'], history_dict['val_loss']
epochs = range(1, 1 + len(loss_values))


In [None]:
# Training loss as dots, validation loss as a solid line
plt.plot(epochs, loss_values, 'bo', label='Training Loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
# Title and axis labels set in one call on the current axes
plt.gca().set(title='training and validation loss', xlabel='Epochs', ylabel='Loss')
plt.legend()
plt.show()


In [None]:
# Plot training and validation accuracy
plt.clf()
acc = history_dict['acc']
val_acc = history_dict['val_acc']

# Training accuracy as dots, validation accuracy as a solid line
plt.plot(epochs, acc, 'bo', label='Training Acc')
plt.plot(epochs, val_acc, 'b', label='Validation Acc')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
# Finish with plt.show(), as the loss-plot cells do, so the figure is rendered
# without the Legend object's repr being echoed as the cell output.
plt.show()


In [None]:
# Train a second model on the same split; its curves feed the cells that follow.
# NOTE(review): the middle layer has only 4 units while there are 46 output
# classes. This looks like a deliberate bottleneck experiment; confirm it is
# not a typo for 64.
reuters_model = keras.Sequential()
reuters_model.add(layers.Dense(64, activation="relu"))
reuters_model.add(layers.Dense(4, activation="relu"))
reuters_model.add(layers.Dense(46, activation="softmax"))

reuters_model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Rebinds `history`, replacing the first model's training record
history = reuters_model.fit(
    partial_x_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)


In [None]:
# Softmax outputs of the second model for every test article
predictions = reuters_model.predict(x_test)

# Refresh the loss curves and epoch axis from the most recent training run
history_dict = history.history
loss_values, val_loss_values = history_dict['loss'], history_dict['val_loss']
epochs = range(1, 1 + len(loss_values))

In [None]:
# Loss curves of the most recent training run: training as dots, validation as a line
plt.plot(epochs, loss_values, 'bo', label='Training Loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
# Title and axis labels set in one call on the current axes
plt.gca().set(title='training and validation loss', xlabel='Epochs', ylabel='Loss')
plt.legend()
plt.show()