# Text document Classification (Dataset)

https://www.kaggle.com/datasets/jensenbaxter/10dataset-text-document-classification

*   10 clases
*   1000 muestras en total (800 para entrenamiento y 200 para validación)

In [3]:
import zipfile

# Unpack every member of the uploaded archive into /content/Dataset_text/.
# ZipFile opens in read mode by default; the context manager closes the
# archive once extraction finishes.
with zipfile.ZipFile('/content/Dataset_text.zip') as archive:
    archive.extractall(path='/content/Dataset_text/')

In [4]:
from tensorflow import keras

# Batch size used by the dataset loaders in this notebook.
batch_size = 32

# Training portion of the corpus: 80% of the files (validation_split=0.2,
# subset="training"). The seed fixes the shuffle/split so that the validation
# loader, which uses the same seed and split, receives the complementary files.
# label_mode="categorical" matches the categorical_crossentropy loss used later.
train_ds = keras.utils.text_dataset_from_directory(
    directory="/content/Dataset_text",
    label_mode="categorical",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=2023,
)

Found 1000 files belonging to 10 classes.
Using 800 files for training.


In [5]:
# Validation portion of the corpus: the remaining 20% of the files.
# The directory, seed and validation_split must stay identical to the ones
# used for the training loader so the two subsets do not overlap.
val_ds = keras.utils.text_dataset_from_directory(
    directory="/content/Dataset_text",
    label_mode="categorical",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=2023,
)

Found 1000 files belonging to 10 classes.
Using 200 files for validation.


# Data preparation



In [6]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# No n-grams are used (the layer's `ngrams` argument is left unset), so each
# token is a single word.
max_length = 600    # every document is represented by exactly 600 token ids
max_tokens = 20000  # upper bound on the vocabulary size
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Build the vocabulary from the raw text of the training split only (labels
# are dropped), so nothing from the validation split influences it.
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


def vectorize_batch(texts, labels):
    """Turn a batch of raw strings into integer token ids; labels pass through.

    Args:
        texts: batch of raw text strings produced by the dataset loader.
        labels: the labels paired with ``texts``; returned unchanged.

    Returns:
        A ``(token_ids, labels)`` tuple, where ``token_ids`` is the output of
        ``text_vectorization`` applied to ``texts``.
    """
    return text_vectorization(texts), labels


# One shared mapping function for both splits; AUTOTUNE lets tf.data choose
# the parallelism level instead of a hard-coded worker count.
int_train_ds = train_ds.map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
int_val_ds = val_ds.map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)

# Modelo

In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Input, Dropout, Bidirectional


# Model input: a batch of integer token-id sequences of unspecified length.
inputs = Input(shape=(None,), dtype="int64")

# Despite its name, `embedded` is a plain one-hot encoding, not a learned
# embedding: every token id becomes a vector of size max_tokens, giving a
# (batch, sequence, max_tokens) tensor.
embedded = keras.ops.one_hot(inputs, num_classes=max_tokens)

# Bidirectional LSTM (32 units per direction), dropout for regularisation,
# then a 10-way softmax classifier.
x = Bidirectional(layers.LSTM(units=32))(embedded)
x = Dropout(rate=0.5)(x)
outputs = Dense(units=10, activation="softmax")(x)

model_LSTM = keras.Model(inputs=inputs, outputs=outputs)

# categorical_crossentropy expects one-hot (categorical) labels.
model_LSTM.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model_LSTM.summary()

# Entrenamiento

In [8]:
# Train for 20 epochs, evaluating on the validation split after each epoch.
# No caching and no callbacks are configured for this run.
# The History object returned by fit() is kept so the per-epoch loss/accuracy
# values can be inspected or plotted afterwards; assigning it also keeps the
# bare History repr out of the cell output.
history_LSTM = model_LSTM.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=20,
)

Epoch 1/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 12s 347ms/step - accuracy: 0.1386 - loss: 2.2993 - val_accuracy: 0.3250 - val_loss: 2.2839
Epoch 2/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 8s 335ms/step - accuracy: 0.2985 - loss: 2.2605 - val_accuracy: 0.1950 - val_loss: 2.1787
Epoch 3/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 8s 336ms/step - accuracy: 0.2456 - loss: 2.0758 - val_accuracy: 0.2700 - val_loss: 2.0176
Epoch 4/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 10s 336ms/step - accuracy: 0.2836 - loss: 2.0054 - val_accuracy: 0.3400 - val_loss: 1.9082
Epoch 5/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 11s 352ms/step - accuracy: 0.3191 - loss: 1.9035 - val_accuracy: 0.2400 - val_loss: 2.0774
Epoch 6/20
25/25 ━━━━━━━━━━━━━━━━━━━━ 10s 339ms/step - accuracy: 0.2947 - loss: 1.8749 - val_accuracy: 0.3650 - val_loss: 1.7880
Epoch 7/20
25/25

<keras.src.callbacks.history.History at 0x7ef70ed6e290>

# Evaluación del modelo

Sobre datos test