# IMDB (Dataset)

In [None]:
# Download the aclImdb_v1 sentiment archive into the working directory
# (curl -O keeps the remote file name).
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# Unpack the archive; later cells read the extracted aclImdb/ directory.
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  3716k      0  0:00:22  0:00:22 --:--:-- 3920k


In [None]:
!rm -r aclImdb/train/unsup

In [None]:
from tensorflow import keras
# Batch size shared by the text_dataset_from_directory loaders below.
batch_size = 32

In [None]:
# Training subset: 80% of the labeled files under aclImdb/train.
# The validation loader uses the same seed and validation_split, so both
# subsets come from one consistent partition of the files.
train_ds = keras.utils.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="training",
    validation_split=0.2,
    seed=2023,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [None]:
# Validation subset: the remaining 20% of the labeled files under
# aclImdb/train. seed and validation_split match the training loader so the
# two subsets do not overlap.
val_ds = keras.utils.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="validation",
    validation_split=0.2,
    seed=2023,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
# Held-out test set: every file under aclImdb/test is used (no split).
test_ds = keras.utils.text_dataset_from_directory(
    directory="aclImdb/test",
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [None]:
# Inspect the first batch of the raw training dataset:
# shapes, dtypes and the first (text, label) pair.
for inputs, targets in train_ds.take(1):
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b"This movie took my breath away at some points, I simply loved it! <br /><br />I admit that the character dialogs and storyline could have been done a bit better, but hey, this is just a simple (short) story of a couple of guys trying to slain a dragon, there's nothing more to it!<br /><br />The overall design, atmosphere, the beautiful landscapes... it's all just magical! <br /><br />They've put a lot of love in this movie. Character designs were great and funny. A bit Tim Burton-ish if you like. I can recommend this movie to anyone interested in great design, displayed in a simple small, but lovely story.", shape=(), dtype=string)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


# TextVectorization

Pre-procesamiento


*   Limitar a las 20000 palabras más frecuentes
*   Multi-hot encoding
*   Texto plano sin etiquetas
*   Indexar el dataset de acuerdo con el vocabulario

https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Multi-hot unigram encoder restricted to a 20,000-token vocabulary.
text_vectorization = TextVectorization(
    max_tokens=20000,
    ngrams=1,
    output_mode="multi_hot",
)

# Keep only the review text (drop the labels) to build the vocabulary.
text_only_train_ds = train_ds.map(lambda text, label: text)

# Learn the vocabulary from the training texts.
text_vectorization.adapt(text_only_train_ds)


def _to_unigram_features(text, label):
    """Vectorize a batch of raw text and pass the label through unchanged."""
    return text_vectorization(text), label


# Apply the same vectorization to the three splits.
Unigram_train_ds = train_ds.map(_to_unigram_features, num_parallel_calls=4)
Unigram_val_ds = val_ds.map(_to_unigram_features, num_parallel_calls=4)
Unigram_test_ds = test_ds.map(_to_unigram_features, num_parallel_calls=4)

Visualizar las características de la salida después del TextVectorization:


*   Vectores de 20000 elementos
*   Valores (elementos) del vector tipo int64 (unos y ceros)

In [None]:
# Inspect the first vectorized batch: each text is now a multi-hot vector.
for inputs, targets in Unigram_train_ds.take(1):
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1 1 1 ... 0 0 0], shape=(20000,), dtype=int64)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


# Modelo

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Input, Dropout

# Input width (one position per vocabulary token) and hidden-layer width.
max_tokens = 20000
No_unidades = 16

# One hidden ReLU layer with dropout, followed by a single sigmoid unit for
# binary classification of the vectorized text.
model_MLP = keras.Sequential(
    [
        keras.Input(shape=(max_tokens,)),
        layers.Dense(No_unidades, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(1, activation="sigmoid"),
    ]
)

model_MLP.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer summary of the MLP defined above.
model_MLP.summary()

# Entrenamiento

In [None]:
# Train for 10 epochs. cache() keeps the vectorized batches after the first
# pass, so later epochs do not repeat the text vectorization.
model_MLP.fit(
    x=Unigram_train_ds.cache(),
    validation_data=Unigram_val_ds.cache(),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79c82a89ff70>

# Evaluación del modelo

Sobre datos test

In [None]:
# Optional: evaluate a model saved on disk instead of the in-memory one.
#model_MLP = keras.models.load_model("binary_1gram.keras")

# evaluate() returns [loss, accuracy] for this compile configuration;
# report the accuracy.
unigram_test_metrics = model_MLP.evaluate(Unigram_test_ds)
print(f"Test acc: {unigram_test_metrics[1]:.3f}")

Test acc: 0.874


# Predicción

In [None]:
# To predict on raw strings, chain the preprocessing layer and the trained
# classifier into a single end-to-end model.
raw_text_input = keras.Input(shape=(1,), dtype="string")
multi_hot_features = text_vectorization(raw_text_input)
positive_probability = model_MLP(multi_hot_features)
inference_model = keras.Model(
    inputs=raw_text_input,
    outputs=positive_probability,
)

In [None]:
# Score one raw review with the end-to-end model. The input is a (1, 1) string
# tensor: one sample holding one string.
raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])
predictions = inference_model(raw_text_data)
# The classifier ends in Dense(1), so predictions has shape (1, 1). Index the
# scalar explicitly: float() on a one-element, non-scalar tensor relies on an
# implicit array-to-scalar conversion that recent NumPy versions deprecate.
print(f"{float(predictions[0, 0] * 100):.2f} percent positive")

90.83 percent positive


# Bigram

In [None]:
# Bigram multi-hot encoder. It has its own name instead of rebinding
# `text_vectorization`, so the unigram layer stays bound to that name and
# re-running the unigram mapping or the inference-model cell after this one
# still picks up the unigram vocabulary.
bigram_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)

# Learn the vocabulary from the training texts (labels already dropped).
bigram_vectorization.adapt(text_only_train_ds)

# Vectorize the text features of every split and keep the labels.
Bigram_train_ds = train_ds.map(
    lambda x, y: (bigram_vectorization(x), y),
    num_parallel_calls=4)

Bigram_val_ds = val_ds.map(
    lambda x, y: (bigram_vectorization(x), y),
    num_parallel_calls=4)

Bigram_test_ds = test_ds.map(
    lambda x, y: (bigram_vectorization(x), y),
    num_parallel_calls=4)

In [None]:
# Same architecture as the unigram classifier, trained on bigram features:
# one hidden ReLU layer with dropout and a single sigmoid output unit.
model_MLP2 = keras.Sequential(
    [
        keras.Input(shape=(max_tokens,)),
        layers.Dense(No_unidades, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(1, activation="sigmoid"),
    ]
)

model_MLP2.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [None]:
# Train the bigram model for 10 epochs; cache() keeps the vectorized batches
# after the first pass so later epochs reuse them.
model_MLP2.fit(
    x=Bigram_train_ds.cache(),
    validation_data=Bigram_val_ds.cache(),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b3af43e1570>

In [None]:
# evaluate() returns [loss, accuracy] for this compile configuration;
# report the accuracy on the bigram test set.
bigram_test_metrics = model_MLP2.evaluate(Bigram_test_ds)
print(f"Test acc: {bigram_test_metrics[1]:.3f}")

Test acc: 0.884


# Bigram + TF_IDF

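TF-IDF (term frequency–inverse document frequency): instead of only marking whether a term is present, each term is weighted by how often it appears in the current text (term frequency).

That weight is scaled down for terms that appear in many texts across the dataset (inverse document frequency), so very common words count less.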

In [None]:
# Bigram TF-IDF encoder. It has its own name instead of rebinding
# `text_vectorization`, so re-running an earlier cell that reads
# `text_vectorization` does not silently pick up this TF-IDF layer.
tfidf_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf",
)

# Keep only the review text (drop the labels) to build the vocabulary.
text_only_train_ds = train_ds.map(lambda x, y: x)

# Learn the vocabulary and the TF-IDF statistics from the training texts.
tfidf_vectorization.adapt(text_only_train_ds)

# Vectorize the text features of every split and keep the labels.
TFBigram_train_ds = train_ds.map(
    lambda x, y: (tfidf_vectorization(x), y),
    num_parallel_calls=4)

TFBigram_val_ds = val_ds.map(
    lambda x, y: (tfidf_vectorization(x), y),
    num_parallel_calls=4)

TFBigram_test_ds = test_ds.map(
    lambda x, y: (tfidf_vectorization(x), y),
    num_parallel_calls=4)

In [None]:
# Same architecture as the previous classifiers, trained on TF-IDF bigram
# features: one hidden ReLU layer with dropout and a single sigmoid output.
model_MLP3 = keras.Sequential(
    [
        keras.Input(shape=(max_tokens,)),
        layers.Dense(No_unidades, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(1, activation="sigmoid"),
    ]
)

model_MLP3.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [None]:
# Train the TF-IDF model for 10 epochs; cache() keeps the vectorized batches
# after the first pass so later epochs reuse them.
model_MLP3.fit(
    x=TFBigram_train_ds.cache(),
    validation_data=TFBigram_val_ds.cache(),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b3aed7a2c80>

In [None]:
# evaluate() returns [loss, accuracy] for this compile configuration;
# report the accuracy on the TF-IDF bigram test set.
tfidf_test_metrics = model_MLP3.evaluate(TFBigram_test_ds)
print(f"Test acc: {tfidf_test_metrics[1]:.3f}")

Test acc: 0.876


# Exportar CSV a archivos txt

En excel separar archivos xlsx por clase y luego guardar en cada carpeta

In [None]:
import pandas as pd
# Write each entry of the 'Name' column to its own text file
# (my_file0.txt, my_file1.txt, ...), numbered by row position.
df = pd.read_csv('my_file.csv')

for i, text in enumerate(df['Name']):
    # Write the raw cell value instead of Series.to_string(): to_string applies
    # display formatting (it escapes newlines and may shorten long strings;
    # TODO confirm against the pandas version in use).
    # The with-block guarantees each file is flushed and closed, and UTF-8 is
    # set explicitly so the output does not depend on the platform default.
    with open(f'my_file{i}.txt', 'w', encoding='utf-8') as txt_file:
        txt_file.write(str(text))