In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization
import json
import numpy as np
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

In [None]:
# Load the article collections from JSON. Both paths are placeholders to be
# replaced with the real file locations. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale.
with open("path_to_train_articles.json", "r", encoding="utf-8") as f:
  train_data = json.load(f)
# The test split must come from its own file: reading the training file again
# would make every later evaluation run on data the model was trained on.
with open("path_to_test_articles.json", "r", encoding="utf-8") as f2:
  test_data = json.load(f2)

Build the train and test sets for 3-class classification

In [None]:
# Training inputs and targets for the 3-class task: the raw article text and
# the article's class label, kept exactly as stored in the data.
X_train = [article['text'] for article in train_data]
y_train = [article['class_label'] for article in train_data]

In [None]:
# Test inputs and targets for the 3-class task: the raw article text and
# the article's class label, kept exactly as stored in the data.
X_test = [article['text'] for article in test_data]
y_test = [article['class_label'] for article in test_data]

Build the train and test sets for binary classification (this overwrites the 3-class `X_train`/`y_train`/`X_test`/`y_test` built above)

In [None]:
# Training inputs and targets for the binary task. A label equal to 0 is kept
# as is; every other label is collapsed to 1.
X_train = [article['text'] for article in train_data]
y_train = [
  label if label == 0 else 1
  for label in (article['class_label'] for article in train_data)
]

In [None]:
# Test inputs and targets for the binary task. A label equal to 0 is kept
# as is; every other label is collapsed to 1.
X_test = [article['text'] for article in test_data]
y_test = [
  label if label == 0 else 1
  for label in (article['class_label'] for article in test_data)
]

In [None]:
# One-hot encode both label lists into 3 columns, matching the 3-unit softmax
# output of the model. The inputs are replaced in place, so this cell must be
# run exactly once per label extraction.
y_train, y_test = (
  to_categorical(labels, num_classes=3) for labels in (y_train, y_test)
)

In [None]:
# Combined corpus: all training texts followed by all test texts.
X = [*X_train, *X_test]

In [None]:
# 1. Tokenize the text data
# The vocabulary is fitted on the training texts only. Fitting on train + test
# would leak test-set vocabulary into the model's input space and make the
# evaluation optimistic. Words that appear only at test time are mapped to the
# dedicated out-of-vocabulary token instead of being silently dropped.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
# Integer index sequences for the training texts (one list per article).
sequences = tokenizer.texts_to_sequences(X_train)

In [None]:
# Size of the embedding table, derived from the fitted tokenizer. The extra
# slot covers index 0, which the Keras Tokenizer never assigns to a word and
# which pad_sequences uses as its default padding value.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Length of the longest tokenized training text; used as the common padded
# length of the training inputs.
max_sequence_length = max(map(len, sequences))

In [None]:
# Right-pad every training sequence with zeros up to the longest one so the
# batch forms a rectangular array.
padded_sequences = pad_sequences(
  sequences,
  maxlen=max_sequence_length,
  padding='post',
)

In [None]:
# Text classifier: token embedding -> two stacked bidirectional LSTMs (with
# dropout in between) -> small dense head -> 3-way softmax.
model = Sequential([
  # Learn a 128-dimensional vector for each vocabulary index.
  Embedding(input_dim=vocab_size, output_dim=128),
  # First recurrent layer returns the full sequence so the second can consume it.
  Bidirectional(LSTM(128, return_sequences=True)),
  Dropout(0.2),
  # Second recurrent layer returns only its final output.
  Bidirectional(LSTM(64)),
  Dense(32, activation='relu'),
  # One probability per class; matches the 3-column one-hot labels.
  Dense(3, activation='softmax'),
])

In [None]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# No layer in the model declares an input shape, so Keras cannot infer one on
# its own: calling build() without a shape either raises or returns without
# creating any weights, depending on the Keras version. Pass the shape
# explicitly as (batch, timesteps), leaving both dimensions open so inputs of
# any batch size and padded length are accepted.
model.build(input_shape=(None, None))

In [None]:
# Train for a single epoch, holding out the last 10% of the training data for
# validation; the per-epoch metrics are kept in `history` for plotting.
history = model.fit(
  padded_sequences,
  y_train,
  epochs=1,
  batch_size=600,
  validation_split=0.1,
  shuffle=True,
)

In [None]:
# Loss curves: one line per recorded series, on a single 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))

for metric_key, curve_label in (
  ('loss', 'Training Loss'),
  ('val_loss', 'Validation Loss'),
):
  ax.plot(history.history[metric_key], label=curve_label)

# Axis labels, title and legend so the figure stands on its own.
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()

# Render the figure.
plt.show()

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Write a diagram of the model, including tensor shapes, to a PNG file.
plot_model(
  model,
  to_file='/content/modelo.png',
  show_shapes=True,
)

In [None]:
# Convert the test texts to index sequences with the already-fitted tokenizer.
test_sequences = tokenizer.texts_to_sequences(X_test)
# Longest tokenized test text. Kept for inspection only; it is deliberately
# NOT used as the padding length below.
max_test_sequence_length = max(len(seq) for seq in test_sequences)
# Pad (or cut) the test inputs to the same length as the training inputs.
# The embedding layer does no masking, so the zero padding is ordinary input
# to the LSTMs; padding test data to a different length than the training data
# would feed the model inputs prepared differently from those it was trained on.
# truncating='post' keeps the beginning of over-long texts, consistent with
# the post-padding layout where real tokens start at position 0.
padded_test_sequences = pad_sequences(
  test_sequences,
  maxlen=max_sequence_length,
  padding='post',
  truncating='post',
)

In [None]:
from sklearn.metrics import classification_report
import time

# Time one full prediction pass over the padded test set and print the
# elapsed wall-clock seconds.
now = time.time()
y_pred = model.predict(padded_test_sequences)
print(time.time() - now)

# Reduce each row of scores / one-hot labels to the index of its largest entry.
y_pred_classes = np.asarray(y_pred).argmax(axis=1)
y_true_classes = np.asarray(y_test).argmax(axis=1)

# Build and print the classification report comparing true and predicted classes.
report = classification_report(y_true_classes, y_pred_classes)
print(report)

In [None]:
import numpy as np

# Example n value
n = 5  # Number of shortest test strings to select

# True token counts of the test texts, taken from the unpadded sequences.
# The padded array cannot be used here: every row of it has the same length,
# so all "lengths" would be identical and the ranking meaningless.
test_sequences_lengths = [len(sequence) for sequence in test_sequences]

# argsort orders indices from shortest to longest, so the n shortest are the
# FIRST n entries ([:n]); [:-n] would instead keep everything except the n
# longest. A stable sort keeps ties in their original order.
shortest_indices = np.argsort(test_sequences_lengths, kind='stable')[:n]

# Select the n shortest sequences (in padded form, as the model expects) and
# their corresponding one-hot labels.
shortest_test_sequences = padded_test_sequences[shortest_indices]
shortest_test_labels = np.array(y_test)[shortest_indices]
# Show the raw text of the single shortest article and the selected labels.
print(X_test[shortest_indices[0]])
print(shortest_test_labels)

# Evaluate the model on the n shortest test sequences
loss, accuracy = model.evaluate(shortest_test_sequences, shortest_test_labels)
print(f"Test accuracy on the {n} shortest sequences: {accuracy}")