In [1]:
from transformers import BertTokenizer, TFBertModel,BertForQuestionAnswering,BertForMaskedLM,BertModel
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
import logging
import tensorflow as tf
import torch
from nltk.tokenize import sent_tokenize
import faiss
import numpy as np

logging.getLogger("transformers.modeling_utils").setLevel(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the IndoBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
bert_model = TFBertModel.from_pretrained('indobenchmark/indobert-base-p2')

file_path = '/Users/t-arvio.anandi/Downloads/Train Label - Sheet1 (2).csv'
df = pd.read_csv(file_path)

x_data_list = df['x_data'].tolist()
y_data_list = df['y_data'].tolist()

# Tokenize and encode
encoding = tokenizer(x_data_list, padding='max_length', truncation=True, return_tensors='tf', max_length=40)
x_input_ids = encoding['input_ids']
x_attention_mask = encoding['attention_mask']

# Pass the token IDs through BERT to get embeddings
x_embeddings = bert_model(x_input_ids, attention_mask=x_attention_mask)[0].numpy()  # Shape: (batch_size, sequence_length, hidden_size)
# hidden_size is 768 for BERT-base, but we need 50 features

# Desired size for the last dimension
target_size = 50

# Pad or truncate the last dimension
if x_embeddings.shape[2] < target_size:
    # Pad the last dimension
    padding = target_size - x_embeddings.shape[2]
    x_embeddings_padded = np.pad(x_embeddings, ((0, 0), (0, 0), (0, padding)), 'constant')
else:
    # Truncate the last dimension
    x_embeddings_padded = x_embeddings[:, :, :target_size]


# Convert y_data_list to strings and handle NaNs
def clean_label(label):
    if pd.isna(label):
        return [0]  # Replace NaN with a default value
    try:
        return list(map(int, label.split()))
    except:
        return [0]  # Handle any other conversion issues

y_sequences = [clean_label(label) for label in y_data_list]

# Pad y_sequences
y_sequences_padded = pad_sequences(y_sequences, maxlen=40, padding='post', truncating='post', value=0)  # Using -1 or another placeholder for padding

# Verify shapes
print("Padded X Shape:", x_embeddings.shape)
print("Padded Y Shape:", y_sequences_padded.shape)


Some layers from the model checkpoint at indobenchmark/indobert-base-p2 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Padded X Shape: (91, 40, 768)
Padded Y Shape: (91, 40)


In [126]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K

# Define the input layer to match the shape of your data
inputs = layers.Input(shape=(40, 768))  # sequence_length=40, embedding_dim=768

# LSTM layer
lstm_out = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(inputs)
lstm_out = layers.LSTM(128, return_sequences=True)(lstm_out)  # Stacked LSTM

# Dense layer with ReLU activation
relu_out = layers.TimeDistributed(layers.Dense(64, activation='relu'))(lstm_out)

# TimeDistributed Dense layer for classification
outputs = layers.TimeDistributed(layers.Dense(7, activation='softmax', kernel_regularizer=regularizers.l2(0.01)))(lstm_out)

# Define the model
model = models.Model(inputs, outputs)

def weighted_sparse_categorical_crossentropy(class_weights):
    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, dtype=tf.int32)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        
        # Convert class_weights to a TensorFlow constant
        class_weights_tensor = tf.constant(class_weights, dtype=tf.float32)
        
        # Gather weights for each sample
        sample_weights = tf.gather(class_weights_tensor, tf.squeeze(y_true))
        
        # Apply weights to the loss
        weighted_loss = loss * sample_weights
        
        return tf.reduce_mean(weighted_loss)
    return loss

class_weights = np.array([0.000000001, 1, 1, 1, 1, 1, 1], dtype=np.float32)

# Compile the model
model.compile(optimizer='adam', 
              loss=weighted_sparse_categorical_crossentropy(class_weights),
              metrics=['accuracy'])


# Print model summary
print(model.summary())


None


In [127]:
# Assuming you have X_sequences_padded and y_sequences_padded already padded
X_sequences_padded_tensor = tf.convert_to_tensor(x_embeddings, dtype=tf.float32)  # or tf.int32
y_sequences_padded_tensor = tf.convert_to_tensor(y_sequences_padded, dtype=tf.int32)  # or tf.int32


In [128]:
print(X_sequences_padded_tensor.shape)

(91, 40, 768)


In [129]:
# Convert Tensors to NumPy arrays
X_sequences_padded_numpy = X_sequences_padded_tensor.numpy()
y_sequences_padded_numpy = y_sequences_padded_tensor.numpy()

In [130]:
print(model.input_shape)

(None, 40, 768)


In [131]:
print(X_sequences_padded_numpy.shape)  # Should match the model's expected input shape


(91, 40, 768)


In [170]:
from sklearn.model_selection import train_test_split

# Assuming you have more data, split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_sequences_padded_numpy, y_sequences_padded_numpy, test_size=0.2)

# Flatten y_train if it's one-hot encoded
y_train_flat = np.argmax(y_train, axis=1)  # Assuming y_train is one-hot encoded

# Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train.reshape(X_train.shape[0], -1), y_train_flat)

# Fit the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

InvalidParameterError: The 'k_neighbors' parameter of SMOTE must be an int in the range [1, inf) or an object implementing 'kneighbors' and 'kneighbors_graph'. Got 0 instead.

In [141]:
# Predict on test data
predictions = model.predict(X_test)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


In [113]:
print(X_test)

[[[ 1.16373360e+00  1.64614451e+00  1.49624541e-01 ...  9.47471857e-01
   -7.79526770e-01 -2.38206834e-01]
  [-1.09512307e-01  5.61420798e-01  1.10080171e+00 ...  1.58512592e+00
   -6.93527937e-01  1.52172685e-01]
  [ 1.72116518e+00  5.08581460e-01 -8.53071928e-01 ...  9.59150255e-01
   -1.68603480e+00  1.36044395e+00]
  ...
  [ 3.99216563e-01  9.81472373e-01  6.81728661e-01 ...  3.21035218e+00
   -1.60785401e+00  6.23640642e-02]
  [ 4.99320835e-01  1.28240681e+00  1.13290274e+00 ...  2.64770746e+00
   -1.42777896e+00  1.16318412e-01]
  [ 3.19074899e-01  9.92004991e-01  1.34694839e+00 ...  1.90102756e+00
   -1.50747275e+00  1.98303416e-01]]

 [[ 3.14343989e-01  7.47709513e-01  2.43126631e-01 ...  1.49743712e+00
   -7.04169929e-01  2.21151859e-04]
  [ 1.13802421e+00 -4.13762420e-01 -3.24892879e-01 ...  5.58837205e-02
   -1.15946376e+00  7.70162165e-01]
  [-1.92225158e-01 -1.45542324e+00 -2.14756966e-01 ...  2.35511351e+00
   -4.63654995e-02  6.24685824e-01]
  ...
  [-5.51110208e-02  4.0

In [114]:
from sklearn.metrics import accuracy_score

# Convert predictions to class indices
predicted_labels = np.argmax(predictions, axis=-1)

# Flatten the labels and predictions if needed
Y_test_flattened = y_test.flatten()
predicted_labels_flattened = predicted_labels.flatten()

# Compute accuracy
accuracy = accuracy_score(Y_test_flattened, predicted_labels_flattened)

print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.0737


In [134]:
# Define the new sentence and max length
new_sentence = "durasi durasi durasi durasi durasi durasi durasi program durasi durasi"
# words = new_sentence.split()
# max_len = len(words)
max_len = 40

# Tokenize and convert to IDs
new_tokens = tokenizer.tokenize(new_sentence)
new_input_ids = tokenizer.convert_tokens_to_ids(new_tokens)
new_input_ids_padded = pad_sequences([new_input_ids], maxlen=max_len, padding='post', truncating='post')

# Convert input IDs to tensor
new_input_ids_tensor = tf.convert_to_tensor(new_input_ids_padded, dtype=tf.int32)

# Create an attention mask using TensorFlow operations
attention_mask_tensor = tf.cast(new_input_ids_tensor != 0, dtype=tf.float32)

# Compute embeddings using BERT
outputs = bert_model(new_input_ids_tensor, attention_mask=attention_mask_tensor, return_dict=True)
new_input_embeddings = outputs.last_hidden_state.numpy()


In [135]:
print(new_input_embeddings)

[[[-0.76173073  1.5870137  -0.85510683 ...  0.02373251  1.3382697
   -1.2021867 ]
  [-0.7474993   1.3678035  -0.83683765 ...  0.16939129  0.8250834
   -0.6025503 ]
  [-0.7360506   1.4114292  -0.8065479  ...  0.15967149  0.7553942
   -0.5666687 ]
  ...
  [-0.5694482   1.6982384  -0.3923992  ...  0.9461408   1.4252127
   -0.03565612]
  [-0.5575044   1.6410947  -0.4768017  ...  0.92158675  1.3660648
    0.07664187]
  [-0.6312691   1.6563826  -0.375901   ...  0.8670048   1.3082098
    0.02709645]]]


In [136]:
words = new_sentence.split()
max_len = len(words)

# Predict token labels for the input sentence
predictions = model.predict(new_input_embeddings)

# Convert probabilities to predicted class labels
predicted_labels = tf.argmax(predictions, axis=-1)

# Trim the predictions to keep only the first 7 (original) tokens
trimmed_predictions = predicted_labels[:, :max_len]

# Print the trimmed predicted labels
print("Trimmed Predicted Labels:", trimmed_predictions)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
Trimmed Predicted Labels: tf.Tensor([[1 6 6 4 4 4 4 4 4 4]], shape=(1, 10), dtype=int64)


In [137]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on test data
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=-1)

# Flatten the true labels and predictions
y_true_flat = y_test.flatten()
y_pred_flat = y_pred_classes.flatten()

# Print classification report
print(classification_report(y_true_flat, y_pred_flat))

# Create confusion matrix
cm = confusion_matrix(y_true_flat, y_pred_flat)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       712
           1       0.03      1.00      0.05         7
           2       0.01      1.00      0.02         1
           3       0.02      1.00      0.05         1
           4       0.28      1.00      0.44        23
           6       0.05      1.00      0.10        16

    accuracy                           0.06       760
   macro avg       0.07      0.83      0.11       760
weighted avg       0.01      0.06      0.02       760



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
