In [1]:
from transformers import BertTokenizer, TFBertModel,BertForQuestionAnswering,BertForMaskedLM,BertModel
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
import logging
import tensorflow as tf
import torch
from nltk.tokenize import sent_tokenize
import faiss
import numpy as np

logging.getLogger("transformers.modeling_utils").setLevel(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

file_path = '/Users/t-arvio.anandi/Downloads/Train Label - Sheet1 (2).csv'
df = pd.read_csv(file_path)

x_data_list = df['x_data'].tolist()
y_data_list = df['y_data'].tolist()

# Tokenize and encode
encoding = tokenizer(x_data_list, padding='max_length', truncation=True, return_tensors='tf', max_length=40)
x_input_ids = encoding['input_ids']
x_attention_mask = encoding['attention_mask']

# Pass the token IDs through BERT to get embeddings
x_embeddings = bert_model(x_input_ids, attention_mask=x_attention_mask)[0].numpy()  # Shape: (batch_size, sequence_length, hidden_size)
# hidden_size is 768 for BERT-base, but we need 50 features

# Desired size for the last dimension
target_size = 50

# Pad or truncate the last dimension
if x_embeddings.shape[2] < target_size:
    # Pad the last dimension
    padding = target_size - x_embeddings.shape[2]
    x_embeddings_padded = np.pad(x_embeddings, ((0, 0), (0, 0), (0, padding)), 'constant')
else:
    # Truncate the last dimension
    x_embeddings_padded = x_embeddings[:, :, :target_size]


# Convert y_data_list to strings and handle NaNs
def clean_label(label):
    if pd.isna(label):
        return [0]  # Replace NaN with a default value
    try:
        return list(map(int, label.split()))
    except:
        return [0]  # Handle any other conversion issues

y_sequences = [clean_label(label) for label in y_data_list]

# Pad y_sequences
y_sequences_padded = pad_sequences(y_sequences, maxlen=40, padding='post', truncating='post', value=0)  # Using -1 or another placeholder for padding

# Verify shapes
print("Padded X Shape:", x_embeddings.shape)
print("Padded Y Shape:", y_sequences_padded.shape)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Padded X Shape: (91, 40, 768)
Padded Y Shape: (91, 40)


In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import regularizers

# Define the input layer to match the shape of your data
inputs = layers.Input(shape=(40, 768))  # sequence_length=40, embedding_dim=768

# LSTM layer
lstm_out = layers.LSTM(64, return_sequences=True)(inputs)

# Add dropout to prevent overfitting
lstm_dropout = layers.Dropout(0.5)


# TimeDistributed Dense layer for classification
outputs = layers.TimeDistributed(layers.Dense(7, activation='softmax', kernel_regularizer=regularizers.l2(0.01)))(lstm_out)

# Define the model
model = models.Model(inputs, outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())


None


In [4]:
# Assuming you have X_sequences_padded and y_sequences_padded already padded
X_sequences_padded_tensor = tf.convert_to_tensor(x_embeddings, dtype=tf.int32)  # or tf.int32
y_sequences_padded_tensor = tf.convert_to_tensor(y_sequences_padded, dtype=tf.int32)  # or tf.int32


In [5]:
print(X_sequences_padded_tensor.shape)

(91, 40, 768)


In [6]:
# Convert Tensors to NumPy arrays
X_sequences_padded_numpy = X_sequences_padded_tensor.numpy()
y_sequences_padded_numpy = y_sequences_padded_tensor.numpy()

In [7]:
print(model.input_shape)

(None, 40, 768)


In [8]:
print(X_sequences_padded_numpy.shape)  # Should match the model's expected input shape


(91, 40, 768)


In [9]:
from sklearn.model_selection import train_test_split

# Assuming you have more data, split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_sequences_padded_numpy, y_sequences_padded_numpy, test_size=0.2)

# Fit the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.3732 - loss: 1.7850
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9524 - loss: 1.1578
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9648 - loss: 0.7385
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9636 - loss: 0.5203
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9642 - loss: 0.4247
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9646 - loss: 0.3885
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9661 - loss: 0.3630
Epoch 8/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9657 - loss: 0.3572
Epoch 9/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x38048a820>

In [10]:
# Predict on test data
predictions = model.predict(X_test)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


In [11]:
from sklearn.metrics import accuracy_score

# Convert predictions to class indices
predicted_labels = np.argmax(predictions, axis=-1)

# Flatten the labels and predictions if needed
Y_test_flattened = y_test.flatten()
predicted_labels_flattened = predicted_labels.flatten()

# Compute accuracy
accuracy = accuracy_score(Y_test_flattened, predicted_labels_flattened)

print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9447


In [12]:
# Define the new sentence and max length
new_sentence = "School of Doctorate Akuntansi desain interior layanan"
# words = new_sentence.split()
# max_len = len(words)
max_len = 40

# Tokenize and convert to IDs
new_tokens = tokenizer.tokenize(new_sentence)
new_input_ids = tokenizer.convert_tokens_to_ids(new_tokens)
new_input_ids_padded = pad_sequences([new_input_ids], maxlen=max_len, padding='post', truncating='post')

# Convert input IDs to tensor
new_input_ids_tensor = tf.convert_to_tensor(new_input_ids_padded, dtype=tf.int32)

# Create an attention mask using TensorFlow operations
attention_mask_tensor = tf.cast(new_input_ids_tensor != 0, dtype=tf.int32)

# Compute embeddings using BERT
outputs = bert_model(new_input_ids_tensor, attention_mask=attention_mask_tensor, return_dict=True)
new_input_embeddings = outputs.last_hidden_state.numpy()


In [13]:
# Predict
predictions = model.predict(new_input_embeddings)
predicted_labels = tf.argmax(predictions, axis=0)

print("Predicted Labels:", predicted_labels)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
Predicted Labels: tf.Tensor(
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]], shape=(40, 7), dtype=int64)
