In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
import json
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification


In [7]:
# Load data from JSON file
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    sentences = [item['request'] for item in data]  # Extracting sentences from the 'request' key
    labels = [np.argmax([item[key] for key in ['is_order', 'is_refund', 'is_status', 'is_date_change', 'is_dest_change', 'is_weather', 'is_allowed']]) for item in data]
    return sentences, np.array(labels)  # Return sentences as a list of strings





In [8]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


In [9]:
# Load data
sentences, labels = load_data("data/data.json")



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int64)

In [71]:
# Tokenize and encode data
max_length = 64  # Adjust according to your data and BERT model's maximum input length
encoded_data = tokenizer(sentences, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')

# Convert BatchEncoding to dictionary
encoded_data = {key: encoded_data[key].numpy() for key in encoded_data}

In [72]:
# Convert labels to numpy array
# labels = np.array(labels)
# print(labels)
# Convert one-hot encoded labels to integer indices


In [73]:
#split the data into training and testing
train_inputs, test_inputs, train_labels, test_labels = train_test_split(encoded_data['input_ids'], labels, test_size=0.2, random_state=42)


In [74]:
# Load pre-trained BERT model
num_classes = 7 
model = TFBertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_classes)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
# Compile the model with sparse categorical cross-entropy loss
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)


In [76]:
# Train the model
batch_size = 16
epochs = 10 # Adjust according to your training needs


history = model.fit(
    train_inputs,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [77]:
#test model accuracy
test_loss, test_accuracy = model.evaluate(test_inputs, test_labels)
print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.7599999904632568


In [78]:
# Define directory to save the model
output_dir = "./saved_model"

# Save model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model saved successfully to:", output_dir)



Model saved successfully to: ./saved_model


In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
# Load tokenizer and model
def load_model(model_path):
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = TFBertForSequenceClassification.from_pretrained(model_path)
    return tokenizer, model

# Function to classify a sentence
def classify_sentence(tokenizer, model, sentence):
    # Tokenize and encode the input sentence
    inputs = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True)

    # Make prediction
    logits = model(inputs)[0]

    # Get predicted class label
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]

    return predicted_class

  from .autonotebook import tqdm as notebook_tqdm





In [11]:
def main(sentence):
    model_path = './saved_model'  # Path to the directory where you saved your trained model
    tokenizer, model = load_model(model_path)

    predicted_label = classify_sentence(tokenizer, model, sentence)
    if predicted_label == 0:
        print('you want to order a ticket')
    elif predicted_label == 1:
        print('you want to refund a ticket')
    elif predicted_label == 2:
        print('you want to check the status of your ticket')
    elif predicted_label == 3:
        print('you want to change the date of your ticket')
    elif predicted_label == 4:
        print('you want to change the destination of your ticket')
    elif predicted_label == 5:
        print('you want to know the weather of your destination')
    else:
        print('you want to know what is allowed in the flight')
    print("Predicted Label:", predicted_label)


In [23]:
main("אני רוצה לשנות את יעד הטיסה שלי לאיטליה")
# main("אני רוצה לדעת אם אני יכול לשנות את תאריך הטיסה שלי")
# main("אני רוצה לדעת אם אני יכול לבטל את הטיסה שלי")
# main("היי, הייתי רוצה להביא את החתול שלי לטיסה כי אני לא יכולה בלעדיו")

Some layers from the model checkpoint at ./saved_model were not used when initializing TFBertForSequenceClassification: ['dropout_227']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./saved_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


you want to change the destination of your ticket
Predicted Label: 4
