In [1]:
import pandas as pd
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.regularizers import l2

In [2]:
# Load your dataset
!wget -O dataset_final.parquet https://github.com/dennyrianto/asterisk-machine-learning/raw/main/data/dataset_final.parquet

df = pd.read_parquet('dataset_final.parquet')
df = df.dropna()
df = df.drop_duplicates()
df

--2024-06-19 06:54:18--  https://github.com/dennyrianto/asterisk-machine-learning/raw/main/data/dataset_final.parquet
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dennyrianto/asterisk-machine-learning/main/data/dataset_final.parquet [following]
--2024-06-19 06:54:19--  https://raw.githubusercontent.com/dennyrianto/asterisk-machine-learning/main/data/dataset_final.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31097543 (30M) [application/octet-stream]
Saving to: ‘dataset_final.parquet’


2024-06-19 06:54:22 (338 MB/s) - ‘dataset_final.parquet’ saved [31097543/31097543]



Unnamed: 0,reviews,food,service,ambience,price,resptext
0,"The ambience was good, food was quite good . h...",2,2,2,2,Food: 2\nService: 2\nAmbience: 2\nPrice: 2 \n
1,Ambience is too good for a pleasant evening. S...,2,2,2,1,"food: 2, service: 2, ambience: 2, price: 1 \n"
2,A must try.. great food great ambience. Thnx f...,2,2,2,1,food: 2\nservice: 2\nambience: 2\nprice: 1 \n
3,Soumen das and Arun was a great guy. Only beca...,2,2,1,1,"food: 2, service: 2, ambience: 1, price: 1 \n"
4,Food is good.we ordered Kodi drumsticks and ba...,2,2,2,1,food: 2\nservice: 2\nambience: 2\nprice: 1 \n
...,...,...,...,...,...,...
205109,"After waiting for 30 minutes outside, seated o...",2,2,1,1,food: 2\nservice: 2\nambience: 1\nprice: 1 \n
205110,I was in town for a Marriott Conference. I am ...,2,2,1,1,Food: 2\nService: 2\nAmbience: 1\nPrice: 1 \n
205111,"Always fast, always consistent, always good. G...",2,2,1,1,"food: 2, service: 2, ambience: 1, price: 1 \n"
205112,Gotta knock'em down a peg sine the arrival of ...,2,1,1,1,"food: 2, service: 1, ambience: 1, price: 1 \n"


In [3]:
# Preprocess the data: pull the review text and the four aspect labels out of
# the cleaned frame as plain Python lists.
texts = df['reviews'].tolist()
labels_food = df['food'].tolist()
labels_ambience = df['ambience'].tolist()
labels_service = df['service'].tolist()
labels_price = df['price'].tolist()

# Tokenizer
pretrained_model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Split the data into training and validation sets.
# The split is positional: the first `splt` rows train, the remainder validate.
# NOTE(review): rows are not shuffled before splitting - confirm the parquet
# row order is not grouped (e.g. by data source), otherwise the validation set
# is drawn from a different distribution than the training set.
splt = 160000

# Fail fast instead of silently producing an empty train or validation split
# when the cleaned dataset is smaller than the hard-coded split index.
assert 0 < splt < len(texts), (
    f"split index {splt} must fall strictly inside the dataset (size {len(texts)})"
)

train_texts = texts[:splt]
val_texts = texts[splt:]

train_labels_food = labels_food[:splt]
val_labels_food = labels_food[splt:]

train_labels_ambience = labels_ambience[:splt]
val_labels_ambience = labels_ambience[splt:]

train_labels_service = labels_service[:splt]
val_labels_service = labels_service[splt:]

train_labels_price = labels_price[:splt]
val_labels_price = labels_price[splt:]

# The texts are deliberately NOT tokenized in bulk here. The earlier version
# materialised the whole corpus as padded TF tensors that no later cell read;
# the per-example encoding (max_length=128) performed by `encode_example` is
# what actually feeds the model.

# Convert labels to tensors
train_labels_food = tf.convert_to_tensor(train_labels_food, dtype=tf.int32)
train_labels_ambience = tf.convert_to_tensor(train_labels_ambience, dtype=tf.int32)
train_labels_service = tf.convert_to_tensor(train_labels_service, dtype=tf.int32)
train_labels_price = tf.convert_to_tensor(train_labels_price, dtype=tf.int32)

val_labels_food = tf.convert_to_tensor(val_labels_food, dtype=tf.int32)
val_labels_ambience = tf.convert_to_tensor(val_labels_ambience, dtype=tf.int32)
val_labels_service = tf.convert_to_tensor(val_labels_service, dtype=tf.int32)
val_labels_price = tf.convert_to_tensor(val_labels_price, dtype=tf.int32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# Custom model definition
class CustomDistilBERTModel(tf.keras.Model):
    """Multi-output classifier built on top of a pretrained encoder.

    The encoder's hidden state at sequence position 0 is fed to four separate
    heads (food, ambience, service, price). Every head ends in a 3-unit
    softmax layer, so each output is a probability distribution over 3 classes.
    """

    def __init__(self, pretrained_model):
        """Keep a reference to the encoder and create the layers of each head.

        Args:
            pretrained_model: Callable model applied to the tokenized inputs;
                its result must expose a `last_hidden_state` attribute.
        """
        super().__init__()
        weight_penalty = l2(0.01)  # shared by the regularized first layers below
        self.pretrained_model = pretrained_model

        # Food head: the deepest of the four heads.
        self.dense_food = tf.keras.layers.Dense(
            64, activation='relu', kernel_regularizer=weight_penalty
        )
        self.dropout_food = tf.keras.layers.Dropout(0.2)
        self.dense_food2 = tf.keras.layers.Dense(128, activation='relu')
        self.dropout_food2 = tf.keras.layers.Dropout(0.2)
        self.dense_food3 = tf.keras.layers.Dense(32, activation='relu')
        self.dense_food4 = tf.keras.layers.Dense(16, activation='relu')
        # Assuming 3 classes: positive, neutral, negative
        self.output_food = tf.keras.layers.Dense(3, activation='softmax')

        # Ambience head: one regularized hidden layer, dropout, softmax.
        self.dense_ambience = tf.keras.layers.Dense(
            128, activation='relu', kernel_regularizer=weight_penalty
        )
        self.dropout_ambience = tf.keras.layers.Dropout(0.3)
        self.output_ambience = tf.keras.layers.Dense(3, activation='softmax')

        # Service head: one hidden layer (no regularizer), dropout, softmax.
        self.dense_service = tf.keras.layers.Dense(128, activation='relu')
        self.dropout_service = tf.keras.layers.Dropout(0.3)
        self.output_service = tf.keras.layers.Dense(3, activation='softmax')

        # Price head: same layout as the service head.
        self.dense_price = tf.keras.layers.Dense(128, activation='relu')
        self.dropout_price = tf.keras.layers.Dropout(0.3)
        self.output_price = tf.keras.layers.Dense(3, activation='softmax')

    def call(self, inputs):
        """Run the encoder once and apply every head to its position-0 state.

        Args:
            inputs: Whatever the wrapped encoder accepts (passed through as is).

        Returns:
            Dict with keys 'food', 'ambience', 'service' and 'price', each
            holding the softmax output of the corresponding head.
        """
        encoder_outputs = self.pretrained_model(inputs)
        # Slice out the hidden state of the first sequence position.
        first_token_state = encoder_outputs.last_hidden_state[:, 0, :]

        # Food head: apply its layers one after another, in definition order.
        food = first_token_state
        for layer in (
            self.dense_food,
            self.dropout_food,
            self.dense_food2,
            self.dropout_food2,
            self.dense_food3,
            self.dense_food4,
            self.output_food,
        ):
            food = layer(food)

        ambience = self.output_ambience(
            self.dropout_ambience(self.dense_ambience(first_token_state))
        )

        service = self.output_service(
            self.dropout_service(self.dense_service(first_token_state))
        )

        price = self.output_price(
            self.dropout_price(self.dense_price(first_token_state))
        )

        return {'food': food, 'ambience': ambience, 'service': service, 'price': price}

# Load the pre-trained DistilBERT weights and leave them trainable so the
# encoder is fine-tuned together with the heads.
pretrained_model = TFAutoModel.from_pretrained(pretrained_model_name)
pretrained_model.trainable = True

# Wrap the encoder with the four classification heads.
custom_model = CustomDistilBERTModel(pretrained_model)

# Compile the model: every head gets its own sparse categorical cross-entropy
# loss (the heads already output probabilities, hence from_logits=False) and
# reports accuracy.
custom_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss={
        head: tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        for head in ('food', 'ambience', 'service', 'price')
    },
    metrics={
        head: 'accuracy'
        for head in ('food', 'ambience', 'service', 'price')
    },
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [5]:
# Create a TensorFlow dataset
def encode_example(text, food_label, ambience_label, service_label, price_label):
    """Tokenize one review and pair it with its four aspect labels.

    The text is padded/truncated to exactly 128 tokens. The tokenizer returns
    tensors with a leading batch axis of size 1, which is stripped here.

    Args:
        text: Review text to tokenize.
        food_label, ambience_label, service_label, price_label: Labels for the
            four aspects; passed through unchanged.

    Returns:
        A `(features, targets)` tuple: `features` maps each tokenizer output
        key to its first (only) row, `targets` maps 'food', 'ambience',
        'service' and 'price' to the given labels.
    """
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="tf",
    )

    # Drop the batch axis from every tokenizer output.
    features = {}
    for key, val in encoding.items():
        features[key] = val[0]

    targets = {
        'food': food_label,
        'ambience': ambience_label,
        'service': service_label,
        'price': price_label,
    }
    return features, targets

# Apply the encoding to training data: zip yields one
# (text, food, ambience, service, price) tuple per example, which is unpacked
# straight into encode_example.
train_encoded_data = [
    encode_example(*example)
    for example in zip(
        train_texts,
        train_labels_food,
        train_labels_ambience,
        train_labels_service,
        train_labels_price,
    )
]

# Apply the encoding to validation data
val_encoded_data = [
    encode_example(*example)
    for example in zip(
        val_texts,
        val_labels_food,
        val_labels_ambience,
        val_labels_service,
        val_labels_price,
    )
]

In [6]:
# Element layout shared by both datasets: a features dict holding two
# length-128 int32 vectors and a labels dict holding four int32 scalars.
example_signature = (
    {
        feature: tf.TensorSpec(shape=(128,), dtype=tf.int32)
        for feature in ('input_ids', 'attention_mask')
    },
    {
        head: tf.TensorSpec(shape=(), dtype=tf.int32)
        for head in ('food', 'ambience', 'service', 'price')
    },
)

# Create training dataset
train_dataset = tf.data.Dataset.from_generator(
    lambda: train_encoded_data,
    output_signature=example_signature,
)

# Create validation dataset
val_dataset = tf.data.Dataset.from_generator(
    lambda: val_encoded_data,
    output_signature=example_signature,
)

# Batch the datasets; only the training data is shuffled, with a buffer as
# large as the training set.
train_dataset = train_dataset.shuffle(len(train_texts)).batch(32)
val_dataset = val_dataset.batch(32)

In [7]:
# Fine-tune for 5 epochs, evaluating on the validation dataset after each one.
# The returned History object is the cell's displayed result.
custom_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78c120918880>

In [None]:
pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.20.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-decision-forests>=1.5.0 (from tensorflowjs)
  Downloading tensorflow_decision_forests-1.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
Collecting packaging~=23.1 (from tensorflowjs)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<3,>=2.13.0 (from tensorflowjs)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[

In [15]:
# Write the trained model to disk; this path is later handed to
# tensorflowjs_converter as its tf_saved_model input.
custom_model.save(filepath='/content/custom_model_fix_3')



In [None]:
# Diagnostic: list every package installed in the runtime with its version.
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.9.5
aiohttp-cors                     0.7.0
aiosignal                        1.3.1
alabaster                        0.7.16
albumentations                   1.3.1
altair                           4.2.2
annotated-types                  0.7.0
anyio                            3.7.1
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array_record                     0.5.1
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.1.0
attrs                            23.2.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.15.0
backcall                         0.2.0
beautifulsoup4                   4.12.3


In [19]:
pip install tensorflowjs --no-deps

Collecting tensorflowjs
  Downloading tensorflowjs-4.20.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m81.9/89.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflowjs
Successfully installed tensorflowjs-4.20.0


In [29]:
pip install tensorflowjs==4.19.0

Collecting tensorflowjs==4.19.0
  Downloading tensorflowjs-4.19.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflowjs
  Attempting uninstall: tensorflowjs
    Found existing installation: tensorflowjs 4.17.0
    Uninstalling tensorflowjs-4.17.0:
      Successfully uninstalled tensorflowjs-4.17.0
Successfully installed tensorflowjs-4.19.0


In [32]:
!tensorflowjs_converter --input_format=tf_saved_model --skip_op_check=SKIP_OP_CHECK /content/custom_model_fix_3 /content/tfjs_model_2

usage: TensorFlow.js model converters. [-h]
                                       [--input_format {tf_hub,keras_saved_model,tf_frozen_model,tf_saved_model,tfjs_layers_model,keras,keras_keras}]
                                       [--output_format {tfjs_graph_model,keras_saved_model,tfjs_layers_model,keras,keras_keras}]
                                       [--signature_name SIGNATURE_NAME]
                                       [--saved_model_tags SAVED_MODEL_TAGS]
                                       [--quantize_float16 [QUANTIZE_FLOAT16]]
                                       [--quantize_uint8 [QUANTIZE_UINT8]]
                                       [--quantize_uint16 [QUANTIZE_UINT16]]
                                       [--quantization_bytes {1,2}] [--split_weights_by_layer]
                                       [--version] [--skip_op_check]
                                       [--strip_debug_ops STRIP_DEBUG_OPS]
                                       [--use_structured

In [25]:
pip show tensorflow

Name: tensorflow
Version: 2.16.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, requests, setuptools, six, tensorboard, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: dopamine_rl, tensorflow_decision_forests, tensorflowjs, tf_keras


In [42]:
import shutil

# Zip the contents of the converted-model directory into tfjs_model_fix.zip
# (created in the current working directory); the archive path is the cell's
# displayed result.
shutil.make_archive(
    base_name='tfjs_model_fix',
    format='zip',
    root_dir='/content/tfjs_model_fix',
)

'/content/tfjs_model_fix.zip'

In [43]:
from google.colab import files

# Send the zipped TF.js model to the browser as a download (Colab only).
files.download(filename='tfjs_model_fix.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
from google.colab import files

# Send custom_model_fix_2.zip to the browser as a download (Colab only).
# NOTE(review): no cell visible in this notebook creates this archive -
# confirm it exists in the runtime before running, or remove this cell.
files.download(filename='custom_model_fix_2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
import shutil

from google.colab import files

# The model was saved as a directory (/content/custom_model_fix_3), so build
# the zip here before downloading it; otherwise the download only works when a
# leftover archive from an earlier session happens to exist.
shutil.make_archive('custom_model_fix_3', 'zip', '/content/custom_model_fix_3')
files.download('custom_model_fix_3.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Smoke test: classify one hand-written review with the trained model.
inpt = "The food is good, the ambience is bad. But the service is great. The price is affordable"

# Encode exactly like the training examples (fixed length of 128 tokens).
test_enc = tokenizer(
    [inpt],
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors="tf",
)
# Turn the tokenizer output into a plain dict before handing it to Keras.
test_enc = dict(test_enc.items())
predictions = custom_model.predict(test_enc)
print(predictions)

{'food': array([[6.1734580e-05, 5.9635367e-04, 9.9934191e-01]], dtype=float32), 'ambience': array([[0.9163765 , 0.08138057, 0.00224305]], dtype=float32), 'service': array([[5.5761153e-05, 5.0579634e-05, 9.9989367e-01]], dtype=float32), 'price': array([[2.0274581e-06, 2.1954563e-04, 9.9977845e-01]], dtype=float32)}


In [33]:
pip install tensorflowjs==4.19.0



In [40]:
import os

# Set TF_ENABLE_ONEDNN_OPTS to '0' in this process's environment; commands
# launched from the notebook afterwards (such as the converter) inherit it.
os.environ.update({'TF_ENABLE_ONEDNN_OPTS': '0'})

In [41]:

# Convert the model saved at /content/custom_model_fix_3 (read as a TensorFlow
# SavedModel) and write the converter's output to /content/tfjs_model_fix.
!tensorflowjs_converter \
    --input_format=tf_saved_model \
    /content/custom_model_fix_3 \
    /content/tfjs_model_fix

2024-06-19 10:00:27.589799: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
