In [None]:
!pip install transformers

import transformers
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, TFDistilBertModel
import numpy as np
from tensorflow import Tensor
import pandas as pd
import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, accuracy_score

# --- Notebook configuration -------------------------------------------------

# Hugging Face checkpoint name used for both the tokenizer and the encoder.
BERT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

# CSV with the reviews, relative to the notebook's working directory.
DATA_PATH = r"../../data/transformed/amazon_reviews_5_partition_1.csv"

# Random state handed to train_test_split so the split is repeatable.
SEED = 0

# NOTE(review): the three values below are not referenced by any cell visible
# in this notebook; kept in case other cells or copies rely on them.
VOCAB_SIZE = 15_000
SEQUENCE_LENGTH = 100
EMBED_DIM = 8

In [None]:
# Fetch the pre-trained tokenizer and encoder for the BERT_MODEL checkpoint.
# Both come from the same checkpoint name so the vocabulary matches the weights.
tokenizer = DistilBertTokenizerFast.from_pretrained(BERT_MODEL)
bert_model = TFDistilBertModel.from_pretrained(BERT_MODEL)

# Disabled alternative: load the checkpoint with its sequence-classification
# head instead of the bare encoder.
# model = TFDistilBertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=2)

In [None]:
# Regression head that is trained on the output of bert_model.
#
# bert_model is the bare TFDistilBertModel, so what arrives here are its
# floating-point hidden states (one 768-wide vector per token), not logits.
# The input is therefore declared as float32; an integer input dtype would
# make Keras treat those activations as integers.
#
# The token axis is left as None instead of a fixed length: the tokenizer pads
# to the longest review in whatever data was loaded, so the sequence length is
# data-dependent. Dense only needs the last axis (768) to be known.
#
# NOTE(review): Dense acts on the last axis only, so the output keeps the
# token axis, i.e. one value per token: (batch, tokens, 1). If a single score
# per review is intended, a pooling step (e.g. first token or mean over
# tokens) is needed before the Dense stack - confirm the intended design.
inputs = keras.Input(shape=(None, 768), dtype="float32")
hidden = keras.layers.Dense(768, activation="relu")(inputs)
hidden = keras.layers.Dense(32, activation="relu")(hidden)
outputs = keras.layers.Dense(1)(hidden)  # linear output for regression

regressor_model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Load the first 100000 rows of the review CSV (first column is the index),
# discard every row that has a missing value in any column, and run
# convert_dtypes(convert_string=True) on the "reviewText" column.
df = (
    pd.read_csv(DATA_PATH, index_col=0, nrows=100000)
    .dropna()
    .assign(
        reviewText=lambda frame: frame["reviewText"].convert_dtypes(convert_string=True)
    )
)

In [None]:
# Target: 1 when the "overall" rating is above 3, otherwise 0.
y = df["overall"].gt(3) * 1

# Features: tokenize every review text. Truncation and padding are enabled and
# TensorFlow tensors are requested; the tokenizer result is converted to a
# plain dict so it can be passed around as a regular mapping.
review_texts = df["reviewText"].to_list()
x = dict(
    tokenizer(
        review_texts,
        padding=True,
        truncation=True,
        return_tensors="tf",
    )
)

In [None]:
# Disabled: compile step for the BERT sequence-classification model (`model`), targeting binary predictions.
# model.compile(
#     optimizer="adam", 
#     loss="sparse_categorical_crossentropy", 
#     metrics=["accuracy"]
# )

In [None]:
# Show the layer / parameter summary of the loaded encoder as a sanity check.
bert_model.summary()

In [None]:
# Run the encoder once over all tokenized reviews; the result becomes the
# training input of the regressor head.
bert_outputs = bert_model.predict(x)
# [0] takes the first element of the encoder output.
# NOTE(review): for TFDistilBertModel this is presumably the last hidden state
# with shape (reviews, tokens, 768) - confirm. It is held fully in memory.
preds = bert_outputs[0]

In [None]:
# Hold out 20% of the (encoder output, label) pairs; SEED fixes the shuffle.
split_parts = train_test_split(preds, y, random_state=SEED, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Training setup for the regression head: Adam optimizer, mean squared error
# as the loss, mean absolute error reported as an extra metric.
regressor_compile_kwargs = {
    "optimizer": "adam",
    "loss": "mse",
    "metrics": ["mae"],
}
regressor_model.compile(**regressor_compile_kwargs)

In [None]:
# Pre-training: fit only the regression head on the pre-computed encoder
# outputs for 10 epochs, validating on the held-out split after each epoch.
regressor_validation = (x_test, y_test)
regressor_model.fit(
    x_train,
    y_train,
    validation_data=regressor_validation,
    epochs=10,
)

In [None]:
# Combine the encoder and the pre-trained regression head into one model for
# fine-tuning and prediction: bert_model runs first, regressor_model second.
encoder_then_head = [bert_model, regressor_model]
combined_model = keras.Sequential(encoder_then_head)

In [None]:
# Fine-tuning uses Adam with an explicitly chosen learning rate of 1e-5
# instead of the optimizer's default.
optimizer = keras.optimizers.Adam(learning_rate=1e-5)

# Same loss / metric as the regression head: MSE loss, MAE metric.
combined_model.compile(
    loss="mse",
    metrics=["mae"],
    optimizer=optimizer,
)

In [None]:
# Callbacks for the fine-tuning run.
#
# ModelCheckpoint needs a real target path: an empty filepath cannot be
# written to, so the run would fail when the first checkpoint is saved.
# Change CHECKPOINT_PATH to wherever the best checkpoint should live.
#
# Only the weights are stored (save_weights_only=True); restore them by
# rebuilding combined_model and calling load_weights(CHECKPOINT_PATH).
# NOTE(review): weights-only with a ".weights.h5" suffix was chosen as the
# most portable option across Keras versions - switch to a full-model save if
# that is what is needed and it works in this environment.
CHECKPOINT_PATH = "combined_model_best.weights.h5"

cbs = [
    # Keep only the best checkpoint seen so far (Keras monitors the
    # validation loss unless `monitor` is overridden).
    keras.callbacks.ModelCheckpoint(
        filepath=CHECKPOINT_PATH,
        save_best_only=True,
        save_weights_only=True,
    ),
    # Stop once the monitored value has not improved for 4 consecutive epochs.
    keras.callbacks.EarlyStopping(
        patience=4
    ),
]

In [None]:
# Fine-tune the encoder and the regression head end to end.
#
# combined_model starts with bert_model, so it has to be fed the tokenizer
# output held in `x` (token ids / attention mask). x_train / x_test contain the
# pre-computed encoder outputs and are only valid inputs for regressor_model.
#
# The row indices are split with the same test_size and SEED as the earlier
# split, then used to slice both the token tensors and the labels so that
# inputs and targets stay aligned.
train_idx, test_idx = train_test_split(
    np.arange(len(y)),
    test_size=0.2,
    random_state=SEED,
)

tokens_train = {name: tf.gather(tensor, train_idx) for name, tensor in x.items()}
tokens_test = {name: tf.gather(tensor, test_idx) for name, tensor in x.items()}

# Positional selection: y keeps the DataFrame index, which has gaps after
# dropna(), so label-based indexing would not line up with the tensor rows.
labels_train = y.iloc[train_idx].to_numpy()
labels_test = y.iloc[test_idx].to_numpy()

combined_model.fit(
    tokens_train,
    labels_train,
    validation_data=(tokens_test, labels_test),
    epochs=15,
    callbacks=cbs,
)