In [None]:
import transformers
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import numpy as np
from tensorflow import Tensor
import pandas as pd
import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, accuracy_score

# --- Data and pre-trained checkpoint -----------------------------------------
# CSV file read with pandas in the data-loading cell (path is relative to the
# notebook's working directory).
DATA_PATH = r"../../data/transformed/amazon_reviews_5_partition_1.csv"

# Hugging Face checkpoint identifier handed to `from_pretrained` for both the
# classifier and its tokenizer.
BERT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

# --- Numeric settings --------------------------------------------------------
# NOTE(review): VOCAB_SIZE, SEQUENCE_LENGTH and EMBED_DIM are not referenced by
# any cell visible in this notebook; presumably kept for parity with sibling
# notebooks — confirm before removing.
VOCAB_SIZE = 15000
SEQUENCE_LENGTH = 100
EMBED_DIM = 8

# Seed value intended for reproducible runs.
SEED = 0

In [None]:
# Fetch (or load from the local cache) the tokenizer and the pre-trained
# sequence classifier that share the BERT_MODEL checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(BERT_MODEL)
model = TFDistilBertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=2,  # two output logits per review
)


In [None]:
# Small dense head that maps the two logits produced by the BERT classifier
# to a single scalar prediction.

# Seed Python, NumPy and the Keras backend before any layer is created so that
# weight initialisation (and the shuffling done later by `fit`) is repeatable
# under Restart Kernel -> Run All.
keras.utils.set_random_seed(SEED)

# The classifier's logits are floating point. Declaring the input as an
# integer dtype makes Keras either truncate them to whole numbers or reject
# them inside Dense (depending on the Keras version), so use float32.
inputs = keras.Input(shape=(2,), dtype="float32")
hidden = keras.layers.Dense(100, activation="relu")(inputs)
outputs = keras.layers.Dense(1)(hidden)  # no activation: raw scalar output
regressor = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Read the first 10 rows of the reviews CSV (first column becomes the index),
# discard every row containing a missing value, and let pandas convert the
# review text column to its best-fitting (string) extension dtype.
df = (
    pd.read_csv(DATA_PATH, index_col=0, nrows=10)
    .dropna()
    .assign(
        reviewText=lambda frame: frame["reviewText"].convert_dtypes(
            convert_string=True
        )
    )
)

In [None]:
# Features: tokenize every review, truncating over-long texts and padding the
# batch to a common length; the tokenizer output is turned into a plain dict
# of TensorFlow tensors so it can be passed straight to the model.
x = dict(
    tokenizer(
        df["reviewText"].to_list(),
        truncation=True,
        padding=True,
        return_tensors="tf",
    )
)

# Labels: 1 when the "overall" rating is above 3, otherwise 0.
y = (df["overall"] > 3).astype("int64")

In [None]:
# Compile the BERT classifier for two-class prediction.
# The model returns raw logits (they are read back via the "logits" key after
# `predict`), not softmax probabilities. The plain string loss
# "sparse_categorical_crossentropy" assumes probabilities, so the loss is
# built explicitly with from_logits=True to stay correct if the model is ever
# fitted or evaluated.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Run the BERT classifier over the tokenized reviews and keep only its logits;
# these become the input features of the regressor head.
bert_outputs = model.predict(x)
preds = bert_outputs["logits"]

In [None]:
# Compile the regressor head: Adam optimizer, mean-squared-error loss, and
# mean absolute error reported as an additional metric.
# `metrics` is passed as a list (matching the BERT compile call); a bare
# string is rejected by Keras versions that require a list, tuple or dict.
regressor.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"],
)

In [None]:
# Train the regressor head for 100 epochs, using the BERT logits as features
# and the 0/1 labels (converted to a NumPy array) as targets.
regressor.fit(x=preds, y=np.array(y), epochs=100)