In [24]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification, create_optimizer
import tensorflow as tf
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize
import re

In [25]:
# Define and read the CSV file
file_path = '/Users/pablonieuwenhuys/EatzAI/training/restaurant_reviews.csv'
data = pd.read_csv(file_path, delimiter=';', quotechar='"', on_bad_lines = "skip")

In [26]:
# Strip any leading or trailing spaces from column names
data.columns = data.columns.str.strip()

In [27]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pablonieuwenhuys/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [29]:
data['Review'] = data['Review'].apply(preprocess_text)

data['Sentences'] = data['Review'].apply(sent_tokenize)

In [30]:
# Flatten the list of sentences and create a corresponding list of labels (optional)
sentence_data = []
sentence_review_indices = []

for idx, sentences in enumerate(data['Sentences']):
    for sentence in sentences:
        sentence_data.append(sentence)
        sentence_review_indices.append(idx)


In [31]:
# Ensure all entries in the 'Review' column are strings
data['Review'] = data['Review'].astype(str)
data['count'] = data['Review'].apply(lambda x: len(x.split()))

In [32]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [33]:
# Tokenize the sentences using DistilBERT tokenizer
sentence_encodings = tokenizer(sentence_data, truncation=True, padding=True)

In [34]:
# Load the model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [35]:
# Encode labels as numerical values
data['encoded_text'] = data['Label'].astype('category').cat.codes

In [36]:
# Split the data into training and validation sets
data_texts = data['Review'].to_list()
data_labels = data['encoded_text'].to_list()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts, data_labels, test_size=0.2, random_state=0)


In [37]:
# Tokenize the training and validation texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


In [38]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))


In [39]:
# Define the optimizer
num_train_steps = len(train_dataset) * 7  # Assuming 7 epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=500
)




In [40]:
# Compile the model (only once)
model.compile(
    optimizer=optimizer, 
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    metrics=['accuracy']
)

In [41]:
tf.config.run_functions_eagerly(True)

In [42]:
# Train the model
history = model.fit(train_dataset.batch(16), epochs=7, validation_data=val_dataset.batch(64))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [47]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(val_dataset.batch(64))
y_pred_classes = tf.argmax(y_pred.logits, axis=1)
print(confusion_matrix(val_labels, y_pred_classes))
print(classification_report(val_labels, y_pred_classes))


[[0 0 1]
 [0 3 1]
 [0 0 1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      0.75      0.86         4
           2       0.33      1.00      0.50         1

    accuracy                           0.67         6
   macro avg       0.44      0.58      0.45         6
weighted avg       0.72      0.67      0.65         6



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
