In [2]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification, create_optimizer
import tensorflow as tf
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize
import re
import json
import numpy as np
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.preprocessing import MultiLabelBinarizer

# Load the Label Studio export. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
with open('/Users/pablonieuwenhuys/EatzAI/label_studio_review_sentences.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Raw task table, one row per exported task (rebuilt later from the cleaned sentences)
df = pd.DataFrame(data)

# Extract each sentence together with the choices of its first annotation.
# Tasks with no annotation, an empty result list, or no 'choices' value are
# skipped, because indexing [0] on them would raise.
sentences = []
labels = []
skipped_tasks = 0

for item in data:
    annotations = item.get('annotations') or []
    results = (annotations[0].get('result') or []) if annotations else []
    choices = results[0].get('value', {}).get('choices') if results else None
    if not choices:
        skipped_tasks += 1
        continue
    sentences.append(item['data']['text'])
    labels.append(choices)

if skipped_tasks:
    print(f"Skipped {skipped_tasks} task(s) without a usable 'choices' annotation.")

# Text normalisation applied to every sentence before tokenization.
# The pattern matches any character that is neither a word character nor whitespace.
_NON_WORD_OR_SPACE = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Return *text* lowercased with punctuation stripped.

    Every character that is not a word character (letters, digits,
    underscore) and not whitespace is removed; whitespace itself is left
    untouched, so removing a symbol between two spaces leaves both spaces.
    """
    return _NON_WORD_OR_SPACE.sub('', text.lower())

# Make sure the NLTK 'punkt' tokenizer data is available locally
nltk.download('punkt')

# Normalise every sentence with preprocess_text, keeping the original order
sentences = list(map(preprocess_text, sentences))

# Turn each sentence's list of label names into a multi-hot row vector
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(labels)
label_names = mlb.classes_

# Working table: the cleaned sentence plus its full multi-hot vector
df = pd.DataFrame({'sentence': sentences, 'labels': labels_encoded.tolist()})

# One 0/1 indicator column per label, named after the label itself
# (presumably 'food', 'service', 'ambiance', 'none' — confirm against the data)
for label_name, label_column in zip(label_names, labels_encoded.T):
    df[label_name] = label_column

# Balance the dataset on the 'food' label by undersampling the larger side.
# The larger side is chosen by row count rather than assumed to be food == 1:
# sampling without replacement fails if asked for more rows than are available.
is_food = df['food'] == 1
df_food = df[is_food]
df_not_food = df[~is_food]

if len(df_food) >= len(df_not_food):
    df_majority, df_minority = df_food, df_not_food
else:
    df_majority, df_minority = df_not_food, df_food

if df_minority.empty:
    raise ValueError(
        "Cannot balance on 'food': every sentence falls on the same side of the label."
    )

# Downsample majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,               # sample without replacement
    n_samples=len(df_minority),  # match number of minority samples
    random_state=42,             # reproducibility
)

# Combine minority class with downsampled majority class, then shuffle so the
# two groups are interleaved
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# After undersampling, recreate the sentences and labels
sentences = df_downsampled['sentence'].tolist()
labels_encoded = df_downsampled[label_names].values

# Tokenize the sentences using DistilBERT tokenizer; padding=True pads every
# sequence to the longest one in the batch so the id lists are rectangular
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(sentences, truncation=True, padding=True)

# Extract input_ids and attention_mask
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']

# Check the lengths to ensure consistency (explicit raises: assert statements
# are stripped when Python runs with -O)
if len(input_ids) != len(labels_encoded):
    raise ValueError("Mismatch in number of sentences and labels.")
if len(attention_masks) != len(labels_encoded):
    raise ValueError("Mismatch between attention_mask and labels.")

# Split ids, masks and labels in ONE call so all three are partitioned with
# the same indices by construction, instead of relying on two separate calls
# happening to share a random_state.
(
    train_input_ids, val_input_ids,
    train_attention_masks, val_attention_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels_encoded, test_size=0.2, random_state=0
)

# Wrap each split as a tf.data.Dataset of (features dict, labels) pairs
train_dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': train_input_ids,
    'attention_mask': train_attention_masks
}, train_labels))

val_dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': val_input_ids,
    'attention_mask': val_attention_masks
}, val_labels))

# Load the pretrained model with one output logit per label
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_names))

# Training hyper-parameters, named once so the schedule and fit() cannot drift apart
EPOCHS = 7
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 64

train_batches = train_dataset.batch(TRAIN_BATCH_SIZE)
val_batches = val_dataset.batch(VAL_BATCH_SIZE)

# Define the optimizer.
# The learning-rate schedule advances once per optimizer step, i.e. once per
# BATCH, so the total is counted on the batched dataset (counting samples would
# stretch the decay over far more steps than training ever performs).
num_train_steps = len(train_batches) * EPOCHS
# Warmup must stay shorter than the whole run; cap it at 10% of the steps
# while keeping 500 as the upper bound for large datasets.
num_warmup_steps = min(500, num_train_steps // 10)
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps
)

# Compile the model.
# The model emits raw logits for independent labels, so accuracy is measured
# per label by thresholding the logit at 0.0 (equivalent to probability 0.5).
# The metric keeps the name 'accuracy' so the history keys stay the same.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
)

# NOTE(review): eager execution disables graph compilation and slows training;
# presumably a workaround for the Metal backend — confirm and remove if unneeded.
tf.config.run_functions_eagerly(True)

# Train the model
history = model.fit(train_batches, epochs=EPOCHS, validation_data=val_batches)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pablonieuwenhuys/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2024-08-12 15:52:10.560678: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-08-12 15:52:10.561121: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-12 15:52:10.561125: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-12 15:52:10.561746: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-12 15:52:10.562353: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some weights of the PyTorch model were not

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
