In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Read the Starbucks reviews CSV, downloadable from
# https://www.kaggle.com/datasets/harshalhonde/starbucks-reviews-dataset?resource=download
df = pd.read_csv('reviews_data.csv')
# The rest of this script relies on a 'reviews' text column and a 'stars'
# rating column.
# NOTE(review): confirm these column names against the CSV header.

# Binary sentiment target: ratings strictly above 3 become 1, all others 0
df['sentiment'] = (df['stars'] > 3).astype('int64')

# Tokenizer matching the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
def convert_example_to_feature(review, max_length=512):
    """Encode a single review into fixed-length BERT inputs.

    Relies on the module-level ``tokenizer``.

    Args:
        review: Text of one review.
        max_length: Length, in tokens, that every encoding is padded or
            truncated to. Defaults to 512, the value previously hard-coded.

    Returns:
        The result of ``tokenizer.encode_plus``; it is requested with an
        attention mask, so it exposes both ``'input_ids'`` and
        ``'attention_mask'``.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; request padding and
        # truncation explicitly so every encoding has exactly `max_length`
        # entries and the encodings can be stacked into a rectangular array.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,  # Return attention mask
    )

# Encode every review exactly once, then collect the token ids and attention
# masks into two NumPy arrays (one row per review). `np` must be imported at
# the top of the file for this to run.
encoded_reviews = [convert_example_to_feature(review) for review in df['reviews']]

input_ids = np.array([encoded['input_ids'] for encoded in encoded_reviews])
attention_masks = np.array([encoded['attention_mask'] for encoded in encoded_reviews])
# Binary sentiment labels, in the same row order as the reviews
labels = df['sentiment'].values

# Split the data: 90% training, 10% validation
from sklearn.model_selection import train_test_split

# A single call shuffles all three arrays with the same indices, so token ids,
# attention masks and labels stay row-aligned by construction instead of
# relying on two separate calls sharing the same random_state.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    random_state=2018,
    test_size=0.1,
)

# Pre-trained 'bert-base-uncased' sequence classifier configured for two labels
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Training configuration: the loss is told to expect logits (from_logits=True)
# and integer class labels; accuracy is tracked under the name 'accuracy'.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fit for two epochs, evaluating on the validation split.
# Inputs are passed as [token ids, attention masks].
train_features = [train_inputs, train_masks]
validation_features = [validation_inputs, validation_masks]

model.fit(
    train_features,
    train_labels,
    batch_size=32,
    epochs=2,
    validation_data=(validation_features, validation_labels),
)

# Note: This code is simplified for illustrative purposes and may require adjustments based on the specific details of the Starbucks Reviews Dataset, such as handling very large datasets or optimizing model parameters for better performance.


ModuleNotFoundError: No module named 'transformers'