In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from keras.callbacks import EarlyStopping
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizerFast
from tensorflow.keras import mixed_precision

In [3]:
# Check for GPU
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-b319ef48-7c43-3b21-dde1-30b49a68d3f1)


In [None]:
# Enable mixed precision training
mixed_precision.set_global_policy('mixed_float16')

# Read in CSV Data for Twitter Sentiment Analysis
df = pd.read_csv("twitter_sentiment_data.csv")

# Get sentences and labels as dataframes
sentences = df["message"].to_numpy()
labels = df["sentiment"].to_numpy()

# Remove -1
labels[labels == -1] = 3

# One-hot encode the labels
num_classes = 4
labels = tf.keras.utils.to_categorical(labels, num_classes)

# Split into train and test
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=1)

# Initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(train_sentences), truncation=True, padding=True, max_length=64)
test_encodings = tokenizer(list(test_sentences), truncation=True, padding=True, max_length=64)

# Create datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))

# Initialize the model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_classes)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Create EarlyStopping instance
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Fit the model
model_history = model.fit(train_dataset.shuffle(1000).batch(32),
                          epochs=5,
                          validation_data=test_dataset.batch(32),
                          callbacks=[early_stopping])

# Evaluate the model on the test set
model_results = model.evaluate(test_dataset.batch(32))
print(f"Loss: {model_results[0]}, Accuracy: {model_results[1]}")

# Make Predictions
model_pred = model.predict(test_dataset.batch(128))

# Convert predictions to labels
model_pred_labels = np.argmax(model_pred.logits, axis=1)

# Print the confusion matrix
cm = confusion_matrix(np.argmax(test_labels, axis=1), model_pred_labels)
print(cm)

# Print the classification report
cr = classification_report(np.argmax(test_labels, axis=1), model_pred_labels)
print(cr)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/5
  71/1099 [>.............................] - ETA: 2:54 - loss: 1.0613 - accuracy: 0.5682