In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from keras.callbacks import EarlyStopping
from transformers import TFBertForSequenceClassification, BertTokenizer

In order for our deep learning models to run as fast as possible, we'll need access to a GPU.

In Google Colab, you can set this up by going to Runtime -> Change runtime type -> Hardware accelerator -> GPU.

After selecting GPU, you may have to restart the runtime.

In [None]:
# Check for GPU: list the GPU devices visible to this runtime.
# If no device is reported, switch the runtime's hardware accelerator to GPU
# before running the training cell below.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-ed9dbfe3-4978-7a74-6394-872e30b12e90)


In [None]:
# Fine-tune BERT for 4-class Twitter sentiment classification and evaluate it.
#
# Pipeline: load CSV -> encode labels -> stratified train/val/test split ->
# tokenize -> build tf.data pipelines -> fine-tune with early stopping ->
# report metrics on the held-out test set.

# Tunable settings
num_classes = 4
SEED = 1               # random_state for the splits and the shuffle buffer
MAX_LENGTH = 128       # maximum number of tokens kept per tweet
BATCH_SIZE = 8
LEARNING_RATE = 2e-5   # Adam's default (1e-3) is far too large for fine-tuning BERT
model_name = "bert-base-uncased"

# Read in CSV data for Twitter sentiment analysis
df = pd.read_csv("twitter_sentiment_data.csv")

# Check distribution of data
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Get sentences and integer class ids as arrays.
# Copy the label column so that remapping below cannot write through to `df`.
sentences = df["message"].astype(str).to_numpy()
class_ids = df["sentiment"].to_numpy(copy=True)

# Remap -1 to 3 so that all class ids are valid, non-negative one-hot indices
class_ids[class_ids == -1] = 3

# One-hot encode the labels
labels = tf.keras.utils.to_categorical(class_ids, num_classes)

# Split into train and test; stratify so every class keeps its share in both parts
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=SEED, stratify=class_ids)

# Carve a validation set out of the training data. Early stopping (and
# restore_best_weights) selects the model on this split, so the test set stays
# untouched until the final evaluation.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels, test_size=0.1, random_state=SEED,
    stratify=np.argmax(train_labels, axis=1))

# Calculate "balanced" class weights from the TRAINING labels only, so no
# information about the validation/test distribution leaks into training.
# A class with no training examples gets a neutral weight instead of dividing by zero.
class_counts = train_labels.sum(axis=0)
class_weights = {
    i: float(len(train_labels) / (num_classes * class_counts[i])) if class_counts[i] > 0 else 1.0
    for i in range(num_classes)
}

# Load the BERT tokenizer and model (the classification head is newly initialised)
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Tokenize train, validation and test sentences
train_encodings = tokenizer(list(train_sentences), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(val_sentences), truncation=True, padding=True, max_length=MAX_LENGTH)
test_encodings = tokenizer(list(test_sentences), truncation=True, padding=True, max_length=MAX_LENGTH)

# Convert tokenized data to TensorFlow datasets.
# Only the training set is shuffled (reshuffled every epoch); validation and
# test keep their order so predictions line up with `test_labels`.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(len(train_labels), seed=SEED)
    .batch(BATCH_SIZE)
)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(BATCH_SIZE)

# Compile the model. The model outputs raw logits, hence from_logits=True.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

# Create EarlyStopping instance
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Fit the model
history = model.fit(train_dataset,
                    epochs=10,
                    validation_data=val_dataset,
                    class_weight=class_weights,
                    callbacks=[early_stopping])

# Evaluate the model on the held-out test set
results = model.evaluate(test_dataset)
print(f"Loss: {results[0]}, Accuracy: {results[1]}")

# Make predictions
predictions = model.predict(test_dataset)

# Convert logits and one-hot targets to class ids
pred_labels = np.argmax(predictions.logits, axis=1)
true_labels = np.argmax(test_labels, axis=1)

# Print the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(true_labels, pred_labels)
print(cm)

# Print the classification report
cr = classification_report(true_labels, pred_labels)
print(cr)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10

KeyboardInterrupt: ignored