In [None]:
! pip install transformers datasets
! pip install datasets



In [None]:
import os
import pandas as pd

# NOTE: run this notebook with NumPy 1.x so that it integrates smoothly with the
# Hugging Face libraries; the NumPy version must also be compatible with the
# installed TensorFlow build.
import numpy as np
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from datasets import load_dataset, Dataset
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
import tensorflow as tf
from transformers import pipeline

In [None]:
from sklearn.metrics import accuracy_score

# Toy sanity check of the metric: exactly 3 of the 7 positions below agree
# (the last three), so the displayed accuracy should be 3/7.
y_true = [0, 1, 2, 3, 1, 2, 3]
y_pred = [3, 2, 1, 0, 1, 2, 3]
accuracy_score(y_true=y_true, y_pred=y_pred)

0.42857142857142855

In [None]:
# Read the three splits from disk in order (train, val, test) and discard the
# "created_at" column from each of them.
_split_files = ("train.csv", "val.csv", "test.csv")
train, val, test = (
    pd.read_csv(csv_path).drop(columns="created_at") for csv_path in _split_files
)
# Show the test frame as a quick visual check of the loaded columns
test

Unnamed: 0,id,full_text,score
0,10706,The first informative #App on Pivot Points.\r\...,0.3
1,10707,We're still looking for someone else who can b...,-0.2
2,10708,Politics And The Markets 4/9/20. https://t.co/...,-0.1
3,10709,Bitcoin following similar price pattern to Ama...,-0.1
4,10710,Wall Street Breakfast: OPEC Virtual Meeting In...,0.3
...,...,...,...
1858,12587,RT @PeterLBrandt: $SPX $ES_F \r\nFollowing thi...,-0.2
1859,12588,RT @vieiraUAE: Fearless Alex Vieira Calls Best...,0.0
1860,12589,$spy $spx $qqq $ndx #nyse going from poking th...,0.2
1861,12590,RT @DavidScottAdams: On watch tomorrow // Pt. ...,-0.1


In [None]:
# We would like to use the pre-trained BERT model to classify our text because
# it is bidirectional by design: it reads text in both directions to build a
# richer sense of context. We think this matters because, as we learned,
# classifying the sentiment of text depends heavily on context.
# BERT has also shown state-of-the-art results on text classification tasks,
# which makes it an appealing model for our emotion classification purposes.
# Since BERT is pre-trained, it already has a deep understanding of language
# structure, so we can fine-tune it on our labeled dataset to capture the
# specific nuances of our task.

In [None]:
# Prepare the inputs for BERT: load the tokenizer of the pre-trained checkpoint and wrap each
# pandas split in a Hugging Face Dataset. The BERT tokenizer output replaces our previous
# histogram vectors as the text representation.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
train_ds, val_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train, val, test))

def tokens(ds):
    """Tokenize the ``full_text`` field of ``ds`` with the notebook-level ``tokenizer``.

    Intended as the callback for ``Dataset.map(..., batched=True)``, so ``ds`` is
    whatever batch object ``map`` hands over; only its ``"full_text"`` entry is read.
    Truncation is enabled so over-long texts are cut down by the tokenizer.

    Returns the tokenizer's output unchanged.
    """
    texts = ds["full_text"]
    return tokenizer(texts, truncation=True)
# Apply the tokenizer callback to every split, batch-wise, in train / val / test order
tokenized_train, tokenized_val, tokenized_test = (
    split_ds.map(tokens, batched=True) for split_ds in (train_ds, val_ds, test_ds)
)

# Collator that pads the tokenized examples of a batch and emits TensorFlow tensors
data_collator = DataCollatorWithPadding(return_tensors="tf", tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning hyperparameters
init_lr = 1e-5
batch_size = 4
num_epochs = 5

# Total number of optimizer steps = (full batches per epoch) * epochs.
# Integer division is used so that a trailing partial batch is not counted: the shuffled training
# pipeline built by `prepare_tf_dataset` is expected to drop it (drop_remainder follows `shuffle`
# by default — confirm against the installed transformers version).
train_steps = (len(tokenized_train) // batch_size) * num_epochs

# `create_optimizer` returns a pair; only the optimizer is kept here, the second element is unused.
optimizer, _ = create_optimizer(init_lr=init_lr, num_warmup_steps=0, num_train_steps=train_steps)


# Load pre-trained BERT with a sequence-classification head sized for 28 output classes.
# NOTE(review): 28 is presumably the number of distinct labels in our dataset — confirm.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=28)

# Turn each tokenized split into a batched dataset the model can consume. All three share the
# batch size and the padding collator; only the evaluation splits disable shuffling so that
# their predictions stay aligned with the row order of the source frames.
_tf_dataset_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)
train_set = model.prepare_tf_dataset(tokenized_train, **_tf_dataset_kwargs)
val_set = model.prepare_tf_dataset(tokenized_val, shuffle=False, **_tf_dataset_kwargs)
test_set = model.prepare_tf_dataset(tokenized_test, shuffle=False, **_tf_dataset_kwargs)

# Hand our Adam optimizer to the BERT model. No loss is passed to compile().
# NOTE(review): this relies on the model supplying its own loss, which needs a label column to
# reach the model — verify which dataset column provides it.
model.compile(optimizer=optimizer)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model.
# Run for `num_epochs` epochs so the training length matches the step budget (`train_steps`)
# that the optimizer's learning-rate schedule was built with; without `epochs=...` Keras trains
# for a single epoch only. The validation set is evaluated after every epoch for monitoring.
model.fit(x=train_set, validation_data=val_set, epochs=num_epochs)



<tf_keras.src.callbacks.History at 0x7a9b1b45f040>

In [None]:
# Run the fine-tuned model over the (unshuffled) validation batches and keep the raw logits
val_output = model.predict(val_set)
preds = val_output["logits"]
# Predicted class per example = index of the largest logit along axis 1
class_preds = np.argmax(preds, axis=1)



In [None]:
# Validation accuracy: compare the "score" column with the predicted class indices.
# NOTE(review): assumes val["score"] holds integer class ids comparable to argmax output — confirm.
accuracy_score(y_true=val["score"], y_pred=class_preds)

0.736

In [None]:
# Run the fine-tuned model over the (unshuffled) test batches and keep the raw logits.
# Note that this overwrites the `preds` / `class_preds` computed for the validation split.
test_output = model.predict(test_set)
preds = test_output["logits"]
# Predicted class per example = index of the largest logit along axis 1
class_preds = np.argmax(preds, axis=1)



In [None]:
# Test accuracy: compare the "score" column with the predicted class indices.
# NOTE(review): assumes test["score"] holds integer class ids comparable to argmax output — confirm.
accuracy_score(y_true=test["score"], y_pred=class_preds)

0.752
