In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Dec 31 17:28:47 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.07              Driver Version: 546.12       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4070        On  | 00000000:01:00.0  On |                  N/A |
|  0%   51C    P5              12W / 200W |   1094MiB / 12282MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    GPT2TokenizerFast,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import torch
import hopsworks
import numpy as np
import evaluate

In [2]:
# Tokenizer used only to decode the stored token ids back into text (see
# get_decoding). Loading this checkpoint through GPT2TokenizerFast emits a
# class-mismatch warning because the checkpoint declares 'GPT4Tokenizer'.
# NOTE(review): presumably this is the intended way to load the checkpoint —
# confirm against its model card.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/text-embedding-ada-002')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT4Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.


In [3]:
def get_decoding(dataset, embedding_object):
    """Return a copy of ``dataset`` with ``embeddings`` decoded into ``text``.

    Args:
        dataset: pandas DataFrame containing an ``"embeddings"`` column
            (presumably token-id sequences — verify against the feature group).
        embedding_object: any object exposing ``decode(item)``; it is called
            once per entry of the ``"embeddings"`` column.

    Returns:
        A new DataFrame with a ``"text"`` column holding the decoded values and
        the ``"embeddings"`` column removed. The input frame is not modified.
    """
    texts = [embedding_object.decode(item) for item in dataset["embeddings"]]
    return dataset.copy().assign(text=texts).drop(columns=["embeddings"])

In [4]:
# Authenticate against Hopsworks and grab the project's feature store handle.
# NOTE: login() may prompt interactively (here it asked which project to
# access), so this cell can block an unattended "Run All".
hopsworks_project = hopsworks.login() 
fs = hopsworks_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Multiple projects found. 

	 (1) id2223labs
	 (2) calinp

Enter project to access: 1

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/197784
Connected. Call `.close()` to terminate connection gracefully.


In [5]:
# Look up version 1 of the train and test sentiment feature groups.
# NOTE(review): get_or_create_* is used although the groups are only read
# here; presumably both already exist — confirm, or a missing group would be
# handled as "create" instead of failing fast.
training_fg = fs.get_or_create_feature_group("news_sentiment_traindata", version=1)
test_fg = fs.get_or_create_feature_group("news_sentiment_testdata", version=1)
# Read the full contents of each feature group into memory.
training_features = training_fg.read()
testing_features = test_fg.read()

Reading data from Hopsworks, using ArrowFlight.   

W20231231 18:00:31.720172 19619 status.cc:137] DoAction result was not fully consumed: Cancelled: Flight cancelled call, with message: CANCELLED. Detail: Cancelled


Finished: Reading data from Hopsworks, using ArrowFlight (2.64s) 
Finished: Reading data from Hopsworks, using ArrowFlight (1.29s) 


In [6]:
# Decode the stored "embeddings" column of each split back into a "text"
# column using the tokenizer loaded above; get_decoding returns new frames.
training_data = get_decoding(training_features, tokenizer)
testing_data = get_decoding(testing_features, tokenizer)

In [7]:
# Print a preview of the decoded training frame (pandas elides the middle
# rows); this shows the label/text columns and the total row count.
print(training_data)

       label                                               text
0          1  This S&P 500 comeback stock could ride higher ...
1          0  Study Finds Most Packaged Foods Contain Danger...
2          2  "1/3 of all women in the world have experience...
3          2  Top Earnings Tu 11/26 Pre: $ADI $AMWD $ANF $BB...
4          0  About half of U.S. small businesses haven't pa...
...      ...                                                ...
12474      2  ... - December 22nd, 2023 (Trade Strategy For ...
12475      1  2 Red-Hot Artificial Intelligence (AI) Stocks ...
12476      1  Google paying $700M to settle antitrust allega...
12477      2  Google Allows More App Payment Options in Anti...
12478      0  Bull of the Day: Amazon.com, Inc. (AMZN). Amaz...

[12479 rows x 2 columns]


In [8]:
# Tokenizer for the same checkpoint name that model_init loads.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with ``bert_tokenizer``.

    Inputs are truncated and padded with the ``"max_length"`` strategy, so all
    encoded sequences come out the same length. Intended for use with
    ``Dataset.map`` (batched or not).
    """
    texts = examples["text"]
    return bert_tokenizer(texts, padding="max_length", truncation=True)


In [9]:
# Convert pandas dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(training_data)
test_dataset = Dataset.from_pandas(testing_data)

# Combine datasets into a DatasetDict
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [10]:
# Apply tokenize_function to both splits; batched=True hands the function a
# batch of examples per call instead of a single row.
tokenized_datasets = datasets.map(tokenize_function, batched=True)



Map:   0%|          | 0/12479 [00:00<?, ? examples/s]

Map:   0%|          | 0/3122 [00:00<?, ? examples/s]

In [13]:
# Show the raw text of training row 50. Index the row first and then the
# column: column-first indexing (['text'][50]) loads the entire text column
# just to read a single value, while row-first fetches only that row.
tokenized_datasets['train'][50]['text']

"Most Americans Think They Won't Need Social Security. They're Wrong."

In [14]:
def get_compute_metrics(metric):
    """Build a metrics callback bound to ``metric``.

    Args:
        metric: object exposing ``compute(predictions=..., references=...)``.

    Returns:
        A function that takes an ``eval_pred`` pair ``(logits, labels)``,
        converts the logits to class ids with an argmax over the last axis,
        and returns whatever ``metric.compute`` returns for those predictions.
    """
    def compute_metrics(eval_pred):
        scores, references = eval_pred
        # The highest-scoring class along the last axis is the prediction.
        predicted_ids = np.argmax(scores, axis=-1)
        return metric.compute(predictions=predicted_ids, references=references)

    return compute_metrics

In [15]:
# Class index -> sentiment name, as passed to the model config in model_init.
id2label = {
    0: "Negative",
    1: "Positive",
    2: "Neutral",
}
# Reverse lookup derived from id2label so the two mappings cannot drift apart.
label2id = dict(zip(id2label.values(), id2label.keys()))

def model_init():
    """Load ``bert-base-cased`` with a three-label sequence-classification head.

    The label mappings ``id2label`` / ``label2id`` are stored in the model
    config. This function is handed to ``Trainer`` as ``model_init``.
    """
    head_config = dict(
        return_dict=True,
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
    )
    return AutoModelForSequenceClassification.from_pretrained('bert-base-cased', **head_config)

# Accuracy is both the reported metric and the criterion for picking the best
# checkpoint (see metric_for_best_model below).
metric = evaluate.load("accuracy")
compute_metrics = get_compute_metrics(metric)

training_args = TrainingArguments(
    output_dir="bert_sentiment_trainer",
    # Training length is governed by max_steps. The previous
    # num_train_epochs=8 was removed: a positive max_steps overrides it, so it
    # had no effect (the recorded run stopped at step 3000, epoch ~3.85).
    max_steps=3000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    learning_rate=2.754984679344267e-05,
    lr_scheduler_type='constant_with_warmup',
    warmup_steps=50,
    seed=42,
    fp16=False,
    # Evaluate and checkpoint on the same 250-step cadence so that
    # load_best_model_at_end can match each evaluation to a saved checkpoint.
    evaluation_strategy="steps",
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=3,
    logging_steps=25,
    report_to=["tensorboard"],
    # Reload the checkpoint with the highest evaluation accuracy once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Shuffle each tokenized split with the same fixed seed (train first, then test).
tokenized_train_dataset, tokenized_test_dataset = (
    tokenized_datasets[split].shuffle(seed=55) for split in ("train", "test")
)

# The model is built by the trainer through model_init rather than passed in
# directly; the test split doubles as the evaluation set during training.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Run fine-tuning with the configured TrainingArguments; the returned
# TrainOutput (global step, mean training loss, runtime metrics) is displayed
# as the cell result.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
250,0.4787,0.444748,0.83376
500,0.374,0.383488,0.861307
750,0.3681,0.35326,0.869635
1000,0.2456,0.44249,0.870596
1250,0.2092,0.439329,0.868354
1500,0.2827,0.381386,0.881166
1750,0.1424,0.558913,0.87508
2000,0.1791,0.586923,0.881807
2250,0.1297,0.574099,0.874439
2500,0.0474,0.632827,0.883408


TrainOutput(global_step=3000, training_loss=0.254610068321228, metrics={'train_runtime': 2876.0624, 'train_samples_per_second': 16.689, 'train_steps_per_second': 1.043, 'total_flos': 1.2628654710690816e+16, 'train_loss': 0.254610068321228, 'epoch': 3.85})

In [31]:
# Upload the trained model and training artifacts to the Hugging Face Hub;
# the returned repository URL is displayed as the cell result.
# NOTE(review): the comparison cell loads "Artanis1551/bert_sentiment_trainer"
# as the "old" model. If this push runs first, both sides of that comparison
# are the same weights — confirm the intended execution order.
trainer.push_to_hub()



model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

events.out.tfevents.1703758864.ArtanisPC.7175.14:   0%|          | 0.00/27.5k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

'https://huggingface.co/Artanis1551/bert_sentiment_trainer/tree/main/'

In [18]:
from transformers import pipeline
import json  # no longer used in this cell; kept in case other cells rely on it

# "Old" model: whatever is currently published on the Hub under this repo id.
pipe = pipeline("text-classification", model="Artanis1551/bert_sentiment_trainer")
results = pipe(list(testing_data["text"]))

# The pipeline already returns a list of {"label": ..., "score": ...} dicts,
# which pandas accepts directly. The previous str() -> quote-replace ->
# json.loads round-trip would break on any label containing a quote character
# or any value whose repr is not valid JSON.
predictions = pd.DataFrame(results)

# Map label names back to class ids using the published model's own config.
predicted_labels = [pipe.model.config.label2id[x] for x in predictions['label']]

# testing_data and test_dataset are in the same (unshuffled) row order, so
# predictions and references line up position by position.
old_accuracy = metric.compute(predictions=predicted_labels, references=test_dataset['label'])['accuracy']
print("old model metric = " + str(old_accuracy))

# "New" model: the one held by the trainer in this session.
new_accuracy = trainer.predict(tokenized_test_dataset).metrics["test_accuracy"]
print("new model metric = " + str(new_accuracy))

old model metric = 0.8856502242152466


new model metric = 0.8856502242152466
