<a href="https://colab.research.google.com/github/ArjunNPatel/finbertuconn2024/blob/main/Fine_Tuned_Bert_Model_July_2024_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers --quiet
#!pip install tensorflow --quiet
!pip install evaluate --quiet
!pip install torch --quiet
!pip install huggingface_hub --quiet
!pip install tqdm boto3 requests regex sentencepiece sacremoses --quiet
from huggingface_hub import notebook_login
import numpy as np
import pandas as pd
from transformers import BertTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer, PreTrainedModel, PretrainedConfig
import evaluate
import pathlib, os, json
#import tensorflow as tf
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import tqdm, boto3, requests, regex, sentencepiece, sacremoses

# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably only required for the (currently disabled)
# push_to_hub call at the end of the notebook — confirm.
notebook_login()

In [None]:
# The fine-tuning corpus: economy-news headlines with a classification label.
import json
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded; import it explicitly so `urllib.request.urlopen` always resolves.
import urllib.request

url = "https://raw.githubusercontent.com/stocks-predictor/bert/master/BNEWS_DATA/datasetEconomyNews.json"
# Use a context manager so the HTTP response is closed once it has been read.
with urllib.request.urlopen(url) as response:
    rawdata = json.loads(response.read())

# Row 0 is a CSV-style header; it is split into column names when the
# DataFrame is built below.
data = [["text,label"]]

examplenum = 300          # total number of examples to keep
exampletypes = [0, 0, 0]  # running count of kept examples, one slot per class
for article in rawdata:
    text = article["headlineTitle"] + " " + article["headlineText"]
    label = article["classification"]
    # Keep the sample balanced: at most examplenum/3 examples per class.
    # NOTE(review): the +1 shift below suggests the raw classification may be
    # -1/0/1; a -1 would address the last counter slot through negative
    # indexing, which still yields one counter per class — confirm.
    if exampletypes[label] >= examplenum / 3:
        continue
    exampletypes[label] += 1
    # Shift the stored label up by one relative to the raw classification.
    data.append([text, label + 1])


df = pd.DataFrame(data[1:], columns=data[0][0].split(","))
# Mount Google Drive at /content/gdrive so the notebook can read and write
# files stored there.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/Machine_Learning_Finance_UCONN_Stamford_2024/Models
# Shuffle the corpus, then hold out the last third as the test split.
df = df.sample(frac=1).reset_index(drop=True)
split_at = int(len(df) * 2 / 3)
df1 = df.iloc[:split_at].sample(frac=1).reset_index(drop=True)  # train
df2 = df.iloc[split_at:].reset_index(drop=True)                 # test

# The DataFrame index is written as an unnamed first column; the loading cell
# drops it again as "Unnamed: 0", so the CSV layout is kept as-is here.
df1.to_csv(r"myfinetuningdataset_train.csv")
df2.to_csv(r"myfinetuningdataset_test.csv")

# Give each split its own axes. Without an explicit `ax`, the second
# Series.plot call draws onto the current axes and the two bar charts overlap.
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(10, 4))
df1['label'].value_counts().plot(kind='bar', ax=train_ax, title="train label counts")
df2['label'].value_counts().plot(kind='bar', ax=test_ax, title="test label counts")

In [None]:
!pip install datasets --quiet
from datasets import load_dataset

# Load the tokenizer for the 'yiyanghkust/finbert-tone' checkpoint via torch.hub.
# NOTE(review): the model built later uses 'yiyanghkust/finbert-pretrain';
# presumably both checkpoints share the same vocabulary — confirm.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'yiyanghkust/finbert-tone')
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Sequences are padded or truncated to a fixed length of 128 tokens and the
    encodings are requested as PyTorch tensors.
    """
    encode_options = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    return tokenizer(examples["text"], **encode_options)

# Load the train/test CSV files and drop the "Unnamed: 0" column (the
# DataFrame index that to_csv wrote out).
csv_splits = {
    'train': 'myfinetuningdataset_train.csv',
    'test': 'myfinetuningdataset_test.csv',
}
dataset = load_dataset('csv', data_files=csv_splits).remove_columns(["Unnamed: 0"])

# Tokenize both splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from huggingface_hub import PyTorchModelHubMixin
! pip install -U 'accelerate>=0.21.0' --quiet
import accelerate
accelerate.__version__



class BertForSequenceClassification(nn.Module, PyTorchModelHubMixin):
    """BERT encoder followed by a small feed-forward classification head.

    The head is: dropout -> Linear(hidden_size, 128) -> ReLU ->
    Linear(128, num_labels) -> softmax over the label dimension.

    Args:
        pretrained_model_name: Checkpoint name handed to ``torch.hub.load``
            to build the encoder.
        num_labels: Number of output classes.
    """

    def __init__(self, pretrained_model_name, num_labels=3):
        super().__init__()
        self.num_labels = num_labels
        self.bert = torch.hub.load('huggingface/pytorch-transformers', 'model', pretrained_model_name)
        # Keeps a reference to the module-level tokenizer, which must already
        # exist when the model is instantiated.
        self.tokenizer = tokenizer
        self.loss_fn = nn.CrossEntropyLoss()
        self.dropout = nn.Dropout()  # PyTorch's default drop probability
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 128)
        self.finaloutput = nn.Linear(128, num_labels)
        self.softmaxlayer = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, labels=None, *args, **kwargs):
        """Run the encoder and head; optionally compute the training loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed.
            *args, **kwargs: Accepted and ignored.

        Returns:
            dict with ``"logits"`` (the softmax-normalized class scores, as
            before) and ``"loss"`` (``None`` when ``labels`` is not given).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        fc1_output = nn.functional.relu(self.fc1(pooled_output))
        raw_logits = self.finaloutput(fc1_output)

        loss = None
        if labels is not None:
            # nn.CrossEntropyLoss applies log-softmax itself and expects
            # unnormalized scores. Feeding it softmax output (as the code did
            # previously) normalizes twice and flattens the gradients, so the
            # loss is computed on the raw head output instead.
            loss = self.loss_fn(raw_logits.view(-1, self.num_labels), labels.view(-1))

        # The returned scores stay softmax-normalized so existing consumers
        # (e.g. an argmax in compute_metrics) see the same values as before.
        return {"logits": self.softmaxlayer(raw_logits),
                "loss": loss
                }

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

# Build the classifier on top of the FinBERT pre-trained checkpoint.
pretrained_model_name = "yiyanghkust/finbert-pretrain"
the_model_pytorch = BertForSequenceClassification(pretrained_model_name)
def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level accuracy metric.

    The predicted class for each example is the argmax over the last axis of
    ``eval_pred.predictions``; it is compared against ``eval_pred.label_ids``.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=eval_pred.label_ids,
    )
# Training configuration: 5 epochs, evaluation at the end of every epoch,
# checkpoints and logs under "test_trainer".
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    eval_strategy='epoch',
    accelerator_config={"use_seedable_sampler": False},
)

# Wire the model, the tokenized splits and the accuracy callback together.
trainer = Trainer(
    model=the_model_pytorch,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()


In [None]:

"""
the_model_pytorch.push_to_hub("finbert-tone-v0")
"""