In [None]:
# Install into the kernel's own environment (%pip) and bound the version:
# the floor is the release the recorded run resolved to (4.26.1), the ceiling
# keeps a future major release from changing the API underneath the notebook.
# The spec is quoted so the shell does not treat "<" / ">" as redirections.
%pip install -q "transformers>=4.26.1,<5"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
%%sh
# Copy the project folder from the mounted Drive into the local workspace.
# The target is created with the same absolute path that cp writes to, and
# -p makes the cell safe to re-run when the directory already exists.
mkdir -p /content/machine-learning
cp -r /content/drive/MyDrive/machine-learning/* /content/machine-learning/

In [None]:
# Run identifiers. DATE and PREDICTION_RUN are spliced into the name of the
# CSV that classify() writes; MODEL_RUN is not referenced by the cells shown here.
DATE = "2023_03_13_"
MODEL_RUN, PREDICTION_RUN = 1, 1

In [None]:
# Workspace folders (both keep their trailing slash so file names can be appended directly).
data_dir = "/content/machine-learning/model-preparation/"
output_dir = "/content/machine-learning/model-output/"

# CSV of benchmark sentences that the prediction cells feed to classify().
predict_file_path = f"{data_dir}benchmark-emotional-sentences.csv"


In [None]:

import numpy as np

def read_clean_data(file_path):
  """Load a CSV and drop every row whose ``text`` cell is unusable.

  A ``text`` cell counts as unusable when it is missing (NaN), an empty
  string, or +/- infinity.

  Args:
    file_path: path of the CSV to read; it must contain a ``text`` column.

  Returns:
    A DataFrame with the unusable rows removed. The surviving rows keep
    their original index labels (the index is not reset).

  Raises:
    KeyError: if the CSV has no ``text`` column.
  """
  data = pd.read_csv(file_path)
  nan_value = float("NaN")
  # Turn empty strings and infinities into NaN so one dropna() removes them all.
  # The result is assigned back to the column instead of calling
  # replace(..., inplace=True) on the data["text"] selection: that form is
  # chained assignment, which pandas warns about and which leaves the frame
  # untouched once Copy-on-Write is active.
  data["text"] = (
      data["text"]
      .replace("", nan_value)
      .replace([np.inf, -np.inf], nan_value)
  )
  return data.dropna(subset=["text"])

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import TrainingArguments, Trainer
import torch

import numpy as np
import pandas as pd

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer output, with optional labels.

    Args:
        encodings: mapping of feature name -> per-example sequences; it must
            contain an ``"input_ids"`` entry, which defines the dataset length.
        labels: optional per-example labels (list, NumPy array, tensor, ...).
            ``None`` or an empty sequence means "no labels" (inference mode).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` when labels were given."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly: plain truthiness (`if self.labels:`)
        # raises "truth value is ambiguous" for NumPy arrays and tensors.
        # The length check keeps an empty label list behaving as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` encoding."""
        return len(self.encodings["input_ids"])

# ----- 3. Classify/Predict -----#
def classify(input_file_path, output_file_path, classification_prefix, generate_report):
  """Score every usable row of a CSV with the sentiment model and save the result.

  Rows are loaded through read_clean_data(), scored with the
  "cardiffnlp/twitter-roberta-base-sentiment-latest" checkpoint and written
  to a new CSV. The output name is `output_file_path` with ".csv" replaced by
  "_<DATE><PREDICTION_RUN>_relevant.csv"; DATE and PREDICTION_RUN are
  module-level globals read at call time.

  Columns added to the output:
    label     -- argmax over the (positive, negative, neutral) scores,
                 i.e. 0 = positive, 1 = negative, 2 = neutral
    positive, negative, neutral
              -- the scores returned by Trainer.predict for each class;
                 no softmax is applied here, so they are not probabilities
    matched   -- 1 when `expected-label` equals `label`, else 0
                 (assumes `expected-label` uses the same 0/1/2 encoding -- TODO confirm)

  Args:
    input_file_path: CSV with at least `text` and `expected-label` columns.
    output_file_path: path from which the output file name is derived.
    classification_prefix: currently unused; kept so existing calls keep working.
    generate_report: when truthy, also print sklearn's classification report
      of `expected-label` against the predicted labels.
  """
  test_data = read_clean_data(input_file_path)
  file_path = output_file_path.replace(".csv", "_"+DATE+ str(PREDICTION_RUN) +"_relevant.csv")

  X_test = list(test_data["text"])

  model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

  # Download and load the pretrained tokenizer and classification model
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  model = AutoModelForSequenceClassification.from_pretrained(model_path)

  # Truncate to what the model can actually embed. This checkpoint is a
  # RoBERTa-base model (512 usable positions); a larger max_length lets longer
  # texts through un-truncated and they then fail inside the model.
  max_length = min(tokenizer.model_max_length, 512)
  X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=max_length)
  test_dataset = Dataset(X_test_tokenized)

  # A default Trainer is used purely as a batched inference driver
  test_trainer = Trainer(model)
  raw_pred, _, _ = test_trainer.predict(test_dataset)

  # Reorder the score columns to (positive, negative, neutral). The order is
  # taken from the model's own id2label mapping so it stays right if the
  # checkpoint is swapped; if the label names are not recognised, fall back to
  # rolling the columns one step right, which is the correct permutation for a
  # model that emits negative/neutral/positive.
  column_order = ("positive", "negative", "neutral")
  label_to_index = {str(name).lower(): int(idx) for idx, name in model.config.id2label.items()}
  if all(name in label_to_index for name in column_order):
    raw_pred = raw_pred[:, [label_to_index[name] for name in column_order]]
  else:
    raw_pred = np.roll(raw_pred, 1, axis=1)

  # Attach predictions to the SAME rows that were scored. Re-reading the raw
  # CSV here would bring back the rows read_clean_data() dropped and the
  # column assignment would fail with a length mismatch.
  data = test_data.reset_index(drop=True)
  y_pred = np.argmax(raw_pred, axis=1)
  data['label'] = y_pred
  data['positive'] = raw_pred[:, 0]
  data['negative'] = raw_pred[:, 1]
  data['neutral'] = raw_pred[:, 2]

  data['matched'] = (data['expected-label'] == data['label']).astype(int)

  data.to_csv(file_path, index=False)
  print("Written prediction results to file: "+ file_path)

  if generate_report: # validation case only
    from sklearn.metrics import classification_report
    y_test = list(data["expected-label"])
    print(classification_report(y_test, y_pred.tolist()))


In [None]:
# Advance the run counter first so this run's output CSV gets its own suffix,
# then score the benchmark file and print the classification report.
PREDICTION_RUN = PREDICTION_RUN + 1
classify(
    input_file_path=predict_file_path,
    output_file_path=predict_file_path,
    classification_prefix="",
    generate_report=True,
)

In [None]:
# Repeat run: bump the counter again (new output file name) and re-score the
# same benchmark file, printing the classification report.
PREDICTION_RUN = PREDICTION_RUN + 1
classify(
    input_file_path=predict_file_path,
    output_file_path=predict_file_path,
    classification_prefix="",
    generate_report=True,
)