In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3

In [2]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.29.3


## **Loading the dataset**

In [3]:
from datasets import load_dataset
import pandas as pd

# Load the dataset
dataset = load_dataset("csv", data_files={"train": "/content/macroeconomic_classifier_dataset_4_cleaned.csv"}, delimiter=',', column_names=["sentence", "label"])

Generating train split: 0 examples [00:00, ? examples/s]

## **Labeling**

In [4]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Extract the labels and convert to a pandas Series to easily manipulate and check data
labels = pd.Series(dataset["train"]["label"])

# Check if the 'label' from headers is included as data and remove it
if 'label' in labels.values:
    labels = labels[labels != 'label']

# Convert the cleaned labels back to a list for further processing
cleaned_labels = labels.tolist()

# Initialize the encoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
encoded_labels = label_encoder.fit_transform(cleaned_labels)

# Replace your dataset labels with encoded labels
# Since datasets are immutable, you need to use 'map' to apply changes
def add_encoded_labels(examples):
    examples['encoded_labels'] = encoded_labels[list(dataset["train"]["label"]).index(examples['label'])]
    return examples

# Apply the function to add the 'encoded_labels' column
dataset["train"] = dataset["train"].map(add_encoded_labels, batched=False, load_from_cache_file=False)

# Now your dataset["train"] has an additional column 'encoded_labels'

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

## **Tokenizing**

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

def tokenize_function(examples):
    print("Type of input:", type(examples["sentence"]))

    # Convert all items in the list to strings
    examples["sentence"] = [str(item) if not isinstance(item, str) else item for item in examples["sentence"]]

    if isinstance(examples["sentence"], list):
        print("Input is a list. All items have been converted to strings.")

    result = tokenizer.batch_encode_plus(examples["sentence"], padding="max_length", truncation=True, max_length=512)
    result["labels"] = examples["encoded_labels"]
    return result

# Apply the function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["sentence", "label", "encoded_labels"])


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Type of input: <class 'list'>
Input is a list. All items have been converted to strings.
Type of input: <class 'list'>
Input is a list. All items have been converted to strings.


In [6]:
from datasets import DatasetDict

# Assuming `tokenized_datasets` is your processed dataset ready for training and evaluation
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.25)  # Adjust test_size as needed
dataset_split = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

# Now, you have `dataset_split["train"]` for training and `dataset_split["test"]` for evaluation


In [7]:
# Calculate the number of unique labels
num_labels = len(np.unique(labels))

print(f"Number of unique labels: {num_labels}")
print(np.unique(labels))

Number of unique labels: 7
['Exchange Rates' 'Fiscal Policy' 'GDP Growth' 'Inflation'
 'International Trade' 'Monetary Policy' 'Unemployment']


In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

# Determine the number of unique labels
# num_labels = dataset["train"].features["label"].num_classes

# Load the configuration of the model
config = AutoConfig.from_pretrained("ProsusAI/finbert", num_labels=num_labels)

# Create a new model with the updated configuration
model = AutoModelForSequenceClassification.from_config(config)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=40,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=dataset_split["test"],
    compute_metrics=None,  # Define a function to compute metrics if you want
)



In [9]:
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,1.652854
2,No log,1.640038
3,No log,1.609411
4,No log,1.307605
5,No log,0.963115
6,No log,0.849041
7,No log,0.934529
8,1.319600,0.607774
9,1.319600,0.319324
10,1.319600,0.414059


{'eval_loss': 0.0001592974440427497,
 'eval_runtime': 8.6911,
 'eval_samples_per_second': 31.642,
 'eval_steps_per_second': 0.575,
 'epoch': 40.0}

In [10]:
model.save_pretrained("macroecon_classifier")
tokenizer.save_pretrained("my_finbert_model")

('my_finbert_model/tokenizer_config.json',
 'my_finbert_model/special_tokens_map.json',
 'my_finbert_model/vocab.txt',
 'my_finbert_model/added_tokens.json',
 'my_finbert_model/tokenizer.json')

In [17]:
from transformers import pipeline

classifier = pipeline("text-classification", model="macroecon_classifier", tokenizer="my_finbert_model")

# Example prediction
predictions = classifier("Exports by Destination showed diverse contributions from different regions, indicating shifts in international trade relationships.")

# Extract the numeric ID
predicted_label_ids = [int(prediction['label'].split('_')[-1]) for prediction in predictions]

# Assuming `label_encoder` is your LabelEncoder instance
original_labels = label_encoder.inverse_transform(predicted_label_ids)

print(original_labels)

['Fiscal Policy']


In [18]:
# List files in the saved directories to verify
!ls macroecon_classifier
!ls my_finbert_model

config.json  model.safetensors
special_tokens_map.json  tokenizer_config.json	tokenizer.json	vocab.txt


In [19]:
!zip -r macroecon_classifier.zip macroecon_classifier
!zip -r my_finbert_model.zip my_finbert_model

  adding: macroecon_classifier/ (stored 0%)
  adding: macroecon_classifier/config.json (deflated 54%)
  adding: macroecon_classifier/model.safetensors (deflated 7%)
  adding: my_finbert_model/ (stored 0%)
  adding: my_finbert_model/special_tokens_map.json (deflated 42%)
  adding: my_finbert_model/tokenizer_config.json (deflated 75%)
  adding: my_finbert_model/tokenizer.json (deflated 71%)
  adding: my_finbert_model/vocab.txt (deflated 53%)


In [2]:
from google.colab import files

# Programmatically trigger the download
files.download('macroecon_classifier.zip')


FileNotFoundError: Cannot find file: macroecon_classifier.zip