In [None]:
!pip install transformers datasets torch scikit-learn



In [None]:
!pip install transformers[torch] accelerate -U
!pip install torch

Collecting transformers[torch]
  Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers, accelerate
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.2
    Uninstalling transformers-4.40.2:
      Successfully uninstalled transformers-4.40.2
Successfully installed accelerate-0.30.1 transformers-4.41.0


In [None]:
!pip install accelerate -U



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import RobertaTokenizer, TrainingArguments, Trainer, AutoConfig
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaPreTrainedModel

## **Loading the dataset**

In [None]:

# Load the labelled sentences from the CSV file.
df = pd.read_csv('/content/data.csv')

# Fail fast if the columns the later cells read ('Sentence' for tokenization,
# 'Sentiment' for the labels) are not present.
missing_columns = {'Sentence', 'Sentiment'} - set(df.columns)
assert not missing_columns, f"data.csv is missing expected columns: {sorted(missing_columns)}"


In [None]:
# Encode the sentiment strings as integer class ids.
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Only encode when the column is not already made of class ids, so re-running
# this cell is a no-op instead of mapping the integers to NaN.
if not df['Sentiment'].isin(list(label_mapping.values())).all():
    encoded_sentiment = df['Sentiment'].map(label_mapping)

    # `.map` turns any value missing from `label_mapping` into NaN; surface
    # those here rather than letting NaN labels reach training.
    unmapped = df.loc[encoded_sentiment.isna(), 'Sentiment'].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected Sentiment values (not in label_mapping): {unmapped}")

    df['Sentiment'] = encoded_sentiment.astype('int64')


In [None]:
# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split the same between runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.25)

# Wrap each pandas frame in a HuggingFace Dataset.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

## **Loading the RoBERTa tokenizer**

In [None]:
# Checkpoint name shared by the tokenizer here and the model/config cells below.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

## **Preprocessing data**

In [None]:
def preprocess(data):
    """Tokenize a batch of rows into fixed-length model inputs.

    Passes the 'Sentence' field of `data` to the module-level `tokenizer`,
    truncating and padding every sequence to 128 tokens.
    """
    return tokenizer(
        data['Sentence'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )


def _to_model_inputs(split):
    """Tokenize one split, expose it as torch tensors, and rename the target column."""
    tokenized = split.map(preprocess, batched=True)
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'Sentiment'])
    # Rename 'Sentiment' to 'labels' to match the expected input for the Trainer.
    return tokenized.rename_column("Sentiment", "labels")


train_dataset = _to_model_inputs(train_dataset)
val_dataset = _to_model_inputs(val_dataset)


Map:   0%|          | 0/4381 [00:00<?, ? examples/s]

Map:   0%|          | 0/1461 [00:00<?, ? examples/s]

## **Define the custom model with additional dropout**

In [None]:
class CustomRobertaForSequenceClassification(RobertaPreTrainedModel):
    """RoBERTa encoder followed by dropout and a single linear classification layer.

    The classifier is fed element 1 of the encoder outputs (treated here as the
    pooled sequence representation) after dropout.
    """

    def __init__(self, config):
        """Build the encoder, dropout and linear head from `config`, then initialise weights."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and classification head.

        Returns a tuple: `(logits, *extra_encoder_outputs)` when `labels` is
        None, otherwise `(loss, logits, *extra_encoder_outputs)` where `loss`
        is the cross-entropy between the logits and `labels`.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # Index 1 holds the pooled representation; everything from index 2 on
        # is passed through to the caller unchanged.
        logits = self.classifier(self.dropout(encoder_outputs[1]))
        result = (logits,) + encoder_outputs[2:]

        if labels is None:
            return result

        loss = nn.functional.cross_entropy(
            logits.view(-1, self.num_labels),
            labels.view(-1),
        )
        return (loss,) + result


## **Training Parameters**

In [None]:
# Size the classification head from `label_mapping` so the number of output
# classes cannot drift from the label encoding defined earlier.
config = AutoConfig.from_pretrained(model_name, num_labels=len(label_mapping))
model = CustomRobertaForSequenceClassification.from_pretrained(model_name, config=config)

# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy='epoch',
#     learning_rate=1e-5,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     num_train_epochs=10,
#     weight_decay=0.01,
# )

def compute_metrics(p):
    """Compute classification accuracy for an evaluation pass.

    Args:
        p: object exposing `predictions` (an array of logits, or a tuple whose
            first element is the logits array) and `label_ids` (the true
            class ids).

    Returns:
        dict with a single "accuracy" entry: the fraction of rows whose
        arg-max logit equals the label, as a Python float.
    """
    # The custom model appends any extra encoder outputs after the logits, in
    # which case the predictions arrive as a tuple; the logits are always first.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = logits.argmax(-1)
    return {"accuracy": (preds == p.label_ids).astype(float).mean().item()}

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
# )

# Fine-tuning configuration: evaluate and checkpoint once per epoch, keep at
# most two checkpoints on disk, and reload the best one when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',  # current name of `evaluation_strategy` (deprecated since transformers 4.41)
    save_strategy='epoch',  # Make sure save strategy matches evaluation strategy
    learning_rate=3.148857155380094e-05,  # NOTE(review): presumably from a hyperparameter search; confirm
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Wire the model, data splits and accuracy metric into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune, then score the evaluation split passed to the Trainer; the
# metrics dict is the cell's displayed output.
training_summary = trainer.train()
validation_metrics = trainer.evaluate()
validation_metrics

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.388909,0.822724
2,No log,0.383529,0.819986
3,No log,0.405758,0.832307
4,0.321100,0.413817,0.829569
5,0.321100,0.441638,0.830253
6,0.321100,0.444505,0.82204
7,0.321100,0.513391,0.819302
8,0.187100,0.518833,0.821355
9,0.187100,0.544586,0.807666
10,0.187100,0.541391,0.800137


{'eval_loss': 0.5413905382156372,
 'eval_accuracy': 0.8001368925393566,
 'eval_runtime': 9.2338,
 'eval_samples_per_second': 158.224,
 'eval_steps_per_second': 4.982,
 'epoch': 10.0}

In [None]:
# Write the fine-tuned weights/config and the tokenizer files to their own
# directories; the tokenizer call's return value is the cell's displayed output.
model_dir, tokenizer_dir = "macroecon_classifier", "my_finbert_model"
model.save_pretrained(save_directory=model_dir)
tokenizer.save_pretrained(save_directory=tokenizer_dir)

('my_finbert_model/tokenizer_config.json',
 'my_finbert_model/special_tokens_map.json',
 'my_finbert_model/vocab.txt',
 'my_finbert_model/added_tokens.json',
 'my_finbert_model/tokenizer.json')

In [None]:
from transformers import pipeline

# Reload the saved artifacts with the same custom class used for training.
# Loading the directory by name through `pipeline(...)` would resolve to the
# stock sequence-classification architecture, whose head does not match the
# `classifier.*` / pooler weights saved here, and this model's forward returns
# a plain tuple rather than an output object with a "logits" entry.
inference_model = CustomRobertaForSequenceClassification.from_pretrained("macroecon_classifier")
inference_tokenizer = RobertaTokenizer.from_pretrained("my_finbert_model")
inference_model.eval()  # disable dropout for prediction

# Example prediction
example_sentences = [
    "Exports by Destination showed diverse contributions from different regions, indicating shifts in international trade relationships.",
]
encoded = inference_tokenizer(
    example_sentences,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

with torch.no_grad():
    # Without `labels`, the first element of the returned tuple is the logits.
    logits = inference_model(**encoded)[0]

# Extract the numeric class id for each input sentence
predicted_label_ids = logits.argmax(dim=-1).tolist()

# Invert `label_mapping` to turn class ids back into the sentiment names
id_to_label = {class_id: name for name, class_id in label_mapping.items()}
original_labels = [id_to_label[class_id] for class_id in predicted_label_ids]

print(original_labels)

['Fiscal Policy']


In [None]:
# List the contents of the saved model and tokenizer directories to verify
# that both save calls wrote their files.
!ls macroecon_classifier
!ls my_finbert_model

config.json  model.safetensors
special_tokens_map.json  tokenizer_config.json	tokenizer.json	vocab.txt


In [None]:
# List the saved model and tokenizer directories again to verify their
# contents (same check as the previous cell).
!ls macroecon_classifier
!ls my_finbert_model

config.json  model.safetensors
special_tokens_map.json  tokenizer_config.json	tokenizer.json	vocab.txt


In [None]:
# Archive each saved directory recursively into its own .zip file.
!zip -r macroecon_classifier.zip macroecon_classifier
!zip -r my_finbert_model.zip my_finbert_model

  adding: macroecon_classifier/ (stored 0%)
  adding: macroecon_classifier/config.json (deflated 54%)
  adding: macroecon_classifier/model.safetensors (deflated 7%)
  adding: my_finbert_model/ (stored 0%)
  adding: my_finbert_model/special_tokens_map.json (deflated 42%)
  adding: my_finbert_model/tokenizer_config.json (deflated 75%)
  adding: my_finbert_model/tokenizer.json (deflated 71%)
  adding: my_finbert_model/vocab.txt (deflated 53%)
