# Financial Text Classification 

## Install 

In [1]:
pip install happytransformer

Collecting happytransformer
  Downloading happytransformer-3.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting tqdm>=4.43 (from happytransformer)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.30.1 (from happytransformer)
  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets<3.0.0,>=2.13.1 (from happytransformer)
  Downloading datasets-2.17.1-py3-none-any.whl.metadata (20 kB)
Collecting sentencepiece (from happytransformer)
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting protobuf (from happytransformer)
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 byt

## Imports

In [3]:
import csv
from datasets import load_dataset
from happytransformer import HappyTextClassification, TCTrainArgs
from tqdm import tqdm 

## Dataset

### Download 

In [4]:
# Source: https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment
# Load the dataset, then shuffle with a fixed seed so the row order is reproducible.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment").shuffle(seed=42)

Downloading readme:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/859k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/217k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

### Inspect

In [5]:
# Summarize the loaded dataset: which splits exist, their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})


In [6]:
# Peek at a single training record to see the text/label format.
print(dataset["train"][0])

{'text': 'Stocks - Tiffany, Disney, Tesla Rise Premarket; Uber Falls', 'label': 2}


### Split

In [7]:
# Carve the (already shuffled) train split into a training part and a held-out eval part.
train_split = dataset["train"]
split_at = 9000
train_data = train_split[:split_at]  # first 9000 examples -> training
eval_data = train_split[split_at:]   # remaining 543 examples -> evaluation during training

# The whole validation split serves as the test set. Never train on it.
test_data = dataset["validation"][:]



### Convert to CSV

In [8]:
def generate_csv(csv_path, dataset_dict):
    """Write the "text" and "label" columns of a dataset slice to a CSV file.

    Args:
        csv_path: Destination path; an existing file is overwritten.
        dataset_dict: Mapping holding parallel "text" and "label" sequences.
            Rows are paired positionally.
    """
    # Explicit UTF-8: the texts are tweets and may contain non-ASCII characters,
    # so the result must not depend on the platform's default encoding.
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["text", "label"])
        writer.writerows(zip(dataset_dict["text"], dataset_dict["label"]))

In [9]:
# Output paths for the CSV files consumed by the trainer.
train_csv, eval_csv = "train.csv", "eval.csv"


In [10]:
# Materialize the training and eval slices as CSV files on disk.
for csv_path, split in ((train_csv, train_data), (eval_csv, eval_data)):
    generate_csv(csv_path, split)


In [11]:
# Distinct label ids that occur in the training slice.
set(train_data["label"])

{0, 1, 2}

- 0: Negative
- 1: Positive
- 2: Neutral

## Model

In [12]:
# num_labels=3 matches the three label ids (0, 1, 2) found in the training data.
happy_tc = HappyTextClassification(
    "DISTILBERT",
    "distilbert/distilbert-base-uncased",
    num_labels=3,
)



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

02/24/2024 18:04:11 - INFO - happytransformer.happy_transformer -   Using device: cuda:0


## Train

In [13]:
# NOTE(review): eval_steps=0.1 is presumably a fraction of the total training steps
# (the training log shows an evaluation about every 57 steps) — confirm against the
# Happy Transformer documentation.
args = TCTrainArgs(
    num_train_epochs=1,
    eval_steps=0.1,
    batch_size=16,
)

In [14]:
# Fine-tune on the training CSV, with the eval CSV used for the validation loss.
happy_tc.train(
    input_filepath=train_csv,
    eval_filepath=eval_csv,
    args=args,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

02/24/2024 18:04:12 - INFO - happytransformer.happy_transformer -   Tokenizing training data...


Tokenizing data:   0%|          | 0/9000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
02/24/2024 18:04:14 - INFO - happytransformer.happy_transformer -   Tokenizing eval data...


Tokenizing data:   0%|          | 0/543 [00:00<?, ? examples/s]

02/24/2024 18:04:15 - INFO - happytransformer.happy_transformer -   Moving model to cuda:0


Step,Training Loss,Validation Loss
1,1.104,1.087111
57,0.7529,0.606862
114,0.5547,0.495191
171,0.5412,0.480386
228,0.4944,0.445683
285,0.4524,0.420434
342,0.4493,0.393757
399,0.4348,0.377467
456,0.4705,0.378426
513,0.4074,0.361646


## Sample Inference

In [16]:
# Probe the fine-tuned model with one hand-written sentence per class.
negative_text = "Apple's stock just decreased"
output_negative = happy_tc.classify_text(negative_text)
print(output_negative)  # expected: LABEL_0

positive_text = "Apple's stock just increased"
output_positive = happy_tc.classify_text(positive_text)
print(output_positive)  # expected: LABEL_1

neutral_text = "Will apple continue to increase?"
output_neutral = happy_tc.classify_text(neutral_text)
print(output_neutral)  # expected: LABEL_2

TextClassificationResult(label='LABEL_0', score=0.9084067940711975)
TextClassificationResult(label='LABEL_1', score=0.9253707528114319)
TextClassificationResult(label='LABEL_2', score=0.9659663438796997)


## Test 

In [20]:
# The test dataset stores labels as the ints 0, 1 and 2, while the model
# returns the strings "LABEL_0", "LABEL_1" and "LABEL_2".
# This lookup translates a model output label back to the dataset's int label.
label_map = {f"LABEL_{index}": index for index in range(3)}

In [21]:
# Sanity-check the mapping from model output strings to integer labels.
print(label_map)

{'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}


In [22]:
# Classify every test example and count how many predictions match the gold label.
total = len(test_data["label"])
labelled_examples = zip(test_data["text"], test_data["label"])
correct = sum(
    label_map[happy_tc.classify_text(text).label] == label
    for text, label in tqdm(labelled_examples, total=total)
)
        

100%|██████████| 2388/2388 [00:14<00:00, 163.42it/s]


In [29]:
# Number of correct predictions, then the test-set size, one per line.
print(correct, total, sep="\n")

2070
2388


In [30]:
# Test accuracy as a percentage, rounded to three decimal places.
accuracy_pct = round((correct / total) * 100, 3)
print(f"accuracy: {accuracy_pct}%")

accuracy: 86.683%
