In [1]:
from setfit import AbsaModel
import pandas as pd
import os
import numpy as np
import logging
import random
import torch
import mlflow
from setfit import AbsaTrainer, TrainingArguments
from transformers import EarlyStoppingCallback
from torch.utils.data import Dataset
import copy



In [2]:
# Seed every random number generator this notebook relies on so that a
# Restart Kernel -> Run All reproduces the same shuffles and initial weights.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
# torch keeps its own generator, independent of `random` and NumPy; without
# this call any torch-side randomness differs on every run.
torch.manual_seed(seed_value)

# Release cached GPU memory left over from earlier runs in this kernel, then
# pick the GPU when one is available and fall back to the CPU otherwise.
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)


Using device: cuda


In [3]:
#%env TOKENIZERS_PARALLELISM=false

In [4]:
# Notebook-flavoured tqdm (renders progress bars as widgets).
from tqdm.notebook import tqdm
import transformers

# NOTE(review): this REBINDS `transformers.utils.logging.enable_progress_bar`
# to a zero-argument lambda that returns the tqdm class; it does not call the
# original function. Presumably the intent was to make transformers use the
# notebook progress bar -- TODO confirm this has the desired effect.
transformers.utils.logging.enable_progress_bar = lambda: tqdm


In [7]:
# Build the ABSA model from two sentence-transformer checkpoints plus a spaCy
# pipeline. Presumably the first checkpoint backs the aspect model and the
# second the polarity model -- TODO confirm against the SetFit
# `AbsaModel.from_pretrained` documentation.
# The recorded output warns that no trained classification head was found, so
# the model must be trained before its predictions are meaningful.
model = AbsaModel.from_pretrained(
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/all-mpnet-base-v2",
        spacy_model="en_core_web_sm",
    )
# Move the model to the device selected earlier (GPU if available).
model.to(device)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [9]:
# Filesystem locations used by the notebook.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running elsewhere. `model_dir` and `output_dir` point at the same directory.
data_dir = '/home/ubuntu/TextStock/data'
model_dir = '/home/ubuntu/TextStock/models'
output_dir = '/home/ubuntu/TextStock/models'
# Raw SEntFiN table: one row per headline, with the aspect -> sentiment mapping
# stored as a string in the 'Decisions' column (see the preview below).
table = pd.read_csv(os.path.join(data_dir, 'SEntFiN-v1.1.csv'))

In [11]:
table.head()

Unnamed: 0,S No.,Title,Decisions,Words
0,1,SpiceJet to issue 6.4 crore warrants to promoters,"{""SpiceJet"": ""neutral""}",8
1,2,MMTC Q2 net loss at Rs 10.4 crore,"{""MMTC"": ""neutral""}",8
2,3,"Mid-cap funds can deliver more, stay put: Experts","{""Mid-cap funds"": ""positive""}",8
3,4,Mid caps now turn into market darlings,"{""Mid caps"": ""positive""}",7
4,5,"Market seeing patience, if not conviction: Pra...","{""Market"": ""neutral""}",8


In [13]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def prepare_dataset(table):
    """
    Shuffle the table and expand it to one row per aspect-sentiment pair.

    The 'Decisions' cell of every input row is a string holding a dict literal
    such as '{"SpiceJet": "neutral"}'. It is parsed with `ast.literal_eval`,
    which only accepts Python literals, so a crafted cell cannot execute code
    (the previous implementation used `eval`). Each (aspect, sentiment) item of
    the parsed dict becomes its own output row whose 'Decisions' value is the
    two-element list [aspect, sentiment]; all other columns are copied as-is.

    Rows whose 'Decisions' value cannot be parsed, or does not parse to a dict,
    are logged and skipped rather than aborting the whole preparation.

    Args:
        table (pd.DataFrame): Input DataFrame with a 'Decisions' column
            containing string-encoded dictionaries of aspect-sentiment pairs.

    Returns:
        pd.DataFrame: A new DataFrame where each row represents a single
        aspect-sentiment pair. Rows expanded from the same source row keep that
        row's (post-shuffle) index label, so the index may contain duplicates.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    rows = []
    # Fixed random_state so the shuffle (and every later split) is reproducible.
    table = table.sample(frac=1, random_state=42).reset_index(drop=True)
    for i, row in table.iterrows():
        try:
            decisions = ast.literal_eval(row['Decisions'])
            if isinstance(decisions, dict):
                for k, v in decisions.items():
                    # Independent copy per pair so rows do not share state.
                    new_row = row.copy()
                    new_row['Decisions'] = [k, v]
                    rows.append(new_row)
            else:
                logging.warning(f"Skipping row {i} due to invalid 'Decisions' format: {row['Decisions']}")
        except Exception as e:
            # Best-effort: report the bad row and carry on with the rest.
            logging.error(f"Error processing row {i}: {e}")
            logging.error(f"Problematic 'Decisions' value: {row['Decisions']}")

    new_table = pd.DataFrame(rows)
    logging.info(f"Dataset prepared with {len(new_table)} samples.")
    return new_table


In [33]:
def split_data(table, train_ratio=0.8, max_train=1000, max_val=100, max_test=100):
    """
    Split an already-shuffled table into train / validation / test subsets.

    The first `train_ratio` share of the rows forms the training pool; the
    remainder is halved into validation (first half) and test (second half).
    Each subset is then truncated to its `max_*` cap, taking rows from the
    start. No shuffling happens here, so the data should be shuffled already.

    Args:
        table: Sliceable table (e.g. a pd.DataFrame) supporting `len()`.
        train_ratio (float): Share of rows assigned to the training pool,
            between 0 and 1 inclusive.
        max_train (int | None): Cap on training rows; None disables the cap.
        max_val (int | None): Cap on validation rows; None disables the cap.
        max_test (int | None): Cap on test rows; None disables the cap.

    Returns:
        tuple: (train_df, val_df, test_df).

    Raises:
        ValueError: If `train_ratio` is outside [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    split_index = int(len(table) * train_ratio)
    # Slicing with a None bound keeps every row, which is how a cap is disabled.
    train_df = table[:split_index][:max_train]
    validation_all = table[split_index:]

    # The held-out remainder is shared equally between validation and test.
    split_index_test = int(len(validation_all) * 0.5)
    val_df = validation_all[:split_index_test][:max_val]
    test_df = validation_all[split_index_test:][:max_test]
    logging.info(f"Data split: {len(train_df)} training, {len(val_df)} validation, {len(test_df)} test samples.")
    return train_df, val_df, test_df


In [34]:
class FinDataset(Dataset):
    """Map-style dataset exposing one aspect-sentiment sample per table row.

    Each item is a dict with the keys 'text', 'span', 'label' and 'ordinal',
    built from the row's 'Title' and its two-element 'Decisions' value.
    `max_len` is stored on the instance but not used by this class.
    """

    def __init__(self, data, max_len):
        self.data, self.max_len = data, max_len

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index `idx` as a plain dict."""
        record = self.data.iloc[idx]
        # 'Decisions' holds exactly two elements: the aspect span and its label.
        aspect, polarity = record['Decisions']
        sample = dict(text=record['Title'], span=aspect, label=polarity)
        # The ordinal is always 0 here; presumably it would identify which
        # occurrence of the span inside the text is meant -- TODO confirm.
        sample['ordinal'] = 0
        return sample



In [35]:
# Re-read the raw CSV so this cell starts from the untouched table, then
# rebind `table` to the shuffled, one-row-per-aspect expansion and split it.
# Per the log output below, the split is capped at 1000 / 100 / 100 samples.
table = pd.read_csv(os.path.join(data_dir, 'SEntFiN-v1.1.csv'))
table = prepare_dataset(table)
train_df, valid_df, test_df = split_data(table)

2025-01-21 07:51:24,705 - INFO - Dataset prepared with 14409 samples.
2025-01-21 07:51:24,732 - INFO - Data split: 1000 training, 100 validation, 100 test samples.


In [36]:
train_df.head()

Unnamed: 0,S No.,Title,Decisions,Words
0,669,Ponzi schemes: Sebi seeks quarterly meetings o...,"[Sebi, neutral]",9
1,5334,"European shares steady, pegged back by Vodafone","[Vodafone, negative]",7
1,5334,"European shares steady, pegged back by Vodafone","[European shares, neutral]",7
2,4437,M&M Finance plunges 8.5% as brokers cut target...,"[M&M Finance, negative]",12
3,2329,Maintain 'Buy' on Wipro with target of Rs 528:...,"[Wipro, positive]",10


In [37]:
# Wrap each split in a FinDataset so items come out as
# {'text', 'span', 'label', 'ordinal'} dicts.
# NOTE(review): max_len=15 is only stored on the instance; FinDataset as
# defined in this notebook never reads it.
train_data = FinDataset(train_df, max_len=15)
valid_data = FinDataset(valid_df, max_len=15)
test_data = FinDataset(test_df, max_len=15)

In [38]:
test_data[9]

{'text': 'Carborundum Universal Q1 net down 25% at Rs 28.78 crore',
 'span': 'Carborundum Universal',
 'label': 'negative',
 'ordinal': 0}

In [39]:
# Set the MLflow experiment name
mlflow_experiment_name = "financial_sentiment_absa"  # Choose a descriptive name
mlflow.set_experiment(mlflow_experiment_name)

<Experiment: artifact_location='file:///home/ubuntu/TextStock/sentiment/mlruns/928194884546636435', creation_time=1737441538044, experiment_id='928194884546636435', last_update_time=1737441538044, lifecycle_stage='active', name='financial_sentiment_absa', tags={}>

In [40]:
# Training configuration handed to the trainer in the next cell.
# Evaluation and checkpoint saving both happen every 100 steps, and the best
# checkpoint is reloaded when training ends. Presumably the matching
# eval/save cadence is what `load_best_model_at_end` needs -- TODO confirm
# against the SetFit TrainingArguments documentation.
args = TrainingArguments(
    output_dir=output_dir,
    num_epochs=5,
    use_amp=True,
    batch_size=128,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    report_to='mlflow',
    show_progress_bar=True,
)

In [42]:
# Build the ABSA trainer from the model, the training arguments and the
# FinDataset wrappers for the train / validation splits.
# Early stopping uses a patience of 3; presumably that counts consecutive
# evaluations without improvement -- TODO confirm which metric it monitors.
# `metric="accuracy"` matches the accuracy figures reported by the later
# `trainer.evaluate` cell.
trainer = AbsaTrainer(
    model,
    args=args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    metric="accuracy",

)

2025-01-21 07:52:12,848 - INFO - No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
2025-01-21 07:52:12,901 - INFO - No `loss` passed, using `losses.CoSENTLoss` as a default option.


Map:   0%|          | 0/2734 [00:00<?, ? examples/s]

2025-01-21 07:52:13,078 - INFO - No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
2025-01-21 07:52:13,131 - INFO - No `loss` passed, using `losses.CoSENTLoss` as a default option.


Map:   0%|          | 0/996 [00:00<?, ? examples/s]

In [None]:
trainer.train()

In [27]:
# Evaluate the trained model on the held-out test split and print the result.
# The recorded output below reports separate 'aspect' and 'polarity' accuracy.
metrics = trainer.evaluate(test_data)
print(metrics)

***** Running evaluation *****


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

***** Running evaluation *****


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

{'aspect': {'accuracy': 0.9206349206349206}, 'polarity': {'accuracy': 0.84}}


In [28]:
print(metrics)

{'aspect': {'accuracy': 0.9206349206349206}, 'polarity': {'accuracy': 0.84}}


In [29]:
# Quick smoke test: run inference on four hand-picked headlines.
# NOTE: `preds` is reassigned a few cells later when the model is run on the
# test-set texts, so this result is not kept.
preds = model.predict([
    'Sahara case: Sebi to seek help of foreign regulators',
    'Are Indian cotton prices a bubble?',
    "CLSA cuts target price for RIL to Rs 1,150, but retains a 'buy' call",
    'Petrol, diesel prices hiked again, but OMC stocks remain subdued; IOC, HPCL in red'
])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
texts = [t['text'] for t in test_data]

In [31]:
preds = model.predict(texts)

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [32]:
# Show each gold (span, label) pair beside everything the model predicted for
# the same headline; test samples and predictions are walked in step.
for sample, predicted in zip(test_data, preds):
    print(sample['span'], sample['label'], predicted)

Deutsche Bank India neutral [{'span': 'Deutsche Bank India CEO', 'polarity': 'neutral'}]
Axis Capital neutral []
Kalindee Rail negative [{'span': 'Kalindee Rail', 'polarity': 'negative'}, {'span': 'Texmaco', 'polarity': 'negative'}]
Texmaco neutral [{'span': 'Kalindee Rail', 'polarity': 'negative'}, {'span': 'Texmaco', 'polarity': 'negative'}]
Ashok Leyland positive [{'span': 'Jefferies', 'polarity': 'positive'}, {'span': 'Ashok Leyland', 'polarity': 'positive'}]
pharma companies negative [{'span': 'USFDA', 'polarity': 'neutral'}, {'span': 'Dalal Street', 'polarity': 'neutral'}, {'span': 'pharma companies', 'polarity': 'neutral'}]
USFDA neutral [{'span': 'USFDA', 'polarity': 'neutral'}, {'span': 'Dalal Street', 'polarity': 'neutral'}, {'span': 'pharma companies', 'polarity': 'neutral'}]
Oriental Bank of Commerce positive [{'span': 'Oriental Bank', 'polarity': 'neutral'}, {'span': 'Commerce Q4 Net', 'polarity': 'positive'}]
Oriental Bank of Commerce positive [{'span': 'Oriental Bank', '

In [None]:
#https://github.com/huggingface/setfit

In [28]:
# Persist the trained model under `output_dir`. The recorded log below shows
# two directories being written, suffixed '-aspect' and '-polarity'.
# NOTE(review): that log shows a different base path than the `output_dir`
# set in this notebook, so the output is from an earlier run; re-run to refresh.
model.save_pretrained(os.path.join(output_dir, 'setfit-absa-model-finance'))


2024-12-27 20:04:47,944 - INFO - Save model to /home/ubuntu/fin_experiment/models/setfit-absa-model-finance-aspect
2024-12-27 20:04:48,140 - INFO - Save model to /home/ubuntu/fin_experiment/models/setfit-absa-model-finance-polarity
