In [1]:
!pip install datasets



In [42]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from collections import Counter

# Step 1: Load the Dataset
dataset = load_dataset("Anurich/finance_dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'text', 'rejected_text'],
        num_rows: 8045
    })
})

In [43]:
print(type(dataset['train'][0]['prompt']))
print(dataset['train'][0]['prompt'])

<class 'str'>
what is the the interest expense in 2009?


In [32]:
# Step 2: Define Label Mapping
id_to_label = {0: "financial_analysis", 1: "out_of_scope"}
id_to_label

{0: 'financial_analysis', 1: 'out_of_scope'}

In [33]:
label_to_id = {v: k for k, v in id_to_label.items()}
label_to_id

{'financial_analysis': 0, 'out_of_scope': 1}

In [44]:
# Step 3: Dynamically Assign Labels
finance_list=['macd','rsi','ema','sma','stock','momentum','volatility','obv','daily return','corelation','marubozu','three white soldiers','pin bar','technical analysis','bollinger',
              'analysis','financial analysis','bullish','bearish','trend','formation','candlestick','doji','hammer','engulfing','hanging man','morning star','three black crows',
              'evening star','volume','on balance volume','golden cross','death cross','gap','support level','resistance level','moving average','exponential average','close price']

def add_labels(example):
    example["labels"] = label_to_id["out_of_scope"]  # Default label
    # Check if any finance-related term exists in the prompt string
    if any(f in example["prompt"].lower() for f in finance_list):
        example["labels"] = label_to_id["financial_analysis"]  # Assign financial_analysis label
    return example

In [45]:
dataset = dataset["train"].map(add_labels)
dataset

Map:   0%|          | 0/8045 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'text', 'rejected_text', 'labels'],
    num_rows: 8045
})

In [52]:
Counter(dataset['labels'])

Counter({1: 7345, 0: 700})

In [15]:
for x in dataset['prompt']:
  if 'stock' in x.lower():
    print(x)

what is the cumulative total shareholder return on state street's common stock in 2012 as a percentage of the average shareholder return on common stock in the s&p 500?
what is the difference in the number of class usa stock of pre and after true-up?
what would be the total common stock par value if all authorized shares were outstanding?
how is the cash flow statement from financing activities affected by the sales of commons stock during the 4th quarter of 2013?
for the 2017 restricted common stock and restricted stock unit grants , assuming the average vesting period , what would annual compensation expense be in millions over the vesting period?
at december 2010 what was the percent of the losses related to employee stock options included in the net federal operating loss carry forwards
what was the percentage chaning in the total fair value of restricted stock and performance awards vested from 2016 to 2017?
in 2011 , what percentage of common stocks were issued from treasury stoc

In [50]:
stock = set()
for x in dataset['prompt']:
  if 'stock' in x.lower():
    stock.add(x)
len(stock)

468

In [51]:
stock = set()
for x in dataset['prompt']:
  if 'stock' in x.lower() or 'rsi' in x.lower():
    stock.add(x)
len(stock)

480

In [49]:
stock = set()
for f in finance_list:
    for x in dataset['prompt']:
        if f in x.lower():
            stock.add(x)  # Set, sadece eşsiz değerleri saklar

# Benzersiz eşleşmelerin sayısı
len(stock)

700

In [53]:
# Step 4: Tokenizer Setup
model_name = "cartesinus/multilingual_minilm-amazon-massive-intent"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [54]:

# Step 5: Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(examples["prompt"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/8045 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'text', 'rejected_text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 8045
})

In [55]:
# Verify tokenized dataset
print(tokenized_datasets[0])  # Should include `input_ids`, `attention_mask`, and `labels`

{'prompt': 'what is the the interest expense in 2009?', 'text': '380', 'rejected_text': '41932', 'labels': 1, 'input_ids': [0, 2367, 83, 70, 70, 33946, 14700, 21161, 23, 1877, 32, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [56]:
# Step 6: Split Dataset
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [57]:
# Verify Dataset Balance
train_labels = [example["labels"] for example in train_dataset]
eval_labels = [example["labels"] for example in eval_dataset]
print(f"Training labels distribution: {Counter(train_labels)}")
print(f"Validation labels distribution: {Counter(eval_labels)}")

Training labels distribution: Counter({1: 5871, 0: 565})
Validation labels distribution: Counter({1: 1474, 0: 135})


In [60]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Tensorleri aynı cihaza taşı
    if isinstance(logits, torch.Tensor):
        logits = logits.cpu().numpy()  # CPU'ya taşı ve NumPy formatına dönüştür
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()  # CPU'ya taşı ve NumPy formatına dönüştür

    predictions = logits.argmax(axis=-1)  # En yüksek logit değeri sınıfı temsil eder
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Step 7: Model Setup
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id_to_label),
    ignore_mismatched_sizes=True
)

# Update the model's configuration
model.config.id2label = id_to_label
model.config.label2id = label_to_id

# Step 8: Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,  # Lower learning rate for fine-tuning
    per_device_train_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Step 9: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Step 10: Train the Model
trainer.train()

# Step 11: Save the Fine-Tuned Model
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")
print("Fine-tuning completed and model saved!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cartesinus/multilingual_minilm-amazon-massive-intent and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([60]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([60, 384]) in the checkpoint and torch.Size([2, 384]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.126529,0.972032
2,0.150300,0.053265,0.990677
3,0.085300,0.040196,0.992542
4,0.032900,0.039133,0.993163
5,0.018900,0.043265,0.991299


Fine-tuning completed and model saved!


In [62]:
# Evaluate the model on the validation set
results = trainer.evaluate()
print("Validation Accuracy:", results["eval_accuracy"])

# If you want training accuracy, calculate manually using the training dataset
train_results = trainer.predict(train_dataset)
train_predictions = torch.tensor(train_results.predictions).argmax(dim=-1).cpu().numpy()  # CPU'ya taşı ve NumPy formatına dönüştür
train_labels = torch.tensor(train_dataset["labels"]).cpu().numpy()  # Etiketleri CPU'ya taşı ve NumPy formatına dönüştür

train_accuracy = accuracy_score(train_labels, train_predictions)
print("Training Accuracy:", train_accuracy)

Validation Accuracy: 0.9931634555624611
Training Accuracy: 0.997513983840895


In [63]:
train_accuracy-results["eval_accuracy"]

0.004350528278433852

In [64]:
model_path = "fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Verify the updated label mapping
print(model.config.id2label)  # Should include "financial_analysis" and "out_of_scope"

{0: 'financial_analysis', 1: 'out_of_scope'}


In [65]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = "fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Example queries
queries = [
    "What is the RSI value for this stock?",  # In-scope
    "What is the purpose in life?",          # Out-of-scope
]

id_to_label = {0: "financial_analysis", 1: "out_of_scope"}
THRESHOLD = 0.95  # Adjust threshold for out-of-scope detection

for query in queries:
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)

        # Log probabilities
        print(f"Query: {query}")
        print(f"Probabilities: {probabilities.tolist()}")

        max_prob, predicted_label = probabilities.max(dim=1)
        max_prob = max_prob.item()
        predicted_label = predicted_label.item()

        # Apply confidence threshold
        if probabilities[0][0].item() < THRESHOLD:  # Confidence for `financial_analysis`
            predicted_intent = "out_of_scope"
        else:
            predicted_intent = id_to_label[predicted_label]

        print(f"Predicted intent: {predicted_intent}")
        print(f"Confidence: {max_prob:.4f}\n")


Query: What is the RSI value for this stock?
Probabilities: [[0.99628084897995, 0.0037191209848970175]]
Predicted intent: financial_analysis
Confidence: 0.9963

Query: What is the purpose in life?
Probabilities: [[0.003373584244400263, 0.9966264963150024]]
Predicted intent: out_of_scope
Confidence: 0.9966



In [66]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = "fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Example query
query = "What is the capital of Turkey"

# Tokenize the input
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)

    # Log probability distribution
    print(f"Probabilities: {probabilities.tolist()}")

    # Get the most probable label and its confidence
    max_prob, predicted_label = probabilities.max(dim=1)
    max_prob = max_prob.item()
    predicted_label = predicted_label.item()

# Map the predicted label back to its string representation
id_to_label = {0: "financial_analysis", 1: "out_of_scope"}

# Apply confidence threshold
THRESHOLD = 0.9  # Adjust as needed
if max_prob < THRESHOLD:
    predicted_intent = "out_of_scope"
else:
    predicted_intent = id_to_label[predicted_label]

print(f"Predicted intent: {predicted_intent}")
print(f"Confidence: {max_prob:.4f}")


Probabilities: [[0.003294408554211259, 0.9967055916786194]]
Predicted intent: out_of_scope
Confidence: 0.9967


In [None]:
#import torch
#torch.save(model.state_dict(), "fine_tuned_model7/pytorch_model.bin")

In [67]:
import shutil
shutil.make_archive('fine_tuned_model', 'zip', 'fine_tuned_model')

'/content/fine_tuned_model.zip'

In [68]:
from google.colab import files
files.download('fine_tuned_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [69]:
from huggingface_hub import login

#Hugging face API key kullanarak giriş yapın
login('apikey')

In [70]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import HfApi

# Model path and repository name
model_path = "fine_tuned_model"  # Path to your local model directory
repo_name = "fine_tuned_financial_analysis_intent_classification"  # Desired name for the Hugging Face repo

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Push the model to Hugging Face
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model pushed to Hugging Face Hub: https://huggingface.co/username/{repo_name}")


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Model pushed to Hugging Face Hub: https://huggingface.co/username/fine_tuned_financial_analysis_intent_classification


In [71]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = "Erdeniz/fine_tuned_financial_analysis_intent_classification"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Example query
query = "What is the purpose in life?"

# Tokenize the input
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)

    # Log probability distribution
    print(f"Probabilities: {probabilities.tolist()}")

    # Get the most probable label and its confidence
    max_prob, predicted_label = probabilities.max(dim=1)
    max_prob = max_prob.item()
    predicted_label = predicted_label.item()

# Map the predicted label back to its string representation
id_to_label = {0: "financial_analysis", 1: "out_of_scope"}

# Apply confidence threshold
THRESHOLD = 0.9  # Adjust as needed
if max_prob < THRESHOLD:
    predicted_intent = "out_of_scope"
else:
    predicted_intent = id_to_label[predicted_label]

print(f"Predicted intent: {predicted_intent}")
print(f"Confidence: {max_prob:.4f}")


tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Probabilities: [[0.003373584244400263, 0.9966264963150024]]
Predicted intent: out_of_scope
Confidence: 0.9966
