In [3]:
import os

def get_project_root() -> str:
    """Return the absolute path of the parent of the current working directory.

    The notebook treats that parent directory as the project root, so the
    result depends on where the kernel was started.
    """
    parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
    return os.path.abspath(parent_of_cwd)

In [4]:
# Sanity check: show which directory get_project_root() resolves to for this kernel.
print(get_project_root())

/Users/maxmartyshov/Desktop/IU/year3/PMDL/Sentiment_Analysis_for_Financial_News


In [5]:
import sys 

# Make the project's `src` and `pipelines` directories importable from this notebook.
src_path = os.path.join(get_project_root(), 'src')
piplines_path = os.path.join(get_project_root(), 'pipelines')
# Only register directories that are not on sys.path yet, so re-running this
# cell does not keep appending duplicate entries.
sys.path.extend(
    [path for path in (src_path, piplines_path) if path not in sys.path]
)

In [None]:
from extract_training_data import extract_latest_loaders

# Load the dataloaders through the project helper and bind the two splits
# ('train' and 'validation') that the training cells below iterate over.
dataloaders = extract_latest_loaders()
train_loader, val_loader = dataloaders['train'], dataloaders['validation']

In [99]:
def get_input_example():
    """Build a sample model input from the first batch of ``train_loader``.

    Reads the module-level ``train_loader``, takes the first batch it yields
    and returns the three entries that the model's ``forward`` consumes, each
    moved to CPU (for logging purposes).

    Only those three entries are touched: other fields of the batch (labels,
    or anything that is not a tensor) are neither copied nor required to
    support ``.cpu()``.

    Returns:
        dict: ``"input_ids"``, ``"attention_mask"`` and ``"has_source"``
        mapped to CPU tensors.

    Raises:
        StopIteration: if ``train_loader`` yields no batch.
        KeyError: if the batch lacks one of the three keys.
    """
    batch = next(iter(train_loader))

    model_input_keys = ("input_ids", "attention_mask", "has_source")
    return {key: batch[key].cpu() for key in model_input_keys}

In [100]:
import torch.nn as nn
import torch

from transformers import BertModel


class SentimentAnalysisModel(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    BERT's pooled output is concatenated with one extra per-example feature
    (``has_source``), passed through dropout and projected to ``num_labels``
    logits.

    Args:
        bert_model_name: name or path handed to ``BertModel.from_pretrained``.
        num_labels: number of output classes (size of the logits' last dim).
        dropout: dropout probability applied before the linear layer.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_labels=3, dropout=0.3):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)

        # +1 input feature: the has_source value appended to the pooled output.
        self.linear1 = nn.Linear(self.bert.config.hidden_size + 1, num_labels)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, has_source):
        """Compute class logits for a batch.

        Args:
            input_ids: token ids forwarded to BERT.
            attention_mask: attention mask forwarded to BERT.
            has_source: one value per example, shaped ``(batch,)`` or
                ``(batch, 1)``; any numeric or bool dtype is accepted.
                Presumably a 0/1 "has a source" flag - confirm against the
                data pipeline.

        Returns:
            Tensor of shape ``(batch, num_labels)`` with unnormalised logits.
        """
        embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        # reshape(-1, 1) accepts both (batch,) and (batch, 1) inputs, and the
        # cast keeps the concatenation in the encoder's dtype so that linear1
        # never receives a promoted (e.g. float64) tensor.
        has_source = has_source.reshape(-1, 1).to(embeddings.dtype)
        combined_input = torch.cat((embeddings, has_source), dim=1)

        regularized = self.dropout(combined_input)
        return self.linear1(regularized)


In [101]:
from tqdm import tqdm

import mlflow
import mlflow.pytorch

def train_one_epoch(model, dataloader, optimizer, criterion, device, epoch):
    """Run one optimisation pass over ``dataloader`` and log the mean loss.

    Each batch must be a mapping with ``'input_ids'``, ``'attention_mask'``,
    ``'has_source'`` and ``'labels'`` tensors. The model is put in training
    mode and one optimizer step is taken per batch.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            has_source=...)`` and returning logits.
        dataloader: sized iterable of batches.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to.
        epoch: epoch index, used for the progress bar label and as the MLflow
            ``step``.

    Returns:
        float: mean training loss over the epoch, weighted by batch size. The
        same value is logged to MLflow as ``train_loss``.

    Raises:
        ValueError: if ``dataloader`` yields no samples (previously this
            surfaced as an opaque ``ZeroDivisionError``).
    """
    model.train()
    loss_sum = 0.0
    seen = 0

    progress = tqdm(
        dataloader,
        total=len(dataloader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )

    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        has_source = batch['has_source'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask, has_source=has_source)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        # Weight each batch's loss by its size so that a smaller final batch
        # does not skew the running mean.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        seen += batch_size

        progress.set_postfix({"loss": loss_sum / seen})

    if seen == 0:
        raise ValueError("train_one_epoch: dataloader yielded no samples")

    avg_train_loss = loss_sum / seen
    mlflow.log_metric('train_loss', avg_train_loss, step=epoch)
    return avg_train_loss


def val_one_epoch(model, dataloader, criterion, device, epoch, best_so_far, ckpt_name='model'):
    """Evaluate ``model`` on ``dataloader``, log metrics and checkpoint on improvement.

    Each batch must be a mapping with ``'input_ids'``, ``'attention_mask'``,
    ``'has_source'`` and ``'labels'`` tensors. The model is put in eval mode
    and run without gradient tracking. The mean loss and the accuracy are
    logged to MLflow as ``validation_loss`` and ``validation_accuracy``. When
    the accuracy is strictly greater than ``best_so_far`` the model is logged
    with ``mlflow.pytorch.log_model`` under ``ckpt_name``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            has_source=...)`` and returning logits of shape (batch, classes).
        dataloader: sized iterable of batches.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to.
        epoch: epoch index, used for the progress bar label and as the MLflow
            ``step``.
        best_so_far: best validation accuracy seen before this call.
        ckpt_name: artifact path used when logging the model.

    Returns:
        float: the updated best accuracy (``best_so_far`` if not improved).

    Raises:
        ValueError: if ``dataloader`` yields no samples (previously this
            surfaced as an opaque ``ZeroDivisionError``).
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        progress = tqdm(
            dataloader,
            total=len(dataloader),
            desc=f"Epoch {epoch}: val",
            leave=True,
        )
        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            has_source = batch['has_source'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask, has_source=has_source)

            # Weight the batch loss by its size so the epoch mean is per-sample.
            batch_size = labels.size(0)
            loss_sum += criterion(logits, labels).item() * batch_size

            # Predicted class = index of the largest logit in each row.
            _, preds = torch.max(logits, dim=1)
            correct += (preds == labels).sum().item()
            seen += batch_size

            progress.set_postfix({"loss": loss_sum / seen, "acc": correct / seen})

        if seen == 0:
            raise ValueError("val_one_epoch: dataloader yielded no samples")

        current_acc = correct / seen
        avg_val_loss = loss_sum / seen
        mlflow.log_metric('validation_loss', avg_val_loss, step=epoch)
        mlflow.log_metric('validation_accuracy', current_acc, step=epoch)

        if current_acc > best_so_far:
            print(f"Validation accuracy improved from {best_so_far:.4f} to {current_acc:.4f}. Saving model...")
            mlflow.pytorch.log_model(model, ckpt_name)

            best_so_far = current_acc
    return best_so_far

In [102]:
from mlflow.tracking import MlflowClient

def register_model(run_id, model_name, description, artifact_path=None):
    """
    Registers the model logged in the specified run without modifying the Champion tag.

    Parameters:
    - run_id: str, the ID of the run where the model is logged.
    - model_name: str, the name of the model in the registry.
    - description: str, a description for the model version.
    - artifact_path: str or None, the artifact path the model was logged under
      inside the run. Defaults to `model_name`, which matches models that were
      logged with the registry name as their artifact path.

    Returns:
    - version: the version of the newly registered model, as reported by MLflow.
    """
    if artifact_path is None:
        artifact_path = model_name

    # "runs:/<run_id>/<artifact_path>" points at the model logged inside the run.
    model_uri = f"runs:/{run_id}/{artifact_path}"
    result = mlflow.register_model(model_uri, model_name)
    print(f"Model registered with name '{model_name}' and version '{result.version}'")

    # The description is attached in a second call through the registry client.
    client = MlflowClient()
    client.update_model_version(
        name=model_name,
        version=result.version,
        description=description,
    )
    return result.version


In [103]:
def update_champion_alias(model_name, metric_name="validation_accuracy"):
    """
    Goes through all model versions, checks their metrics, and assigns the "champion" alias to the best-performing model.

    For every registered version the best (maximum) value of `metric_name`
    logged by its source run is used. Versions that are not linked to a run,
    or whose run never logged the metric, are ignored.

    Parameters:
    - model_name: str, the name of the model in the MLflow Model Registry.
    - metric_name: str, the metric to base the Champion selection on; higher is
      treated as better (default is 'validation_accuracy').

    Returns:
    - champion_version: the version of the model that is now assigned the "champion" alias.

    Raises:
    - ValueError: if no version has a run that logged `metric_name`.
    """
    client = MlflowClient()

    # Track the best version found so far (the metric is maximized).
    best_version = None
    best_metric_value = -float('inf')
    best_run_id = None

    for version in client.search_model_versions(f"name='{model_name}'"):
        run_id = version.run_id
        if not run_id:
            # A version registered without a source run has no metrics to compare.
            continue

        run = client.get_run(run_id)
        if metric_name not in run.data.metrics:
            continue

        # run.data.metrics only exposes the most recently logged value, while
        # val_one_epoch stores a checkpoint only when the metric improves. The
        # registered artifact therefore corresponds to the best value in the
        # run's history, not necessarily the last one.
        history = client.get_metric_history(run_id, metric_name)
        metric_value = max(
            (point.value for point in history),
            default=run.data.metrics[metric_name],
        )

        if metric_value > best_metric_value:
            best_metric_value = metric_value
            best_version = version.version
            best_run_id = run_id

    if best_version is None:
        raise ValueError(f"No models found with metric '{metric_name}'")

    # Reassign the "champion" alias to the best version
    client.set_registered_model_alias(
        name=model_name,
        alias="champion",
        version=best_version
    )
    print(f"Model version {best_version} from run {best_run_id} assigned as 'champion' with {metric_name}: {best_metric_value}")

    return best_version


In [105]:
import torch.optim as optim
import os
# Turn off parallelism in the HuggingFace tokenizers library for this kernel.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Store MLflow runs in <project root>/mlruns, reusing the notebook's helper
# instead of re-deriving the root path here.
project_root = get_project_root()
mlflow.set_tracking_uri(f"file://{project_root}/mlruns")

epochs = 3
# Prefer Apple's MPS backend (the device this notebook was written for), but
# fall back to CUDA or CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model_name = 'simple_sentiment_analysis_model'
lr = 2e-5

model_desctiption = "BERT, 1 fc layer, 0.3 dropout"

model = SentimentAnalysisModel(bert_model_name='bert-base-uncased', num_labels=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

best_so_far = 0.
mlflow.set_experiment("SentimentAnalysis")

with mlflow.start_run() as run:
    run_id = run.info.run_id
    # Log the values actually used above rather than repeating literals.
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param("epochs", epochs)
    for epoch in range(epochs):
        train_one_epoch(model, train_loader, optimizer, criterion, device, epoch)
        # model_name doubles as the artifact path, which is what register_model
        # looks up inside the run.
        best_so_far = val_one_epoch(model, val_loader, criterion, device, epoch, best_so_far, model_name)
    register_model(run_id, model_name, model_desctiption)
    update_champion_alias(model_name)

Epoch 0: train: 100%|██████████| 106/106 [00:42<00:00,  2.47it/s, loss=0.86] 
Epoch 0: val: 100%|██████████| 52/52 [00:07<00:00,  6.80it/s, loss=0.612, acc=0.762]


Validation accuracy improved from 0.0000 to 0.7619. Saving model...


Epoch 1: train: 100%|██████████| 106/106 [00:53<00:00,  1.99it/s, loss=0.477]
Epoch 1: val: 100%|██████████| 52/52 [00:09<00:00,  5.39it/s, loss=0.498, acc=0.817]


Validation accuracy improved from 0.7619 to 0.8168. Saving model...


Epoch 2: train: 100%|██████████| 106/106 [01:02<00:00,  1.70it/s, loss=0.277]
Epoch 2: val: 100%|██████████| 52/52 [00:10<00:00,  5.04it/s, loss=0.49, acc=0.835] 


Validation accuracy improved from 0.8168 to 0.8348. Saving model...




Model registered with name 'simple_sentiment_analysis_model' and version '2'
Model version 2 from run be0b25db674f4b2793939799d9ce6b11 assigned as 'champion' with validation_accuracy: 0.8348402652200121


Registered model 'simple_sentiment_analysis_model' already exists. Creating a new version of this model...
Created version '2' of model 'simple_sentiment_analysis_model'.
