# In [None]:
# Import necessary libraries
import re
from typing import Optional, Any
from pydantic import BaseModel, ValidationError, validator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import json
import torch

from transformers import (
    AdamW,
    AutoModel,
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification
)
from transformers import TrainingArguments, Trainer
from datasets import load_metric, Dataset
from nltk.tokenize import word_tokenize
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')

# Data Structure Definition
class DatasetStructure(BaseModel):
    """Validated record for one dataset row.

    Text fields have runs of newlines/tabs removed and semicolons replaced by
    commas. Numeric fields are converted to ``float`` before field validation
    (the validator is registered with ``pre=True``); values that cannot be
    parsed, are zero, or are NaN are replaced by the placeholder string
    ``"Unknown"``.
    """

    # Free-text fields, cleaned by ``process_text``.
    name: str
    content: str
    attributes: str
    # Numeric fields, normalised by ``process_numeric``; they hold the string
    # "Unknown" when the raw value is missing, zero or unparseable.
    category_a: int|str
    category_b: int|str
    category_c: int|str
    value_1: int|str
    value_2: int|str
    metric_1: float|str
    metric_2: float|str
    metric_3: float|str
    # Optional label; defaults to an empty string when not supplied.
    classification: Optional[str] = ""

    @validator('name', 'content', 'attributes')
    @classmethod
    def process_text(cls, value):
        """Remove newline/tab runs and replace every ``;`` with ``,``."""
        value = re.sub(r"\n+|\t+", "", value)
        return value.replace(";", ",")

    @validator('category_a', 'category_b', 'category_c', 'value_1', 'value_2', 'metric_1', 'metric_2', 'metric_3', pre=True)
    @classmethod
    def process_numeric(cls, value):
        """Return ``value`` as a float, or ``"Unknown"`` when it is unusable.

        A value is unusable when ``float()`` rejects it, when it equals zero,
        or when it is NaN.
        """
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return "Unknown"
        # ``number != number`` is true only for NaN, which is what pandas
        # produces for missing cells; treat it like any other missing value.
        if number == 0.0 or number != number:
            return "Unknown"
        return number

def compose_data_object(data_object: DatasetStructure) -> str:
    """Serialise a record into a single ``###TAG: value</s>`` sequence.

    Segments are emitted in a fixed order: NAME, CONTENT, ATTRIBUTES, VALUE1,
    VALUE2, CATA, CATB, CATC, METRIC1, METRIC2, METRIC3. The
    ``classification`` field is not part of the output.
    """
    record = data_object
    segments = [
        f"###NAME: {record.name}</s>",
        f"###CONTENT: {record.content}</s>",
        f"###ATTRIBUTES: {record.attributes}</s>",
        f"###VALUE1: {record.value_1}</s>",
        f"###VALUE2: {record.value_2}</s>",
        f"###CATA: {record.category_a}</s>",
        f"###CATB: {record.category_b}</s>",
        f"###CATC: {record.category_c}</s>",
        f"###METRIC1: {record.metric_1}</s>",
        f"###METRIC2: {record.metric_2}</s>",
        f"###METRIC3: {record.metric_3}</s>",
    ]
    return "".join(segments)

# Reproducibility (random seeding)
def seed_all(seed_value, *, deterministic=True):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_value: Seed passed to ``random``, ``numpy.random`` and ``torch``.
        deterministic: When true (the default) the cuDNN autotuner is
            disabled and cuDNN is asked for deterministic algorithms, so
            repeated runs with the same seed can match. Pass ``False`` to
            re-enable the autotuner and trade reproducibility for speed.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
    # Seeding alone is not enough on GPU: with the autotuner on, cuDNN may
    # pick different (possibly non-deterministic) kernels from run to run.
    deterministic = bool(deterministic)
    torch.backends.cudnn.benchmark = not deterministic
    torch.backends.cudnn.deterministic = deterministic

# Training Configuration
class ModelTrainer:
    """Fine-tune a sequence-classification model and run single-text inference.

    Typical use: construct, call ``prepare_model`` once, then ``train`` and/or
    ``inference``.
    """

    def __init__(self, model_name="intfloat/multilingual-e5-base"):
        """Load the tokenizer for ``model_name`` and pick the compute device."""
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Populated by prepare_model(); None lets misuse fail with a clear error.
        self.model = None

    def _require_model(self):
        """Return the loaded model or raise if prepare_model() was not called."""
        model = getattr(self, "model", None)
        if model is None:
            raise RuntimeError(
                "Model is not initialised; call prepare_model() first."
            )
        return model

    def prepare_model(self, num_labels):
        """Load the classification model with ``num_labels`` outputs onto the device."""
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=num_labels,
            ignore_mismatched_sizes=True
        ).to(self.device)

    def train(self, train_dataset, test_dataset, output_dir="./private_results"):
        """Run ``Trainer.train`` and return its result.

        Trains for 8 epochs, with evaluation, logging and checkpointing once
        per epoch; ``test_dataset`` is used as the evaluation set.

        Args:
            train_dataset: Training set handed to ``Trainer`` unchanged
                (presumably tokenised examples with labels; confirm at caller).
            test_dataset: Evaluation set handed to ``Trainer`` unchanged.
            output_dir: Directory for checkpoints.

        Raises:
            RuntimeError: If ``prepare_model`` has not been called.
        """
        model = self._require_model()
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=8,
            per_device_train_batch_size=14,
            per_device_eval_batch_size=8,
            weight_decay=0.001,
            logging_dir='./private_logs',
            load_best_model_at_end=True,
            learning_rate=1e-5,
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            save_strategy='epoch',
            save_total_limit=1
        )

        trainer = Trainer(
            model=model,
            tokenizer=self.tokenizer,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=self.compute_metrics
        )

        return trainer.train()

    @staticmethod
    def compute_metrics(pred):
        """Return ``{'F1': weighted_f1}`` from arg-max predictions vs. label ids."""
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        return {'F1': f1}

    def inference(self, text: str, int_to_label_map: dict) -> str:
        """Classify one text and return its label.

        Args:
            text: Input string to tokenise and classify.
            int_to_label_map: Mapping from class index to label.

        Returns:
            ``int_to_label_map`` entry for the highest-scoring class.

        Raises:
            RuntimeError: If ``prepare_model`` has not been called.
        """
        model = self._require_model()
        # Switch off dropout etc. so predictions are deterministic.
        model.eval()
        inputs = self.tokenizer(
            text, padding=False, return_tensors="pt", truncation=True
        ).to(self.device)
        # No gradients are needed for prediction; skip building the graph.
        with torch.no_grad():
            output = model(**inputs)
        predicted_class = int(output.logits[0].argmax())
        return int_to_label_map[predicted_class]

# Usage Example:
def main():
    """Example pipeline: load data, fine-tune the classifier, print a report.

    Raises:
        ValueError: If the placeholder data loading has not been replaced and
            the DataFrame is empty.
    """
    seed_all(3258976)

    # Load and preprocess data
    private_data = pd.DataFrame()  # Your data loading logic here
    if private_data.empty:
        raise ValueError(
            "No data loaded: replace the placeholder DataFrame in main() "
            "with your data loading logic."
        )

    # Prepare labels
    unique_labels = private_data['classification'].unique()
    label_to_int = {label: i for i, label in enumerate(unique_labels)}
    int_to_label = {i: label for label, i in label_to_int.items()}

    # Process data
    X = [compose_data_object(DatasetStructure(**obj)) for obj in private_data.to_dict(orient="records")]
    y = private_data['classification'].map(label_to_int).values.tolist()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3258976)

    # Initialize trainer
    trainer = ModelTrainer()
    trainer.prepare_model(num_labels=len(unique_labels))

    def to_dataset(texts, labels):
        """Pair texts with their labels and tokenise them for the Trainer."""
        dataset = Dataset.from_dict({"text": texts, "label": labels})
        return dataset.map(
            lambda batch: trainer.tokenizer(batch["text"], truncation=True),
            batched=True,
            remove_columns=["text"],
        )

    # Train model. The Trainer needs tokenised inputs together with their
    # labels; raw strings alone carry neither.
    # NOTE(review): the test split doubles as the per-epoch evaluation set.
    trainer.train(to_dataset(X_train, y_train), to_dataset(X_test, y_test))

    # Evaluate. inference() returns label names, so map the integer targets
    # back to names before comparing.
    y_pred = [trainer.inference(text, int_to_label) for text in X_test]
    y_true = [int_to_label[label_id] for label_id in y_test]
    print(classification_report(y_true, y_pred, digits=4))

# Run the example pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()