In [1]:
import os

In [2]:
from sklearn.utils.class_weight import compute_class_weight

In [3]:
%pwd

'/Users/CalebE/Documents/Text-Summarization/research'

In [4]:
# Step up one directory so later relative paths resolve from the parent folder
# (the recorded %pwd outputs show .../research before and its parent after).
# NOTE: not idempotent - every re-run of this cell climbs one more level.
os.chdir("../")

In [5]:
%pwd

'/Users/CalebE/Documents/Text-Summarization'

In [5]:
# TrainingParams:
#   model_type: 'roberta'
#   model_name: 'roberta-base'
#   n_epochs: 4
#   train_batch_size: 16
#   eval_batch_size: 16
#   lr: 3e-5
#   class_weights: None
#   reprocess_input_data: True
#   overwrite_output_dir: True
#   fp16: False
#   # use_cuda: True
#   do_lower_case: False
#   manual_seed: 2
#   use_multiprocessing: False
#   use_multiprocessing_for_evaluation: False
#   thread_count: 1
#   save_eval_checkpoints: False
#   save_model_every_epoch: False
#   # use_early_stopping: True
#   early_stopping_patience: 2
#   early_stopping_metric: 'eval_loss'
#   early_stopping_metric_minimize: True


In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    ConfigurationManager.get_model_trainer_config fills the three path fields
    from the ``model_trainer`` section of the config file and every remaining
    field from the ``TrainingParams`` section of the params file.
    """
    # --- Locations (from config.model_trainer) ---
    # Directory created by ConfigurationManager before the config is returned.
    root_dir: Path
    # CSV file read by DataLoaders.load_data.
    data_path: Path
    # Parent folder of the outputs/, cache/ and runs/ directories given to the model.
    output_dir: Path

    # --- Model selection (from params.TrainingParams) ---
    model_type: str
    model_name: str

    # --- Training hyper-parameters (from params.TrainingParams) ---
    n_epochs: int
    train_batch_size: int
    eval_batch_size: int
    lr: float
    # NOTE(review): annotated as None rather than a real type. ModelTrainer.train_model
    # does not read this field; it derives class weights from the training data.
    class_weights: None

    # --- Flags and settings forwarded to the model's args dict ---
    reprocess_input_data: bool
    overwrite_output_dir: bool
    fp16: bool
    do_lower_case: bool
    manual_seed: int
    use_multiprocessing: bool
    use_multiprocessing_for_evaluation: bool
    thread_count: int
    save_eval_checkpoints: bool
    save_model_every_epoch: bool
    early_stopping_metric: str
    early_stopping_metric_minimize: bool
    early_stopping_patience: int

    # --- Device preference (from params.TrainingParams) ---
    use_cuda: bool

In [7]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories
import warnings
warnings.filterwarnings("ignore")


class ConfigurationManager:
    """Loads the config and params files via read_yaml and builds stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both files and create the artifacts root directory.

        Args:
            config_filepath: Path handed to read_yaml for the config file.
            params_filepath: Path handed to read_yaml for the params file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble a ModelTrainerConfig from the loaded config and params.

        Creates the trainer's root directory as a side effect.

        Returns:
            ModelTrainerConfig whose path fields come from ``config.model_trainer``
            and whose remaining fields come from ``params.TrainingParams``.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.TrainingParams

        create_directories([trainer_section.root_dir])

        # Fields copied from the config file's model_trainer section.
        path_fields = ("root_dir", "data_path", "output_dir")
        # Fields copied from the params file's TrainingParams section.
        param_fields = (
            "model_type",
            "model_name",
            "n_epochs",
            "train_batch_size",
            "eval_batch_size",
            "lr",
            "class_weights",
            "reprocess_input_data",
            "overwrite_output_dir",
            "fp16",
            "do_lower_case",
            "manual_seed",
            "use_multiprocessing",
            "use_multiprocessing_for_evaluation",
            "thread_count",
            "save_eval_checkpoints",
            "save_model_every_epoch",
            "early_stopping_metric",
            "early_stopping_metric_minimize",
            "early_stopping_patience",
            "use_cuda",
        )

        settings = {field: getattr(trainer_section, field) for field in path_fields}
        settings.update((field, getattr(training_params, field)) for field in param_fields)

        return ModelTrainerConfig(**settings)

In [8]:
# DataLoaders reads the CSV and reshapes it into a two-column frame: 'text' (taken from 'text_short') and 'labels' (taken from 'label').

import pandas as pd

class DataLoaders:
    """Loads the training CSV and draws a fixed-size, per-class sample from it."""

    # String label in the CSV -> integer class id used for training.
    LABEL_TO_ID = {'NOT': 0, 'TIN': 1, 'UNT': 2}
    # Rows requested per class id when the caller does not override them.
    DEFAULT_SAMPLE_SIZES = {0: 200, 1: 200, 2: 100}

    def __init__(self, config: "ModelTrainerConfig"):
        """Keep the trainer config; only ``config.data_path`` is read here."""
        self.config = config

    def load_data(self):
        """Read the CSV at ``config.data_path`` into a DataFrame."""
        return pd.read_csv(self.config.data_path)

    def preprocess_data(self, data, sample_sizes=None, random_state=100):
        """Build the ('text', 'labels') training frame and sample it per class.

        Args:
            data: DataFrame with a 'text_short' column and a 'label' column
                holding the strings 'NOT', 'TIN' or 'UNT'.
            sample_sizes: Optional mapping of class id -> number of rows to
                draw. Defaults to 200 / 200 / 100 for ids 0 / 1 / 2.
            random_state: Seed passed to ``DataFrame.sample`` for every class.

        Returns:
            DataFrame with columns 'text' and 'labels', the per-class samples
            concatenated in ``sample_sizes`` order, with a fresh 0..n-1 index.

        Notes:
            * A class with fewer rows than requested contributes all of its
              rows instead of raising ``ValueError`` from ``sample``.
            * Rows whose label is not in LABEL_TO_ID map to NaN and are never
              selected.
        """
        if sample_sizes is None:
            sample_sizes = self.DEFAULT_SAMPLE_SIZES

        labelled = pd.DataFrame({
            'text': data['text_short'],
            'labels': data['label'].map(self.LABEL_TO_ID),
        })

        per_class = []
        for class_id, requested in sample_sizes.items():
            class_rows = labelled[labelled['labels'] == class_id]
            # sample() without replacement cannot return more rows than exist.
            take = min(requested, len(class_rows))
            if take > 0:
                per_class.append(class_rows.sample(n=take, random_state=random_state))

        if not per_class:
            # Nothing matched any requested class: return an empty frame with
            # the expected columns rather than failing inside pd.concat.
            return labelled.iloc[0:0].reset_index(drop=True)

        return pd.concat(per_class).reset_index(drop=True)

    def load_and_preprocess_data(self):
        """Convenience wrapper: load_data followed by preprocess_data."""
        return self.preprocess_data(self.load_data())



In [11]:
from simpletransformers.classification import ClassificationModel
import torch

class ModelTrainer:
    """Trains, evaluates and queries a simpletransformers ClassificationModel."""

    # Number of target classes the classifier is built with.
    NUM_LABELS = 3

    def __init__(self, config: "ModelTrainerConfig", data_loader: "DataLoaders"):
        """Store the config and data loader; the model is created by train_model."""
        self.config = config
        self.model = None
        self.data_loader = data_loader

    def _compute_class_weights(self, labels):
        """Return 'balanced' class weights as a list indexed by class id.

        Weights are paired with their class by value (not by position in
        ``unique()``, which is first-appearance order). A class id that is
        absent from ``labels`` gets a neutral weight of 1.0, so the list always
        has NUM_LABELS entries.
        """
        classes = labels.unique()
        weights = compute_class_weight('balanced', classes=classes, y=labels)
        weight_by_class = {int(cls): float(w) for cls, w in zip(classes, weights)}
        return [weight_by_class.get(class_id, 1.0) for class_id in range(self.NUM_LABELS)]

    def train_model(self):
        """Load the training data, build the classifier and train it.

        Side effects:
            Sets ``self.model`` and trains it on the frame returned by
            ``data_loader.load_and_preprocess_data()``.
        """
        train_df = self.data_loader.load_and_preprocess_data()
        class_weights = self._compute_class_weights(train_df['labels'])

        # Use the GPU only when the config asks for it AND one is available.
        use_cuda = bool(self.config.use_cuda) and torch.cuda.is_available()

        output_dir = self.config.output_dir
        self.model = ClassificationModel(
            self.config.model_type,
            self.config.model_name,
            num_labels=self.NUM_LABELS,
            # Class weights go through the constructor's `weight` argument
            # (one entry per label); a 'weight' key inside `args` is not used
            # for the loss.
            weight=class_weights,
            use_cuda=use_cuda,
            args={
                'output_dir': f"{output_dir}/outputs",
                'cache_dir': f"{output_dir}/cache",
                'tensorboard_dir': f"{output_dir}/runs",
                # Forward the configured hyper-parameters; without these keys
                # the library falls back to its own defaults.
                'num_train_epochs': self.config.n_epochs,
                'train_batch_size': self.config.train_batch_size,
                'eval_batch_size': self.config.eval_batch_size,
                # float(): a YAML loader may hand back '3e-5' as a string
                # (presumably the case for read_yaml - TODO confirm).
                'learning_rate': float(self.config.lr),
                'reprocess_input_data': self.config.reprocess_input_data,
                'overwrite_output_dir': self.config.overwrite_output_dir,
                'fp16': self.config.fp16,
                'do_lower_case': self.config.do_lower_case,
                'manual_seed': self.config.manual_seed,
                'use_multiprocessing': self.config.use_multiprocessing,
                'use_multiprocessing_for_evaluation': self.config.use_multiprocessing_for_evaluation,
                'thread_count': self.config.thread_count,
                'save_eval_checkpoints': self.config.save_eval_checkpoints,
                'save_model_every_epoch': self.config.save_model_every_epoch,
                # NOTE(review): these are forwarded as-is, but nothing here
                # sets 'use_early_stopping', so they may have no effect.
                'early_stopping_metric': self.config.early_stopping_metric,
                'early_stopping_metric_minimize': self.config.early_stopping_metric_minimize,
                'early_stopping_patience': self.config.early_stopping_patience,
            },
        )

        self.model.train_model(train_df)

    def evaluate_model(self, eval_df):
        """Evaluate the trained model on ``eval_df``.

        Returns:
            The ``(result, model_outputs, wrong_predictions)`` tuple produced
            by the model's ``eval_model``.
        """
        result, model_outputs, wrong_predictions = self.model.eval_model(eval_df)
        return result, model_outputs, wrong_predictions

    def predict(self, data):
        """Predict labels for the texts in ``data['text']``.

        Args:
            data: Mapping or DataFrame whose 'text' entry is an iterable of strings.

        Returns:
            ``(predictions, raw_outputs)`` from the model's ``predict``.
        """
        # The model's predict() is documented to take a plain Python list.
        texts = list(data['text'])
        predictions, raw_outputs = self.model.predict(texts)
        return predictions, raw_outputs

In [13]:
# try:
#     config_manager = ConfigurationManager()
#     config = config_manager.get_model_trainer_config()
#     data_loader = DataLoaders(config)
#     model_trainer = ModelTrainer(config, data_loader)
#     model_trainer.train_model()
# except Exception as e:
#     raise e