In [20]:
%pwd

'/home/dahir/deedax/TLDR'

In [33]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
  root_dir: Path
  STATUS_FILE: str
  ALL_REQUIRED_FILES: list

In [34]:
from TLDR.constants import *
from TLDR.utils.common import read_yaml, create_directories

In [35]:
class ConfigurationManager:
  def __init__(
    self,
    config_filepath = CONFIG_FILE_PATH,
    params_filepath = PARAMS_FILE_PATH,
  ):
    self.config = read_yaml(config_filepath)
    self.params = read_yaml(params_filepath)

    create_directories([self.config.artifacts_root])

  def get_data_validation_config(self) -> DataValidationConfig:
    config = self.config.data_validation
    
    create_directories([config.root_dir])
    
    data_validation_config = DataValidationConfig(
      root_dir = Path(config.root_dir),
      STATUS_FILE = config.STATUS_FILE,
      ALL_REQUIRED_FILES = config.ALL_REQUIRED_FILES,
    )
    
    return data_validation_config

In [36]:
import os
from TLDR.logging import logger

In [43]:
class DataValidation:
  def __init__(self, config: DataValidationConfig):
    self.config = config

  def validate_all_files_exist(self) -> bool:
    try:
      validation_status = None
      all_existing_files = os.listdir(os.path.join('artifacts', 'data_ingestion', 'bank-additional'))
      for file in self.config.ALL_REQUIRED_FILES:
        if file in all_existing_files:
          validation_status = True
          logger.info(f'Validation Status for {file}: {validation_status}')
          with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f'Validation Status: {validation_status}')
        else:
          validation_status = False
          logger.info(f'Validation Status for {file}: {validation_status}')
          with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f'Validation Status: {validation_status}')
    except Exception as e:
      raise e

In [44]:
try:
  config = ConfigurationManager()
  data_validation_config = config.get_data_validation_config()
  data_validation = DataValidation(data_validation_config)
  data_validation.validate_all_files_exist()
except Exception as e:
  raise e

[2023-10-17 21:45:17,832: INFO: common] yaml file: <_io.TextIOWrapper name='config/config.yaml' mode='r' encoding='UTF-8'> read successfully.]
[2023-10-17 21:45:17,837: INFO: common] yaml file: <_io.TextIOWrapper name='params.yaml' mode='r' encoding='UTF-8'> read successfully.]
[2023-10-17 21:45:17,839: INFO: common] Created directory: artifacts]
[2023-10-17 21:45:17,840: INFO: common] Created directory: artifacts/data_validation]
[2023-10-17 21:45:17,842: INFO: 177820508] Validation Status for bank-additional.csv: True]
[2023-10-17 21:45:17,844: INFO: 177820508] Validation Status for bank-additional-full.csv: True]


# Data Transformation

In [57]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
  root_dir: Path
  data_path: Path
  tokenizer_name: Path

In [64]:
from TLDR.constants import *
from TLDR.utils.common import read_yaml, create_directories

In [65]:
class ConfigutationManager:
  def __init__(
    self,
    config_filepath = CONFIG_FILE_PATH,
    params_filepath = PARAMS_FILE_PATH,
  ):
    self.config = read_yaml(config_filepath)
    self.params = read_yaml(params_filepath)

    create_directories([self.config.artifacts_root])

  def get_data_transformation_config(self) -> DataTransformationConfig:

    config = self.config.data_transformation

    create_directories([config.root_dir])

    data_transformation_config = DataTransformationConfig(
      root_dir = config.root_dir,
      data_path = config.data_path,
      tokenizer_name = config.tokenizer_name,
    )
    return data_transformation_config


In [66]:
import os
from TLDR.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

In [67]:
class DataTransformation:
  def __init__(self, config: DataTransformationConfig):
    self.config = config
    self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

In [68]:
class DataTransformation:
  def __init__(self, config: DataTransformationConfig):
    self.config = config
    self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

  def convert_examples_to_features(self, example_batch):
    input_encodings = self.tokenizer(example_batch['dialogue'], max_length = 1024, truncation = True)

    with self.tokenizer.as_target_tokenizer():
      target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)

    return {
      'input_ids': input_encodings['input_ids'],
      'attention_mask': input_encodings['attention_mask'],
      'labels': target_encodings['input_ids'],
    }
  
  def convert(self):
    dset = load_from_disk(self.config.data_path)
    dset_pt = dset.map(self.convert_examples_to_features, batched = True)
    dset_pt.save_to_disk(os.path.join(self.config.root_dir, 'dset')) 

In [69]:
try:
  config = ConfigutationManager()
  data_transformation_config = config.get_data_transformation_config()
  data_transformation = DataTransformation(data_transformation_config)
  data_transformation.convert()
except Exception as e:
  raise e

[2023-10-18 01:24:17,729: INFO: common] yaml file: <_io.TextIOWrapper name='config/config.yaml' mode='r' encoding='UTF-8'> read successfully.]
[2023-10-18 01:24:17,734: INFO: common] yaml file: <_io.TextIOWrapper name='params.yaml' mode='r' encoding='UTF-8'> read successfully.]
[2023-10-18 01:24:17,738: INFO: common] Created directory: artifacts]
[2023-10-18 01:24:17,740: INFO: common] Created directory: artifacts/data_transformation]


100%|██████████| 15/15 [00:04<00:00,  3.53ba/s]
100%|██████████| 1/1 [00:00<00:00,  4.36ba/s]
100%|██████████| 1/1 [00:00<00:00,  5.07ba/s]


# 04 Model Training Pipeline

In [4]:
import os
os.chdir('..')

In [5]:
%pwd

'/home/dahir/deedax/TLDR'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
  root_dir: Path
  data_path: Path
  model_ckpt: Path
  num_train_epochs: int
  warmup_steps: int
  per_device_train_batch_size: int
  weight_decay: float
  logging_steps: int
  evaluation_strategy: str
  eval_steps: int
  save_steps: float
  gradient_accumulation_steps: int

In [7]:
from TLDR.constants import *
from TLDR.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
  def __init__(
    self,
    config_filepath = CONFIG_FILE_PATH,
    params_filepath = PARAMS_FILE_PATH,
  ):
    self.config = read_yaml(config_filepath)
    self.params = read_yaml(params_filepath)

    create_directories([self.config.artifacts_root])

  def get_model_trainer_config(self) -> ModelTrainerConfig:
    config = self.config.model_trainer
    params = self.params.TrainingArguments

    create_directories([config.root_dir])

    model_trainer_config = ModelTrainerConfig(
      root_dir = config.root_dir,
      data_path = config.data_path,
      model_ckpt = config.model_ckpt,
      num_train_epochs = params.num_train_epochs,
      warmup_steps = params.warmup_steps,
      per_device_train_batch_size = params.per_device_train_batch_size,
      weight_decay = params.weight_decay,
      logging_steps = params.logging_steps,
      evaluation_strategy = params.evaluation_strategy,
      eval_steps = params.eval_steps,
      save_steps = params.save_steps,
      gradient_accumulation_steps = params.gradient_accumulation_steps,
    )

    return model_trainer_config

In [9]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
class ModelTrainer:
  def __init__(self, config: ModelTrainerConfig):
    self.config = config

  def train(self):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
    model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
    seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    dataset = load_from_disk(self.config.data_path)

    trainer_args = TrainingArguments(
      output_dir = self.config.root_dir,
      num_train_epochs = self.config.num_train_epochs,
      per_device_train_batch_size = self.config.per_device_train_batch_size,
      warmup_steps = self.config.warmup_steps,
      weight_decay = self.config.weight_decay,
      logging_steps = self.config.logging_steps,
      evaluation_strategy = self.config.evaluation_strategy,
      eval_steps = self.config.eval_steps,
      save_steps = self.config.save_steps,
      gradient_accumulation_steps = self.config.gradient_accumulation_steps,
    )

    trainer = Trainer(
      model = model,
      args = trainer_args,
      tokenizer = tokenizer,
      train_dataset = dataset['test'],
      eval_dataset = dataset['validation'],
      data_collator = seq2seq_data_collator,
    )

    trainer.train()

    model.save_pretrained(os.path.join(self.config.root_dir, 'model'))

    tokenizer.save_pretrained(os.path.join(self.config.root_dir, 'tokenizer')) 

In [12]:
try:
  config = ConfigurationManager()
  model_trainer_config = config.get_model_trainer_config()
  model_trainer = ModelTrainer(model_trainer_config)
  model_trainer.train()
except Exception as e:
  raise e 

[2023-10-19 22:23:12,903: INFO: common] yaml file: config/config.yaml read successfully.]
[2023-10-19 22:23:12,908: INFO: common] yaml file: params.yaml read successfully.]
[2023-10-19 22:23:12,911: INFO: common] Created directory: artifacts]
[2023-10-19 22:23:12,913: INFO: common] Created directory: artifacts/model_trainer]


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


# Model evaluation

In [26]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
  root_dir: Path
  data_path: Path
  model_path: Path
  tokenizer_path: Path
  metric_file_name: Path

In [27]:
from TLDR.constants import *
from TLDR.utils.common import read_yaml, create_directories

In [29]:
class ConfigurationManager:
  def __init__(
    self,
    config_filepath = CONFIG_FILE_PATH,
    params_filepath = PARAMS_FILE_PATH,
  ):
    self.config = read_yaml(config_filepath)
    self.params = read_yaml(params_filepath)

    create_directories([self.config.artifacts_root])

  def get_model_evaluation_config(self) -> ModelEvaluationConfig:
    config = self.config.model_evaluation

    create_directories([config.root_dir])
    
    model_evaluation_config = ModelEvaluationConfig(
    root_dir = config.root_dir,
    data_path = config.data_path,
    model_path = config.model_path,
    tokenizer_path = config.tokenizer_path,
    metric_file_name = config.metric_file_name,
    )

    return model_evaluation_config

In [30]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm 

In [37]:
class ModelEvaluation:
  def __init__(self, config: ModelEvaluationConfig):
    self.config = config

  def generate_batch_sized_chunks(self, list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
      yield list_of_elements[i:i + batch_size]

  def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                  batch_size = 16, device = 'cuda' if torch.cuda.is_available() else 'cpu',
                                  column_text = 'article', column_summary = 'highlights'):
    article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
      zip(article_batches, target_batches), total = len(article_batches)):
      # Tokenize the batch of articles
      inputs = tokenizer(article_batch, padding = 'max_length', truncation = True, max_length = 1024, return_tensors = 'pt')
      summaries = model.generate(inputs['input_ids'].to(device), 
                                 attention_mask = inputs['attention_mask'].to(device),
                                 length_penalty = 0.8, num_beams = 8, max_length = 128)
      decoded_summaries = [
        tokenizer.decode(s, skip_special_tokens = True, clean_up_tokenization_spaces = True) for s in summaries
      ]
      decoded_summaries = [d.replace('', ' ') for d in decoded_summaries]
      metric.add_batch(predictions = decoded_summaries, references = target_batch)
      score = metric.compute()
      return score
    
  def evaluate(self):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

    dataset = load_from_disk(self.config.data_path)

    rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    rouge = load_metric('rouge')
    
    score = self.calculate_metric_on_test_ds(dataset['test'][0:10], rouge, model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary = 'summary')
    
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
    
    df = pd.DataFrame(rouge_dict, index=['flan-t5-small'])
    df.to_csv(self.config.metric_file_name, index=False)

In [38]:
try:
  config = ConfigurationManager()
  model_evaluation_config = config.get_model_evaluation_config()
  model_evaluation = ModelEvaluation(model_evaluation_config)
  model_evaluation.evaluate()
except Exception as e:
  raise e

[2023-10-20 18:34:07,078: INFO: common] yaml file: config/config.yaml read successfully.]
[2023-10-20 18:34:07,085: INFO: common] yaml file: params.yaml read successfully.]
[2023-10-20 18:34:07,087: INFO: common] Created directory: artifacts]
[2023-10-20 18:34:07,088: INFO: common] Created directory: artifacts/model_evaluation]


  0%|          | 0/5 [00:00<?, ?it/s]

[2023-10-20 18:34:20,269: INFO: rouge_scorer] Using default tokenizer.]


  0%|          | 0/5 [00:08<?, ?it/s]
