In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed.
%pwd

'c:\\Users\\lenovo\\Desktop\\Mini_Translator\\research'

In [3]:
# Step up from the notebook's `research/` folder to the project root so the
# relative paths used by later cells (e.g. the YAML config files) resolve.
# The guard makes the cell idempotent: without it, every re-run climbs one
# directory higher and silently breaks all relative paths that follow.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Show the working directory again to check the result of the chdir cell.
%pwd

'c:\\Users\\lenovo\\Desktop\\Mini_Translator'

In [25]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings for the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    The declaration order below is also the positional-argument order of the
    generated ``__init__`` and is therefore kept unchanged.
    """

    # Directory that receives the artifacts produced by this stage.
    root_dir: Path
    # Location of the input data (not read by the code visible in this notebook).
    data_path: Path

    # Names of the tokenizer pipelines for the first and second language.
    tokenizer_1: str
    tokenizer_2: str

    # Keys of the two text columns in each example (e.g. language codes).
    lang1: str
    lang2: str

    # Marker tokens placed at the start / end of every token sequence.
    sos_token: str
    eos_token: str

    # Maximum number of tokens kept per sentence (markers are added on top).
    max_length: int
    # Whether tokens are lower-cased.
    lower: bool

    # Data-loader location (not read by the code visible in this notebook).
    data_loader: Path
    # Number of examples per batch.
    batch_size: int


In [26]:
from src.Mini_Translator.constants import *
from src.Mini_Translator.utils.common import read_yaml, create_directories

In [27]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the YAML file with paths / stage settings.
            params_filepath: Path of the YAML file with the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the top-level artifacts directory named in the config file.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create its directory.

        Path and tokenizer settings are taken from the ``data_transformation``
        section of the config file; language, token, length, casing and batch
        settings are taken from the params file.

        Returns:
            A populated ``DataTransformationConfig``.
        """
        stage = self.config.data_transformation
        hyper = self.params

        # Create the stage's output directory.
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            # Values from the config file.
            root_dir=stage.root_dir,
            data_path=stage.data_path,
            tokenizer_1=stage.tokenizer_1,
            tokenizer_2=stage.tokenizer_2,
            # Values from the params file.
            lang1=hyper.lang1,
            lang2=hyper.lang2,
            sos_token=hyper.sos_token,
            eos_token=hyper.eos_token,
            max_length=hyper.max_length,
            lower=hyper.lower,
            # Remaining config-file / params-file values.
            data_loader=stage.data_loader,
            batch_size=hyper.batch_size,
        )


In [28]:
import os
from src.Mini_Translator.logging import logger
import spacy
from datasets import load_dataset, load_from_disk
import json

In [31]:
import json
import spacy
import pandas as pd
import os
from datasets import Dataset, DatasetDict
import torchtext
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch
from src.Mini_Translator.logging import logger  # Make sure to import the logger

class DataTransformation:
    """Tokenizes, numericalizes and batches a two-language text dataset.

    The stage reads the train / validation / test JSON files listed under
    ``data_ingestion.data_files`` in the YAML config, tokenizes both text
    columns with spaCy, builds one vocabulary per language from the training
    split, converts tokens to ids, and writes the datasets and pre-batched
    data loaders under ``config.root_dir``.
    """

    # Vocabulary settings: tokens seen fewer than MIN_FREQ times in the
    # training split are left out of the vocabulary and map to UNK_TOKEN.
    MIN_FREQ = 2
    UNK_TOKEN = "<unk>"
    PAD_TOKEN = "<pad>"

    def __init__(self, config: DataTransformationConfig, config_filepath=CONFIG_FILE_PATH):
        """Store the stage settings and load the full YAML config.

        Args:
            config: Settings for this stage.
            config_filepath: Path of the YAML config file; it is read again
                here because the input file paths live in its
                ``data_ingestion`` section.
        """
        self.config = config
        # The spaCy pipelines named by config.tokenizer_1 / tokenizer_2 must
        # already be installed (e.g. `python -m spacy download <name>`);
        # nothing is downloaded here.
        self.config2 = read_yaml(config_filepath)

    def tokenize_example(self, example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
        """Tokenize both text fields of one example.

        Args:
            example: Mapping holding the raw text under ``config.lang1`` and
                ``config.lang2``.
            en_nlp: spaCy pipeline used for the ``lang1`` text.
            de_nlp: spaCy pipeline used for the ``lang2`` text.
            max_length: Maximum number of tokens kept per text (the two
                marker tokens are added on top of this).
            lower: If true, tokens are lower-cased.
            sos_token: Token prepended to each sequence.
            eos_token: Token appended to each sequence.

        Returns:
            Dict with ``"<lang1>_tokens"`` and ``"<lang2>_tokens"`` lists.
        """
        def to_tokens(nlp, text):
            # Truncate first, then (optionally) lower-case, then add markers.
            tokens = [token.text for token in nlp.tokenizer(text)][:max_length]
            if lower:
                tokens = [token.lower() for token in tokens]
            return [sos_token] + tokens + [eos_token]

        lang1, lang2 = self.config.lang1, self.config.lang2
        en_tokens = to_tokens(en_nlp, example[lang1])
        de_tokens = to_tokens(de_nlp, example[lang2])
        return {f"{lang1}_tokens": en_tokens, f"{lang2}_tokens": de_tokens}

    def numericalize_example(self, example, en_vocab, de_vocab):
        """Convert the token lists of one example into vocabulary indices.

        Args:
            example: Mapping holding ``"<lang1>_tokens"`` / ``"<lang2>_tokens"``.
            en_vocab: Vocabulary used for the ``lang1`` tokens.
            de_vocab: Vocabulary used for the ``lang2`` tokens.

        Returns:
            Dict with ``"<lang1>_ids"`` and ``"<lang2>_ids"`` lists.
        """
        lang1, lang2 = self.config.lang1, self.config.lang2
        en_ids = en_vocab.lookup_indices(example[f"{lang1}_tokens"])
        de_ids = de_vocab.lookup_indices(example[f"{lang2}_tokens"])
        return {f"{lang1}_ids": en_ids, f"{lang2}_ids": de_ids}

    def get_collate_fn(self, pad_index):
        """Return a collate function that pads each batch with ``pad_index``.

        ``pad_sequence`` is called with its default ``batch_first=False``, so
        the padded tensors are laid out as (longest sequence, batch size).
        """
        en_key = f"{self.config.lang1}_ids"
        de_key = f"{self.config.lang2}_ids"

        def collate_fn(batch):
            batch_en_ids = nn.utils.rnn.pad_sequence(
                [example[en_key] for example in batch], padding_value=pad_index
            )
            batch_de_ids = nn.utils.rnn.pad_sequence(
                [example[de_key] for example in batch], padding_value=pad_index
            )
            return {en_key: batch_en_ids, de_key: batch_de_ids}

        return collate_fn

    def get_data_loader(self, dataset, batch_size, pad_index, shuffle=False):
        """Wrap ``dataset`` in a ``DataLoader`` that pads every batch.

        Args:
            dataset: Dataset whose id columns are returned as tensors.
            batch_size: Number of examples per batch.
            pad_index: Vocabulary index used for padding.
            shuffle: Whether the loader shuffles the examples.

        Returns:
            The configured ``torch.utils.data.DataLoader``.
        """
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            collate_fn=self.get_collate_fn(pad_index),
            shuffle=shuffle,
        )

    @staticmethod
    def _load_split(path):
        """Read one JSON split file and return its records as a ``Dataset``."""
        # NOTE(review): the file is opened with the platform default encoding,
        # exactly as before. Confirm how the ingestion stage writes these
        # files before pinning an explicit encoding here.
        with open(path, 'r') as json_file:
            records = json.load(json_file)
        return Dataset.from_pandas(pd.DataFrame(records))

    def _build_vocab(self, dataset, column):
        """Build a vocabulary from the token lists in ``column`` of ``dataset``."""
        special_tokens = [
            self.UNK_TOKEN,
            self.PAD_TOKEN,
            self.config.sos_token,
            self.config.eos_token,
        ]
        return build_vocab_from_iterator(
            (row[column] for row in dataset),
            min_freq=self.MIN_FREQ,
            specials=special_tokens,
        )

    def initiate_tokenization(self):
        """Run the whole transformation and write every artifact to disk.

        Side effects:
            * ``en_vocab.pth``, ``de_vocab.pth`` and ``metadata.json`` are
              written to the current working directory.
            * ``train_dataset`` / ``valid_dataset`` / ``test_dataset`` and the
              three ``*_data_loader.pth`` files are written under
              ``config.root_dir``.
        """
        lang1, lang2 = self.config.lang1, self.config.lang2

        en_nlp = spacy.load(self.config.tokenizer_1)
        de_nlp = spacy.load(self.config.tokenizer_2)

        data_files = self.config2.data_ingestion.data_files
        train_dataset = self._load_split(data_files.train)
        valid_dataset = self._load_split(data_files.validation)
        test_dataset = self._load_split(data_files.test)

        # Tokenize using map
        tokenize_kwargs = {
            "en_nlp": en_nlp,
            "de_nlp": de_nlp,
            "max_length": self.config.max_length,
            "lower": self.config.lower,
            "sos_token": self.config.sos_token,
            "eos_token": self.config.eos_token
        }

        def tokenize_wrapper(example):
            return self.tokenize_example(example, **tokenize_kwargs)

        train_dataset = train_dataset.map(tokenize_wrapper)
        valid_dataset = valid_dataset.map(tokenize_wrapper)
        test_dataset = test_dataset.map(tokenize_wrapper)

        logger.info(f"After tokenization, type of train_data: {type(train_dataset)}")

        # Both vocabularies are built from the training split only.
        en_vocab = self._build_vocab(train_dataset, f"{lang1}_tokens")
        de_vocab = self._build_vocab(train_dataset, f"{lang2}_tokens")

        # The special tokens must share indices across both vocabularies,
        # because a single pad / unk index is used for both languages below.
        assert en_vocab[self.UNK_TOKEN] == de_vocab[self.UNK_TOKEN]
        assert en_vocab[self.PAD_TOKEN] == de_vocab[self.PAD_TOKEN]
        unk_index = en_vocab[self.UNK_TOKEN]
        pad_index = en_vocab[self.PAD_TOKEN]

        en_vocab.set_default_index(en_vocab[self.UNK_TOKEN])
        de_vocab.set_default_index(de_vocab[self.UNK_TOKEN])

        # Save only AFTER the default index is set: a vocabulary saved before
        # that point has no default index, so after reloading it any
        # out-of-vocabulary token raises instead of mapping to <unk>.
        # NOTE: these paths are relative to the current working directory,
        # not to config.root_dir; kept as-is so existing consumers still
        # find the files.
        torch.save(en_vocab, 'en_vocab.pth')
        torch.save(de_vocab, 'de_vocab.pth')

        # The "English" / "German" labels are fixed text, independent of
        # config.lang1 / config.lang2.
        logger.info(f"English vocab size: {len(en_vocab)}")
        logger.info(f"German vocab size: {len(de_vocab)}")

        with open("metadata.json", 'w') as file:
            json.dump({
                'en_vocab': len(en_vocab),
                'de_vocab': len(de_vocab),
                'pad_index': pad_index,
                'unk_index': unk_index,
            }, file, indent=4)

        vocab_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

        def numericalize_wrapper(example):
            return self.numericalize_example(example, **vocab_kwargs)

        train_dataset = train_dataset.map(numericalize_wrapper)
        valid_dataset = valid_dataset.map(numericalize_wrapper)
        test_dataset = test_dataset.map(numericalize_wrapper)

        # Return the id columns as torch tensors; keep all other columns too.
        data_type = "torch"
        format_columns = [f"{lang1}_ids", f"{lang2}_ids"]
        train_dataset = train_dataset.with_format(type=data_type, columns=format_columns, output_all_columns=True)
        valid_dataset = valid_dataset.with_format(type=data_type, columns=format_columns, output_all_columns=True)
        test_dataset = test_dataset.with_format(type=data_type, columns=format_columns, output_all_columns=True)

        logger.info(type(train_dataset[0][f"{lang2}_ids"]))

        root_dir = self.config.root_dir
        train_dataset.save_to_disk(os.path.join(root_dir, "train_dataset"))
        valid_dataset.save_to_disk(os.path.join(root_dir, "valid_dataset"))
        test_dataset.save_to_disk(os.path.join(root_dir, "test_dataset"))

        # Create dataloaders (batches); only the training loader shuffles.
        batch_size = self.config.batch_size
        train_data_loader = self.get_data_loader(train_dataset, batch_size, pad_index, shuffle=True)
        valid_data_loader = self.get_data_loader(valid_dataset, batch_size, pad_index)
        test_data_loader = self.get_data_loader(test_dataset, batch_size, pad_index)

        logger.info(f"Length of train data loader: {len(train_data_loader)}")
        logger.info(f"Length of valid data loader: {len(valid_data_loader)}")
        logger.info(f"Length of test data loader: {len(test_data_loader)}")

        # Each loader is materialised into a list of batches before saving,
        # so the saved training batches keep the shuffle order drawn here.
        torch.save(list(train_data_loader), os.path.join(root_dir, "train_data_loader.pth"))
        torch.save(list(valid_data_loader), os.path.join(root_dir, "valid_data_loader.pth"))
        torch.save(list(test_data_loader), os.path.join(root_dir, "test_data_loader.pth"))

        logger.info(f"Data loaders saved to {root_dir}")
        logger.info("Data transformation successfully completed")

# Example usage
# config = DataTransformationConfig(...)
# data_transformation = DataTransformation(config)
# data_transformation.initiate_tokenization()


In [32]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage settings, then tokenize / numericalize / batch the data.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.initiate_tokenization()
except Exception:
    # Record the failure (with traceback) through the project logger, then
    # re-raise the same exception unchanged with a bare `raise`.
    logger.exception("Data transformation failed")
    raise

[2024-05-22 21:29:50,911: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-22 21:29:50,918: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-22 21:29:50,926: INFO: common: created directory at: artifacts]
[2024-05-22 21:29:50,930: INFO: common: created directory at: artifacts/data_transformation]
[2024-05-22 21:29:50,968: INFO: common: yaml file: config\config.yaml loaded successfully]


Map: 100%|██████████| 29000/29000 [00:34<00:00, 836.21 examples/s] 
Map: 100%|██████████| 1014/1014 [00:01<00:00, 626.52 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1160.89 examples/s]

[2024-05-22 21:30:43,783: INFO: 1267986715: After tokenization, type of train_data: <class 'datasets.arrow_dataset.Dataset'>]





[2024-05-22 21:31:02,277: INFO: 1267986715: English vocab size: 5893]
[2024-05-22 21:31:02,285: INFO: 1267986715: German vocab size: 7853]


Map: 100%|██████████| 29000/29000 [00:10<00:00, 2771.17 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 2415.96 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3080.14 examples/s]

[2024-05-22 21:31:13,986: INFO: 1267986715: <class 'torch.Tensor'>]



Saving the dataset (1/1 shards): 100%|██████████| 29000/29000 [00:00<00:00, 187223.81 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1014/1014 [00:00<00:00, 40518.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 40223.87 examples/s]

[2024-05-22 21:31:14,244: INFO: 1267986715: Length of train data loader: 227]
[2024-05-22 21:31:14,244: INFO: 1267986715: Length of valid data loader: 8]
[2024-05-22 21:31:14,253: INFO: 1267986715: Length of test data loader: 8]





[2024-05-22 21:31:29,288: INFO: 1267986715: Data loaders saved to artifacts/data_transformation]
[2024-05-22 21:31:29,288: INFO: 1267986715: Data transformation successfully completed]
