In [1]:
import os
# Move up to the parent directory so that `src.*` imports and the relative
# config paths used below resolve. The guard makes the cell idempotent:
# once `src/` is visible from the working directory (after a first run, or
# when the kernel was started in the parent directory already) nothing happens.
# NOTE(review): assumes the notebook's own folder has no `src/` directory.
if not os.path.isdir("src"):
    os.chdir("../")

In [2]:
# Display the current working directory after the chdir in the previous cell.
%pwd



'c:\\Users\\lenovo\\Desktop\\Mini_Translator_2.0'

In [30]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion step.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Directory locations.
    root_dir: Path
    raw_path: Path

    # Name of the dataset to load.
    dataset_name: str

    # Output file locations, one per artefact.
    raw_data: Path
    train: Path
    valid: Path
    test: Path
    

In [31]:
from src.Mini_Translator_T.constants import *
from src.Mini_Translator_T.utils.common import read_yaml,create_directories

In [32]:
class ConfigurationManager:
    """Load the project YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is created up front, before any stage asks for it.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Creates ``root_dir`` and ``raw_path`` as a side effect. Values are
        forwarded exactly as read from the config object; no conversion to
        ``Path`` is performed here.

        Returns:
            DataIngestionConfig: settings for the data-ingestion step.
        """
        section = self.config.data_ingestion

        # Directories are created one at a time, root first.
        create_directories([section.root_dir])
        create_directories([section.raw_path])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            raw_path=section.raw_path,
            dataset_name=section.dataset_name,
            raw_data=section.data_files.raw_data,
            train=section.data_files.train,
            valid=section.data_files.validation,
            test=section.data_files.test,
        )
        
        

In [33]:
import os

import datasets
import torch
from torch.utils.data import Dataset,DataLoader, random_split

from src.Mini_Translator_T.logging import logger
from src.Mini_Translator_T.utils.common import get_size


In [7]:
# Load the 'train' split of the opus_books 'en-it' configuration.
# (The language-pair argument is a plain string: it has no placeholders,
# so the former f-string prefix was redundant.)
ds_raw = datasets.load_dataset('opus_books', 'en-it', split='train')

Downloading readme: 100%|██████████| 28.1k/28.1k [00:00<00:00, 2.83MB/s]
Downloading data: 100%|██████████| 5.73M/5.73M [00:10<00:00, 545kB/s]
Generating train split: 100%|██████████| 32332/32332 [00:01<00:00, 26358.86 examples/s]


In [27]:
# Partition the raw dataset: 90% for training, the remainder for validation.
SPLIT_SEED = 42  # fixed so that re-running the cell yields the same partition

train_ds_size = int(0.9 * len(ds_raw))  # 90% for training
val_ds_size = len(ds_raw) - train_ds_size  # 10% for validation

train_ds_raw, val_ds_raw = random_split(
    ds_raw,
    [train_ds_size, val_ds_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [29]:
# Preview a handful of training examples rather than materialising the
# whole split into the cell output.
[train_ds_raw[i] for i in range(min(3, len(train_ds_raw)))]

[{'id': '19564',
  'translation': {'en': 'She had three hours still to wait, and the memory of the incidents of their last meeting fired her blood.',
   'it': 'Mancavano ancora tre ore, e il ricordo dei particolari dell’ultimo incontro le accese il sangue.'}},
 {'id': '10359',
  'translation': {'en': 'As for the gun itself, he would not so much as touch it for several days after; but he would speak to it and talk to it, as if it had answered him, when he was by himself; which, as I afterwards learned of him, was to desire it not to kill him.',
   'it': 'Quanto allo schioppo, si guardò ben dal toccarlo per molti dì appresso. Unicamente quando si credea solo gli parlava, come se lo schioppo avesse potuto rispondergli, e seppi da poi dal medesimo Venerdì che que’ borbottamenti erano preghiere di non ammazzarlo.'}},
 {'id': '30365',
  'translation': {'en': 'He had a wife and family with grown-up sons who were pages at Court; and another family, an illegitimate one, in which there were othe

In [37]:
import os
import json
import random

class DataIngestion:
    """Download a translation dataset and persist it as JSON files.

    Three files are written: the full raw dataset (under a ``"train"`` key),
    a training subset and a validation subset. No test split is written by
    this class.

    Args:
        config: Object exposing the ``dataset_name``, ``raw_data``, ``train``
            and ``valid`` attributes (the only ones read here).
        params_filepath: YAML file read with ``read_yaml``; the loaded object
            must expose ``lang1`` and ``lang2``.
    """

    def __init__(self, config,params_filepath=PARAMS_FILE_PATH):
        self.config = config
        self.params = read_yaml(params_filepath)

    @staticmethod
    def _dump_json(payload, path):
        """Write ``payload`` to ``path`` as JSON indented by four spaces."""
        with open(path, 'w', encoding='utf-8') as json_file:
            json.dump(payload, json_file, indent=4)

    def convert_dataset_to_serializable(self, dataset):
        """Return the examples of an iterable ``dataset`` as a plain list."""
        return list(dataset)

    def initiate_data_ingestion(self, seed=None):
        """Load the dataset, save it raw, then save a 90/10 train/validation split.

        Args:
            seed: Optional integer. When given, the random split is driven by
                a ``torch.Generator`` seeded with it, so the partition is
                reproducible. When ``None`` (the default) the split uses
                torch's global random state, as before.

        Raises:
            Exception: whatever ``datasets.load_dataset`` raises is logged and
                re-raised unchanged; file-writing errors propagate directly.
        """
        logger.info("Initiating dataIngestion..")

        # Only the load is guarded, so that "incorrect dataset" is not logged
        # for unrelated failures such as an unwritable output path.
        try:
            ds_raw = datasets.load_dataset(
                self.config.dataset_name,
                f'{self.params.lang1}-{self.params.lang2}',
                split='train',
            )
        except Exception:
            # Logged at error level with the traceback, then re-raised as is.
            logger.exception("incorrect dataset")
            raise

        # Save every example of the raw dataset (the first one included).
        self._dump_json({"train": list(ds_raw)}, self.config.raw_data)

        # Splitting the dataset for training and validation.
        train_ds_size = int(0.9 * len(ds_raw))  # 90% for training
        val_ds_size = len(ds_raw) - train_ds_size  # 10% for validation

        split_kwargs = {}
        if seed is not None:
            split_kwargs["generator"] = torch.Generator().manual_seed(seed)
        train_ds_raw, val_ds_raw = random_split(
            ds_raw, [train_ds_size, val_ds_size], **split_kwargs
        )

        train_data_serializable = self.convert_dataset_to_serializable(train_ds_raw)
        valid_data_serializable = self.convert_dataset_to_serializable(val_ds_raw)

        logger.info("saving train, valid..")
        self._dump_json(train_data_serializable, self.config.train)
        self._dump_json(valid_data_serializable, self.config.valid)

        logger.info("data_ingestion successfully saved")


In [38]:
# Run the ingestion stage end to end. Exceptions are left to propagate on
# their own: the previous `except Exception as e: raise e` wrapper re-raised
# the same exception without handling it, so it added nothing.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.initiate_data_ingestion()

[2024-05-26 10:43:29,325: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-26 10:43:29,333: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-26 10:43:29,335: INFO: common: created directory at: artifacts]
[2024-05-26 10:43:29,340: INFO: common: created directory at: artifacts/data_ingestion]
[2024-05-26 10:43:29,340: INFO: common: created directory at: artifacts/data_ingestion/raw]
[2024-05-26 10:43:29,355: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-26 10:43:29,358: INFO: 1083640756: Initiating dataIngestion..]
[2024-05-26 10:43:57,040: INFO: 1083640756: saving train, valid..]
[2024-05-26 10:43:58,807: INFO: 1083640756: data_ingestion successfully saved]
