In [1]:
import os
# Move the kernel's working directory one level up from wherever it currently is.
# The target is relative, so this cell is not idempotent: every re-run without a
# kernel restart climbs one more directory.
# NOTE(review): presumably this is meant to land on the project root so that the
# `src.Mini_Translator` imports in later cells resolve -- confirm the notebook's location.
os.chdir("../")

In [2]:
%pwd



'c:\\Users\\lenovo\\Desktop\\Mini_Translator'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion stage.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance has been built. The annotations are declarative only: nothing
    here converts or validates the values passed to the constructor.
    """

    # Directory handed to `create_directories` before ingestion runs.
    root_dir: Path
    # Second directory handed to `create_directories` before ingestion runs.
    raw_path: Path
    # Identifier passed to `datasets.load_dataset`.
    dataset_name: str
    # File that receives the JSON dump containing all three splits.
    raw_data: Path
    # File that receives the JSON dump of the "train" split.
    train: Path
    # File that receives the JSON dump of the "validation" split.
    valid: Path
    # File that receives the JSON dump of the "test" split.
    test: Path
    

In [26]:
from src.Mini_Translator.constants import *
from src.Mini_Translator.utils.common import read_yaml,create_directories

In [5]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the config and params files are read with ``read_yaml``
    and the directory named by ``config.artifacts_root`` is created via
    ``create_directories``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates ``root_dir`` and ``raw_path`` as a side effect.

        Returns:
            DataIngestionConfig whose path-like fields are ``Path`` objects.
        """
        config = self.config.data_ingestion

        # Passed through untouched so the helper sees exactly what the YAML holds.
        create_directories([config.root_dir, config.raw_path])

        data_files = config.data_files

        # The YAML yields plain values, while DataIngestionConfig declares these
        # fields as Path. Coerce here so the object matches its annotations;
        # Path(...) of an existing Path is a no-op copy.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            raw_path=Path(config.raw_path),
            dataset_name=config.dataset_name,
            raw_data=Path(data_files.raw_data),
            train=Path(data_files.train),
            valid=Path(data_files.validation),
            test=Path(data_files.test),
        )
        
        

In [6]:
import os
from src.Mini_Translator.logging import logger
from src.Mini_Translator.utils.common import get_size
import datasets

  from .autonotebook import tqdm as notebook_tqdm


[2024-05-21 17:16:01,452: INFO: config: PyTorch version 2.3.0 available.]
[2024-05-21 17:16:01,461: INFO: config: TensorFlow version 2.16.1 available.]


In [10]:
# dataset=datasets.load_dataset("bentrevett/multi30k")

In [None]:
# dataset

In [23]:
# import json

# # Convert the dataset to a serializable format (e.g., nested dictionaries and lists)
# serializable_dataset = {
#     "train": list(dataset["train"]),
#     "test": list(dataset["test"]),
#     "validation": list(dataset["validation"])
# }

# # Save the serializable dataset to a JSON file
# with open('dataset.json', 'w') as json_file:
#     json.dump(serializable_dataset, json_file, indent=4)


In [19]:
# for example in dataset['train']:
#     print(example)
#     break

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}


In [24]:
import os
import json

class DataIngestion:
    """Download a dataset and persist its splits as JSON files.

    ``config`` must expose ``dataset_name`` plus the output locations
    ``raw_data``, ``train``, ``valid`` and ``test``.
    """

    def __init__(self, config):
        """Keep a reference to the ingestion settings."""
        self.config = config

    def convert_dataset_to_serializable(self, dataset):
        """Materialise an iterable of examples into a plain list."""
        return list(dataset)

    @staticmethod
    def _write_json(payload, destination):
        """Dump ``payload`` to ``destination`` as indented JSON, creating parent dirs."""
        parent = os.path.dirname(os.fspath(destination))
        if parent:
            # Only some directories are created upstream; make sure this one exists.
            os.makedirs(parent, exist_ok=True)
        with open(destination, "w", encoding="utf-8") as json_file:
            json.dump(payload, json_file, indent=4)

    def initiate_data_ingestion(self):
        """Load the configured dataset and write the combined and per-split JSON files.

        The "train", "test" and "validation" splits are each materialised once
        and reused for both the combined dump and the individual files.

        Raises:
            Exception: whatever ``datasets.load_dataset`` raised, re-raised unchanged.
            KeyError: if one of the three expected splits is missing
                (assuming the loaded object behaves like a mapping).
        """
        logger.info("Initiating dataIngestion..")
        try:
            dataset = datasets.load_dataset(self.config.dataset_name)
        except Exception:
            # The failure may be a bad name, a network error, etc.; log the real
            # cause with its traceback and re-raise without altering it.
            logger.exception("failed to load dataset %r", self.config.dataset_name)
            raise

        train_records = self.convert_dataset_to_serializable(dataset["train"])
        test_records = self.convert_dataset_to_serializable(dataset["test"])
        valid_records = self.convert_dataset_to_serializable(dataset["validation"])

        # Combined dump: same keys and key order as before.
        self._write_json(
            {
                "train": train_records,
                "test": test_records,
                "validation": valid_records,
            },
            self.config.raw_data,
        )

        logger.info("saving train, valid, test dataset..")
        self._write_json(train_records, self.config.train)
        self._write_json(valid_records, self.config.valid)
        self._write_json(test_records, self.config.test)

        logger.info("data_ingestion successfully saved")


In [25]:
# Run the ingestion stage end to end: load settings, build the stage config,
# then download and save the dataset. No try/except wrapper is used because a
# handler that only re-raises adds nothing; any exception propagates as-is and
# the notebook shows the original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.initiate_data_ingestion()

[2024-05-21 17:30:08,979: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-21 17:30:08,984: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-21 17:30:08,987: INFO: common: created directory at: artifacts]
[2024-05-21 17:30:08,991: INFO: common: created directory at: artifacts/data_ingestion]
[2024-05-21 17:30:08,994: INFO: 3451821151: Initiating dataIngestion..]
[2024-05-21 17:30:20,162: INFO: 3451821151: saving train, valid, test dataset..]
[2024-05-21 17:30:20,884: INFO: 3451821151: data_ingestion successfully saved]
