In [1]:
import os 

# Step one directory up from the notebook's working directory (presumably to
# reach the project root — confirm against the repository layout).
# NOTE(review): this is relative to the *current* cwd, so re-running the cell
# moves up one more level each time; restart the kernel before re-running.
os.chdir("..")
os.getcwd()

'c:\\project\\credit_card_fault'

In [2]:
import os 
from datetime import datetime


def get_current_time_stamp():
    """Return the current local time formatted as a 14-digit ``YYYYMMDDHHMMSS`` string."""
    now = datetime.now()
    return now.strftime("%Y%m%d%H%M%S")


# Project root is taken from the current working directory, so it depends on
# the chdir performed in the first cell having run exactly once.
ROOT_DIR = os.getcwd()  # current working directory at the time this cell runs
# Timestamp captured once, when this cell executes (not refreshed afterwards).
CURRENT_TIME_STAMP = get_current_time_stamp()
# Location of the YAML configuration file: <ROOT_DIR>/configs/config.yaml
CONFIG_DIR = os.path.join(ROOT_DIR, 'configs')
CONFIG_FILE_NAME = "config.yaml"
CONFIG_FILE_PATH = os.path.join(CONFIG_DIR, CONFIG_FILE_NAME)

In [3]:
# Sanity check: show the resolved path of the YAML configuration file.
print(CONFIG_FILE_PATH)

c:\project\credit_card_fault\configs\config.yaml


In [4]:
from pathlib import Path

from pydantic import BaseModel, DirectoryPath, FilePath

class DataIngestionConfig(BaseModel):
    """Resolved settings for the data-ingestion stage, as built by ``Configuration.get_data_ingestion_config``."""
    # Source location that is handed to ``pd.read_csv`` to fetch the raw dataset.
    dataset_download_id: str
    # Where the raw dataset is stored as CSV.
    raw_data_file_path : Path
    # Where the training split is stored as CSV.
    ingested_train_file_path :Path
    # Where the test split is stored as CSV (note: named ``..._data_path``, unlike the train field).
    ingested_test_data_path : Path
    
class  TrainingPipelineConfig(BaseModel):
    """Pipeline-level settings: the artifact root directory and the pipeline name."""
    # Root directory for artifacts. pydantic's DirectoryPath only accepts a path
    # that already exists and is a directory, so it must be created beforehand.
    artifact_dir :DirectoryPath 
    # Name of the pipeline as given in the YAML configuration.
    pipeline_name : str

In [5]:
from src.creditcard.exception import *

In [6]:
from src.creditcard.logger import *

In [7]:
from src.creditcard.utils.common import read_yaml,create_directories

ModuleNotFoundError: No module named 'creditcard'

In [None]:
# Parse the YAML configuration; later cells read its sections via attribute access.
config_info = read_yaml(path_to_yaml = Path(CONFIG_FILE_PATH))

In [None]:
# Inspect the full parsed configuration.
print(config_info)

In [None]:
# Section of the configuration that describes the data-ingestion stage.
data_ingestion_info = config_info.data_ingestion_config

In [None]:
# Section of the configuration that describes the training pipeline as a whole.
training_config = config_info.training_pipeline_config
print(training_config)

In [None]:
# Artifact root: the configured artifact_dir resolved against the project root.
training_artifacts = os.path.join(ROOT_DIR, training_config.artifact_dir)
print(training_artifacts)

In [None]:
# Ask the project helper to prepare the artifact root directory.
create_directories(path_to_directories = [training_artifacts])

In [None]:
# Inspect the data-ingestion section before pulling individual keys from it.
print(data_ingestion_info)

In [None]:
# Source of the raw dataset; it is passed straight to pd.read_csv further down.
dataset_download_id = data_ingestion_info.dataset_download_id
print(dataset_download_id)

In [None]:
# Name of the ingestion stage's directory (joined under the artifact root below).
data_ingestion_dir_name = data_ingestion_info.ingestion_dir
print(data_ingestion_dir_name)

In [None]:
# Name of the sub-directory that holds the raw dataset file.
raw_data_dir = data_ingestion_info.raw_data_dir
print(raw_data_dir)

In [None]:
# File name under which the raw dataset is stored.
raw_file_name = data_ingestion_info.dataset_download_file_name
print(raw_file_name)

In [None]:
# Ingestion stage directory: <artifact root>/<ingestion_dir>.
data_ingestion_dir = os.path.join(training_artifacts,data_ingestion_dir_name)
print(data_ingestion_dir)

In [None]:
# Full path of the raw dataset file: <ingestion dir>/<raw_data_dir>/<file name>.
raw_data_file_path  = os.path.join(data_ingestion_dir, raw_data_dir, raw_file_name)
print(raw_data_file_path)

In [None]:
# Name of the sub-directory that receives the train/test split files.
ingested_dir_name = data_ingestion_info.ingested_dir
print(ingested_dir_name)
           

In [None]:
# Directory for the split files: <ingestion dir>/<ingested_dir>.
ingested_dir_path = os.path.join(data_ingestion_dir,ingested_dir_name)
print(ingested_dir_path)

In [None]:
# Full paths of the train and test split files inside the ingested directory.
ingested_train_file_path  = os.path.join(ingested_dir_path, data_ingestion_info.ingested_train_file)
ingested_test_file_path = os.path.join(ingested_dir_path, data_ingestion_info.ingested_test_file)
print(ingested_train_file_path)
print(ingested_test_file_path)

In [None]:
# Prepare the parent directories of the raw file and of the split files
# (train and test share one directory, so only the train path is used).
create_directories([os.path.dirname(raw_data_file_path), os.path.dirname(ingested_train_file_path)])

In [None]:
class Configuration:
    """Turns the project's YAML configuration into typed, per-stage config objects."""

    def __init__(self, config_file_path: Path = CONFIG_FILE_PATH) -> None:
        """Read the YAML file and resolve the pipeline-level settings.

        Args:
            config_file_path: Location of the YAML configuration file.

        Raises:
            AppException: Wraps any error raised while reading the file or
                building the training-pipeline config.
        """
        try:
            yaml_path = Path(config_file_path)
            self.config_info = read_yaml(path_to_yaml=yaml_path)
            self.pipeline_config = self.get_training_pipeline_config()
            self.time_stamp = CURRENT_TIME_STAMP
        except Exception as err:
            raise AppException(err, sys) from err

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Resolve every path used by the data-ingestion stage.

        All paths live under ``<artifact_dir>/<ingestion_dir>``. The parent
        directories of the raw file and of the split files are handed to
        ``create_directories`` before the config object is built.

        Returns:
            DataIngestionConfig holding the download source and the raw,
            train and test file paths.

        Raises:
            AppException: Wraps any error raised while resolving the settings.
        """
        try:
            section = self.config_info.data_ingestion_config
            artifact_root = self.pipeline_config.artifact_dir
            download_id = section.dataset_download_id

            # Location of the raw dataset file.
            stage_dir_name = section.ingestion_dir
            raw_dir_name = section.raw_data_dir
            raw_csv_name = section.dataset_download_file_name
            stage_dir = os.path.join(artifact_root, stage_dir_name)
            raw_csv_path = os.path.join(stage_dir, raw_dir_name, raw_csv_name)

            # Locations of the train/test split files.
            split_dir_name = section.ingested_dir
            split_dir = os.path.join(stage_dir, split_dir_name)
            train_csv_path = os.path.join(split_dir, section.ingested_train_file)
            test_csv_path = os.path.join(split_dir, section.ingested_test_file)

            # Train and test share one directory, so the train path's parent covers both.
            output_dirs = [os.path.dirname(raw_csv_path), os.path.dirname(train_csv_path)]
            create_directories(output_dirs)

            return DataIngestionConfig(
                dataset_download_id=download_id,
                raw_data_file_path=raw_csv_path,
                ingested_train_file_path=train_csv_path,
                ingested_test_data_path=test_csv_path,
            )
        except Exception as err:
            raise AppException(err, sys) from err

    def get_training_pipeline_config(self) -> TrainingPipelineConfig:
        """Resolve the pipeline-level settings and prepare the artifact root.

        The configured ``artifact_dir`` is joined onto ``ROOT_DIR`` and passed
        to ``create_directories`` before the config object is built.

        Returns:
            TrainingPipelineConfig with the artifact directory and pipeline name.

        Raises:
            AppException: Wraps any error raised while resolving the settings.
        """
        try:
            section = self.config_info.training_pipeline_config
            name = section.pipeline_name
            artifact_root = os.path.join(ROOT_DIR, section.artifact_dir)
            create_directories(path_to_directories=[artifact_root])

            pipeline_config = TrainingPipelineConfig(
                artifact_dir=artifact_root,
                pipeline_name=name,
            )
            logger.info(f"Training pipeline config: {pipeline_config}")
            return pipeline_config
        except Exception as err:
            raise AppException(err, sys) from err

In [None]:
from pathlib import Path

from pydantic import BaseModel, DirectoryPath, FilePath

class DataIngestionArtifact(BaseModel):
    """Output of the data-ingestion stage: the locations of the train and test files."""
    # pydantic's FilePath only accepts a path that already exists and is a file,
    # so both files must be written before this model is instantiated.
    train_file_path : FilePath
    test_file_path : FilePath 


In [None]:
import pandas as pd

In [None]:
# Load the raw dataset directly from the configured download source.
raw_data_frame = pd.read_csv(dataset_download_id)
           

In [None]:
# Preview the first three rows to confirm the load worked.
raw_data_frame.head(3)

In [None]:
# Persist a local copy of the raw dataset; index=False keeps the row index out of the file.
raw_data_frame.to_csv(raw_data_file_path , index=False) # writing the file to its new path

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Placeholders that the split loop in a later cell fills in.
strat_train_set = None
strat_test_set = None

# One stratified shuffle with 20% of the rows held out for testing; the fixed
# random_state makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# Exploratory only: show the index arrays the splitter yields, stratified on
# the "default.payment.next.month" column. No frames are sliced here; that
# happens in the next cell.
for train_index, test_index in split.split(raw_data_frame, raw_data_frame["default.payment.next.month"]):
    print(train_index)
    print(test_index)

In [None]:
# Build the train/test frames, stratified on the target column.
# split() yields *positional* row indices, so rows are selected with .iloc.
# The previous .loc lookup only matched because read_csv produced a default
# 0..n-1 index; it would pick wrong rows (or raise) on any other index.
for train_index, test_index in split.split(raw_data_frame, raw_data_frame["default.payment.next.month"]):
    strat_train_set = raw_data_frame.iloc[train_index]
    strat_test_set = raw_data_frame.iloc[test_index]
    

In [None]:
# Inspect the training split.
# NOTE(review): this prints the plain-text repr of the whole frame.
print(strat_train_set)

In [None]:
# Inspect the test split.
# NOTE(review): this prints the plain-text repr of the whole frame.
print(strat_test_set)

In [None]:

from src.creditcard.entity.config_entity import DataIngestionConfig
from src.creditcard.entity.artifact_entity import DataIngestionArtifact
import sys,os
from src.creditcard.exception import AppException
from src.creditcard.logger import logger
import numpy as np
import pandas as pd
#import gdown
from sklearn.model_selection import StratifiedShuffleSplit
from src.creditcard.constants import *

class DataIngestion:
    """Stage 1 of the pipeline: fetch the raw dataset and split it into train/test CSV files.

    Input:
        DataIngestionConfig providing ``dataset_download_id``,
        ``raw_data_file_path``, ``ingested_train_file_path`` and
        ``ingested_test_data_path``.

    Output:
        DataIngestionArtifact(train_file_path, test_file_path)

    The dataset is read with ``pandas.read_csv`` from ``dataset_download_id``
    and stored as CSV; the stored file is then split with a stratified shuffle.
    """

    def __init__(self, data_ingestion_config_info: DataIngestionConfig):
        """Keep the stage configuration and log the start of the experiment.

        Args:
            data_ingestion_config_info: Resolved settings for this stage.

        Raises:
            AppException: Wraps any error raised during initialisation.
        """
        try:
            self.data_ingestion_config = data_ingestion_config_info
            logger.info(f"{'>>' * 20}Experiment : base Model {'<<' * 20}")
        except Exception as e:
            # Chain the original exception, as every other handler in this class does.
            raise AppException(e, sys) from e

    def download_data(self, dataset_download_id: str, raw_data_file_path: str) -> bool:
        """Read the dataset from its source and store it as a CSV file.

        Args:
            dataset_download_id: Location handed to ``pandas.read_csv``.
            raw_data_file_path: Destination of the stored CSV copy.

        Returns:
            True once the file has been written.

        Raises:
            AppException: Wraps any error raised while reading or writing.
        """
        try:
            logger.info("Downloading dataset from github")
            raw_data_frame = pd.read_csv(dataset_download_id)
            # index=False keeps the row index out of the stored file.
            raw_data_frame.to_csv(raw_data_file_path, index=False)
            # NOTE(review): nothing is unzipped here; the message wording is
            # kept as-is so existing log output does not change.
            logger.info("Dataset unzipped successfully")

            return True

        except Exception as e:
            raise AppException(e, sys) from e

    def split_data_as_train_test(self,
                                 target_column: str = "default.payment.next.month",
                                 test_size: float = 0.2,
                                 random_state: int = 42) -> DataIngestionArtifact:
        """Split the stored raw dataset into stratified train and test CSV files.

        Args:
            target_column: Column whose class proportions are preserved in both splits.
            test_size: Fraction of rows placed in the test split.
            random_state: Seed that makes the shuffle reproducible.

        Returns:
            DataIngestionArtifact pointing at the written train and test files.

        Raises:
            AppException: Wraps any error raised while reading, splitting or writing.
        """
        try:
            logger.info(f"{'>>' * 20}Data splitting.{'<<' * 20}")
            raw_data_file_path = self.data_ingestion_config.raw_data_file_path
            train_file_path = self.data_ingestion_config.ingested_train_file_path
            test_file_path = self.data_ingestion_config.ingested_test_data_path

            logger.info(f"Reading csv file: [{raw_data_file_path}]")
            raw_data_frame = pd.read_csv(raw_data_file_path)

            logger.info("Splitting data into train and test")
            split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

            # n_splits=1 yields exactly one (train, test) pair, so take it directly.
            train_index, test_index = next(
                split.split(raw_data_frame, raw_data_frame[target_column])
            )

            # split() yields positional indices, so select rows by position
            # (.iloc) rather than by label (.loc).
            strat_train_set = raw_data_frame.iloc[train_index]
            strat_test_set = raw_data_frame.iloc[test_index]

            logger.info(f"Exporting training dataset to file: [{train_file_path}]")
            strat_train_set.to_csv(train_file_path, index=False)

            logger.info(f"Exporting test dataset to file: [{test_file_path}]")
            strat_test_set.to_csv(test_file_path, index=False)

            # Built only after both files are written.
            data_ingestion_artifact = DataIngestionArtifact(train_file_path=train_file_path,
                                                            test_file_path=test_file_path)
            logger.info(f"Data Ingestion artifact:[{data_ingestion_artifact}]")
            return data_ingestion_artifact

        except Exception as e:
            raise AppException(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the whole stage: store the raw dataset, then split it.

        Returns:
            DataIngestionArtifact produced by ``split_data_as_train_test``.

        Raises:
            AppException: Wraps any error raised by either step.
        """
        try:
            logger.info(f"{'>>' * 20}Data Ingestion started.{'<<' * 20}")
            data_ingestion_config = self.data_ingestion_config
            dataset_download_id = data_ingestion_config.dataset_download_id
            raw_data_file_path = data_ingestion_config.raw_data_file_path
            self.download_data(dataset_download_id, raw_data_file_path)

            data_ingestion_response = self.split_data_as_train_test()
            logger.info(f"{'>>' * 20}Data Ingestion artifact.{'<<' * 20}")
            logger.info(f" Data Ingestion Artifact{data_ingestion_response}")
            logger.info(f"{'>>' * 20}Data Ingestion completed.{'<<' * 20}")
            return data_ingestion_response
        except Exception as e:
            raise AppException(e, sys) from e

    def __del__(self):
        """Log a closing banner when the instance is garbage-collected."""
        logger.info(f"{'>>' * 20}Data Ingestion log completed.{'<<' * 20} \n\n")


# End-to-end run of the ingestion stage.
if __name__ == "__main__":
    # Uses the Configuration class defined in an earlier cell of this notebook.
    config = Configuration()
    # Resolve the ingestion paths and hand them to the stage.
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(data_ingestion_config)
    # Stores the raw dataset, splits it, and returns the resulting artifact.
    data_ingestion_response = data_ingestion.initiate_data_ingestion()