In [2]:
import os

# Move the kernel's working directory from the notebook's folder up to the
# project root so the relative config/params paths used below resolve.
# The guard makes the cell idempotent: an unconditional os.chdir("../") climbs
# one level higher every time the cell is re-run.
# NOTE(review): assumes params.yaml sits in the project root (the run log shows
# it being loaded by a bare relative path) - confirm against PARAMS_FILE_PATH.
if not os.path.exists("params.yaml"):
    os.chdir("../")

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionManipulationConfig:
    """Immutable settings for the data ingestion / manipulation stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion_manipulation`` section of the config YAML plus
    two values from the params YAML. The values are passed through as read
    from YAML (no conversion to ``Path`` happens at construction); consumers
    wrap them in ``Path(...)`` where they need one.
    """
    # Directory of this stage; created before the config object is built.
    root_dir: Path
    # Despite the name, this is used as the DIRECTORY that holds
    # "mushrooms.csv" (it is joined with that file name when downloading/reading).
    local_data_file: Path
    # URL the raw CSV is downloaded from.
    source_URL: str
    # Despite the name, this is used as the output DIRECTORY that receives
    # train.csv, test.csv, train_pca.csv and test_pca.csv.
    save_training_file: Path
    # Name of the label column (params key TARGET); split off from the features.
    param_target_col: str
    # Seed (params key RANDOM_STATE) passed to train_test_split.
    param_random_state: int

In [5]:
from Mushroom_Classification.utils.common import create_directories, read_yaml, save_data, read_file
from Mushroom_Classification.constants import *
from Mushroom_Classification import logger

In [6]:
class ConfigurationManager:
    """Load the YAML config and params files and build per-stage config objects."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: location of the config YAML (defaults to CONFIG_FILE_PATH).
            params_filepath: location of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # `artifacts_root` is a top-level key of the config YAML. Attribute access
        # (instead of ["artifacts_root"]) works because, per the original author's
        # note, read_yaml in common.py wraps the parsed mapping in a ConfigBox.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionManipulationConfig:
        """Build the config for the data ingestion / manipulation stage.

        Creates the stage's root directory and its training-output directory,
        then packs the stage section of the config YAML together with the
        TARGET and RANDOM_STATE params into a DataIngestionManipulationConfig.

        Returns:
            DataIngestionManipulationConfig: the populated, frozen config object.
        """
        # `data_ingestion_manipulation` is another top-level key of the config YAML
        stage = self.config.data_ingestion_manipulation
        stage_keys = ("root_dir", "local_data_file", "source_URL", "save_training_file")
        settings = {key: getattr(stage, key) for key in stage_keys}

        create_directories([settings["root_dir"], settings["save_training_file"]])

        return DataIngestionManipulationConfig(
            **settings,
            param_target_col=self.params.TARGET,
            param_random_state=self.params.RANDOM_STATE,
        )

In [7]:
import urllib.request as request

In [38]:
class DataIngestionManipulation:
    """Download the raw mushroom CSV and turn it into train/test CSV files.

    The stage has two steps, meant to be called in this order:
    ``download_file`` fetches the raw CSV once, and ``preprocess_file``
    encodes, splits, scales and PCA-projects it before saving the results.
    """

    # Name of the raw CSV inside ``config.local_data_file``; used by both steps.
    DATA_FILENAME = "mushrooms.csv"

    def __init__(self, config: DataIngestionManipulationConfig):
        """Store the stage configuration.

        Args:
            config: settings for this stage (source URL, directories, target
                column name and random seed).
        """
        self.config = config

    def download_file(self) -> None:
        """Download the raw CSV unless it is already present on disk.

        The file is fetched from ``config.source_URL`` into the directory
        ``config.local_data_file`` under the name ``mushrooms.csv``. Nothing is
        returned; the only effect is the file on disk plus log messages.

        Raises:
            Whatever ``urllib.request.urlretrieve`` or the file-system calls
            raise (e.g. ``urllib.error.URLError``, ``OSError``).
        """
        file = self.DATA_FILENAME
        new_path = os.path.join(self.config.local_data_file, file)

        if os.path.exists(new_path):
            logger.info(f"File already downloaded")
            return None

        # urlretrieve does not create missing directories.
        parent_dir = os.path.dirname(new_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Announce the download before it starts, not after it has finished.
        logger.info(f"{file} is downloading!")

        # Download to a temporary name and rename on success: an interrupted
        # transfer must not leave a truncated file that the existence check
        # above would later mistake for a complete download.
        partial_path = new_path + ".part"
        try:
            request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path
            )
            os.replace(partial_path, new_path)
        finally:
            # No-op after a successful rename; removes the leftover on failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        logger.info(f"{file} downloaded to {new_path}")
        return None

    def preprocess_file(self, test_size: float = 0.2, n_components: int = 5) -> None:
        """Preprocess the raw data and save train/test files (plain and PCA).

        Steps:
          1. Read ``mushrooms.csv`` from ``config.local_data_file``.
          2. Ordinal-encode every ``object`` column (this includes the target
             column when it is categorical).
          3. Split into train/test with ``config.param_random_state``.
          4. Fit ``RobustScaler`` on the training features only and apply it
             to both splits, so no test-set statistics leak into training.
          5. Fit a PCA on the training features and project both splits.
          6. Save ``train.csv``, ``test.csv``, ``train_pca.csv`` and
             ``test_pca.csv`` into ``config.save_training_file``.

        Args:
            test_size: fraction of rows held out for the test split.
            n_components: number of principal components kept for the PCA files.

        Returns:
            None. The results are written to disk via ``save_data``.
        """
        # NOTE(review): read_file/save_data are project helpers; presumably they
        # take (directory, file name) and return / accept a DataFrame - confirm.
        raw_data = read_file(Path(self.config.local_data_file), self.DATA_FILENAME)
        target_col = self.config.param_target_col
        seed = self.config.param_random_state

        # The encoder is fitted on the full data so that both splits share one
        # category -> code mapping and the test split has no unseen categories.
        categorical_cols = raw_data.select_dtypes(include='object').columns
        if len(categorical_cols) > 0:
            encoder = OrdinalEncoder()
            raw_data[categorical_cols] = encoder.fit_transform(raw_data[categorical_cols])

        X = raw_data.drop(labels=target_col, axis=1)
        y = raw_data[target_col]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )
        # Fresh 0..n-1 indices so the column-wise concat below aligns row by row
        # (the PCA frames are created with a default RangeIndex).
        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # Scale AFTER the split: the scaler's statistics come from the training
        # rows only and are then re-used unchanged on the test rows.
        numerical_cols = X_train.select_dtypes(include='number').columns
        if len(numerical_cols) > 0:
            scaler = RobustScaler()
            X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
            X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

        train_data = pd.concat([X_train, y_train], axis=1)
        test_data = pd.concat([X_test, y_test], axis=1)

        # Seed the PCA too: its randomized solver is otherwise not reproducible.
        pca = PCA(n_components=n_components, random_state=seed)
        X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
        X_test_pca = pd.DataFrame(pca.transform(X_test))

        train_pca_data = pd.concat([X_train_pca, y_train], axis=1)
        test_pca_data = pd.concat([X_test_pca, y_test], axis=1)

        training_path = Path(self.config.save_training_file)
        outputs = (
            (train_data, "train.csv"),
            (test_data, "test.csv"),
            (train_pca_data, "train_pca.csv"),
            (test_pca_data, "test_pca.csv"),
        )
        for frame, filename in outputs:
            save_data(training_path, frame, filename)

        return None

In [39]:
# Run the stage end to end: load configuration, download the raw CSV if it is
# missing, then preprocess it into the train/test files.
try:
    config = ConfigurationManager()
    data_ingestion_manipulation_config = config.get_data_ingestion_config()
    data_ingestion_manipulation = DataIngestionManipulation(config=data_ingestion_manipulation_config)
    data_ingestion_manipulation.download_file()

    data_ingestion_manipulation.preprocess_file()

except Exception as e:
    # Record the failure with its traceback through the project logger, then
    # re-raise unchanged; a bare `raise` avoids adding an extra traceback frame.
    # NOTE(review): assumes `logger` is a standard logging.Logger - confirm.
    logger.exception(e)
    raise

[2024-06-17 23:10:59,588: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-17 23:10:59,590: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-17 23:10:59,591: INFO: common: The directory artifacts already exists]
[2024-06-17 23:10:59,591: INFO: common: The directory artifacts/data_ingestion already exists]
[2024-06-17 23:10:59,592: INFO: common: The directory artifacts/training already exists]
[2024-06-17 23:10:59,592: INFO: 3312381835: File already downloaded]
[2024-06-17 23:10:59,634: INFO: 3312381835: length X=8124]
[2024-06-17 23:10:59,645: INFO: 3312381835: length X=8124]
[2024-06-17 23:10:59,648: INFO: 3312381835: length X=6499]
[2024-06-17 23:10:59,650: INFO: 3312381835: length train_data=6499]
[2024-06-17 23:10:59,654: INFO: 3312381835: length train_pca=6499]
[2024-06-17 23:10:59,655: INFO: 3312381835: length X_train_pca=6499]
[2024-06-17 23:10:59,656: INFO: 3312381835: length y_train=6499]
[2024-06-17 23:10:59,656: INFO: 3312381835: le