In [1]:
import os

# Work from the project root so that `src.*` imports and the relative
# `artifacts/...` paths used below resolve.
# The guard keeps this cell idempotent: an unconditional chdir("../") climbs one
# more directory every time the cell is re-run.
# Assumes the project root is the directory that contains the `src` package.
if not os.path.isdir("src"):
    os.chdir("../")

In [7]:
# Sanity check: show the current working directory after the chdir above.
%pwd

'c:\\Users\\Ansh Lulla\\VS-Code\\MLOps-Practice'

In [48]:
import os
import pandas as pd
from src.utils.utils import *
from src.constants import *
from dataclasses import dataclass
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from src.logging import logging

In [49]:
@dataclass
class DataTransformationConfig:
    """Filesystem locations used by the data-transformation stage."""

    # Stage directory; ConfigurationManager creates it before building this config.
    root_dir: Path
    # CSV file that DataTransformation.read_data loads with pandas.
    data_file: Path
    # Despite the name, DataTransformation.save_data treats this as a directory:
    # it is created with os.makedirs and "processed_data.csv" is written inside it.
    save_file: Path

In [50]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Both YAML files are parsed with ``read_yaml`` at construction time, and the
    directory named by the config's ``artifacts_root`` entry is created through
    ``create_directories``.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH):
        """Parse the config and params files and create the artifacts root.

        Args:
            config_file_path: Location of the main YAML config file.
            params_file_path: Location of the YAML parameters file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        The stage's ``root_dir`` is created as a side effect before the config
        object is returned.

        Returns:
            A DataTransformationConfig whose three entries are wrapped in Path.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_file=Path(section.data_file),
            save_file=Path(section.save_file),
        )

In [30]:
# Load the ingested dataset for exploration; the path is relative to the
# current working directory. `df` is reused by the exploratory cells below.
df = pd.read_csv("artifacts/data_ingestion/data/data.csv")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [31]:
# One-hot encode every categorical feature. The target "y" is excluded by name
# rather than by position, so the result does not depend on "y" being the last
# object-typed column.
categorical_cols = df.select_dtypes(include=["object"]).columns.drop("y")
new_df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# Item assignment (not attribute assignment) so the column is always the thing
# being written.
new_df["y"] = df["y"].map({"no": 0, "yes": 1})
# Cast the whole frame to int so the indicator columns (boolean in recent
# pandas) become 0/1; columns that are already integers keep their values.
new_df = new_df.astype(int)
new_df.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [32]:
# Inspect column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [33]:
# Columns of the raw frame stored as int64; these are the ones standardized in
# the next cell.
num_cols = df.select_dtypes(include=["int64"]).columns
num_cols

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')

In [34]:
# Standardize the int64 columns of the raw frame inside the encoded frame.
# The column list comes from `df` because `new_df` has been cast to int as a whole.
num_cols = df.select_dtypes(include=["int64"]).columns
scaler = StandardScaler().fit(new_df[num_cols])
new_df[num_cols] = scaler.transform(new_df[num_cols])
new_df.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0.256419,-1.298476,0.011016,-0.569351,-0.411453,-0.25194,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0.288529,-0.437895,-1.298476,-0.416127,-0.569351,-0.411453,-0.25194,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,-0.747384,-0.446762,-1.298476,-0.707361,-0.569351,-0.411453,-0.25194,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,0.571051,0.047205,-1.298476,-0.645231,-0.569351,-0.411453,-0.25194,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,-0.747384,-0.447091,-1.298476,-0.23362,-0.569351,-0.411453,-0.25194,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [None]:
class DataTransformation:
    """Turn the ingested CSV into an encoded, standardized table and save it.

    Steps: read the CSV named by ``config.data_file``, one-hot encode the
    categorical columns, map the target column to 0/1, standardize the int64
    feature columns, and write ``processed_data.csv`` inside
    ``config.save_file`` (which is used as a directory).
    """

    # Label column and the mapping applied to its raw string values.
    TARGET_COLUMN = "y"
    TARGET_MAPPING = {"no": 0, "yes": 1}

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (paths for input and output)."""
        self.config = config

    def read_data(self) -> pd.DataFrame:
        """Read the input CSV at ``config.data_file``.

        Returns:
            The parsed DataFrame.

        Raises:
            Exception: whatever ``pd.read_csv`` raises, after it is logged.
        """
        try:
            df = pd.read_csv(self.config.data_file)
            logging.info(f"Data read successfully from {self.config.data_file}")
            return df
        except Exception as e:
            logging.error(f"Error reading data from {self.config.data_file}: {e}")
            raise

    def one_hot_encode(self, df: pd.DataFrame) -> pd.DataFrame:
        """One-hot encode the categorical features and binarize the target.

        Every object-typed column except the target is expanded with
        ``pd.get_dummies(drop_first=True)``; the target is mapped through
        ``TARGET_MAPPING``; the whole frame is then cast to int.

        Args:
            df: Raw frame containing the target column. It is not modified.

        Returns:
            A new, all-integer DataFrame.

        Raises:
            ValueError: if the target holds a value missing from TARGET_MAPPING.
            Exception: any other error from pandas, after it is logged.
        """
        target = self.TARGET_COLUMN
        try:
            # Exclude the target by name, not by position, so the result does
            # not depend on it being the last object-typed column.
            object_cols = df.select_dtypes(include=["object"]).columns
            categorical_cols = [col for col in object_cols if col != target]

            new_df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
            if new_df is df:
                # Defensive: never write the encoded target into the caller's frame.
                new_df = df.copy()

            labels = df[target].map(self.TARGET_MAPPING)
            unmapped = labels.isna()
            if unmapped.any():
                # Fail with a clear message instead of an opaque NaN-to-int cast error.
                unexpected = df.loc[unmapped, target].unique().tolist()
                raise ValueError(
                    f"Unexpected values in target column '{target}': {unexpected}"
                )

            new_df[target] = labels
            new_df = new_df.astype(int)
            logging.info("One Hot Encoded the categorical columns successfully.")
            return new_df
        except Exception as e:
            logging.error(f"Error while creating one hot labels: {e}")
            raise

    def normalize_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        """One-hot encode ``df`` and standardize its int64 feature columns.

        Note that this method calls ``one_hot_encode`` itself, so it expects
        the raw frame, not an already-encoded one.

        Args:
            df: Raw frame as returned by ``read_data``.

        Returns:
            The encoded frame with the raw frame's int64 columns standardized.

        Raises:
            Exception: any error from encoding or scaling, after it is logged.
        """
        try:
            # Take the column list from the raw frame: after encoding every
            # column is an integer, so dtype alone can no longer tell features
            # from indicator columns.
            num_cols = df.select_dtypes(include=["int64"]).columns
            new_df = self.one_hot_encode(df)

            if len(num_cols) == 0:
                logging.info("No int64 columns found; skipping standardization.")
                return new_df

            # NOTE(review): the scaler is fitted on the whole frame and is not
            # kept. If a train/test split happens downstream, confirm this does
            # not leak test-set statistics.
            scaler = StandardScaler()
            new_df[num_cols] = scaler.fit_transform(new_df[num_cols])
            logging.info("Standardized numerical columns successfully.")
            return new_df
        except Exception as e:
            logging.error(f"Error while trying to standardize features: {e}")
            raise

    def save_data(self, df: pd.DataFrame) -> None:
        """Write ``df`` to ``processed_data.csv`` inside ``config.save_file``.

        ``config.save_file`` is treated as a directory and created if missing.

        Args:
            df: Frame to persist; the index is not written.

        Raises:
            Exception: any filesystem or serialization error, after it is logged.
        """
        try:
            os.makedirs(self.config.save_file, exist_ok=True)
            file_path = os.path.join(self.config.save_file, "processed_data.csv")
            df.to_csv(file_path, index=False)
            logging.info(f"Transformed Data saved successfully to {file_path}")
        except Exception as e:
            logging.error(f"Error saving data to {self.config.save_file}: {e}")
            raise

In [52]:
# Run the stage end to end: load config -> read CSV -> encode + standardize -> save.
# Note: this rebinds the notebook-level `df` and `new_df` used by the
# exploratory cells above.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(data_transformation_config)
df = data_transformation.read_data()
# normalize_cols one-hot encodes internally, so it receives the raw frame.
new_df = data_transformation.normalize_cols(df)
data_transformation.save_data(new_df)

[2025-07-27 05:04:20,352: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2025-07-27 05:04:20,353: INFO: utils: yaml file: params.yaml loaded successfully]
[2025-07-27 05:04:20,354: INFO: utils: Created Directory at: artifacts]
[2025-07-27 05:04:20,356: INFO: utils: Created Directory at: artifacts/data_transformation]
[2025-07-27 05:04:20,426: INFO: 518047259: Data read successfully from artifacts\data_ingestion\data\data.csv]
[2025-07-27 05:04:20,466: INFO: 518047259: One Hot Encoded the categorical columns successfully.]
[2025-07-27 05:04:20,474: INFO: 518047259: Standardized numerical columns successfully.]
[2025-07-27 05:04:20,987: INFO: 518047259: Transformed Data saved successfully to artifacts\data_transformation\data\processed_data.csv]
