In [2]:
import os

In [3]:
%pwd

'd:\\IT-service_delivery_risk_predictor\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'd:\\IT-service_delivery_risk_predictor'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreprocessingConfig:
    root_dir: str
    raw_data_path: str
    pickle_save: str  
     

In [8]:
from risk_predictor.constants import *
from risk_predictor.utils.common import read_yaml,create_directories

In [10]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        config = self.config.data_preprocessing
        return DataPreprocessingConfig(
            root_dir=config.root_dir,
            raw_data_path=config.raw_data_path,
            pickle_save=config.pickle_save
            
        )    

In [11]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from risk_predictor import logger


In [16]:
class DataPreprocessing:
    def __init__(self, config: DataPreprocessingConfig):
        self.config = config
        self.scaler = StandardScaler()
        self.le = LabelEncoder()

    def load_data(self, data_path: str):
        """Load raw CSV data"""
        logger.info(f"Loading data from {data_path}")
        df = pd.read_csv(data_path)
        logger.info(f"Shape: {df.shape}")
        logger.info(f"info: \n{str(df.info())}")
        logger.info(f"describe: \n{df.describe().to_dict()}")
        logger.info(f"Missing values:\n{df.isnull().sum()}")
        logger.info(f"Target distribution:\n{df['predicted_risk'].value_counts().to_dict()}")
        return df

    def validate_duration(self, df: pd.DataFrame):
        """Validate duration calculation"""
        df["start_date"] = pd.to_datetime(df["start_date"])
        df["end_date"] = pd.to_datetime(df["end_date"])
        df['actual_duration'] = (df['end_date'] - df['start_date']).dt.days
        discrepancy = (df['actual_duration'] - df['actual_duration_days']).abs().sum()
        if discrepancy != 0:
            logger.warning(f"Discrepancy in actual_duration_days: {discrepancy}")



        """cross validate expected delivery_delay_days"""  
        df["calculated_delay"] = df["actual_duration_days"] - df["planned_duration_days"]
        if (df["delivery_delay_days"] != df["calculated_delay"]).all():
            logger.warning(f"mismatch found")
      
        return df
    

    def encode_target(self, df: pd.DataFrame):
        """Encode target labels"""
        df['predicted_risk'] = self.le.fit_transform(df['predicted_risk'])
        return df

    def drop_columns(self, df: pd.DataFrame):
        """Drop unnecessary columns"""
        df.drop(['project_id', 'start_date', 'end_date', 'delivery_delay_days','actual_duration','calculated_delay'], axis=1, inplace=True)
        return df

    def split_data(self, df: pd.DataFrame):
        """Split into train and test"""
        X = df.drop('predicted_risk', axis=1)
        y = df['predicted_risk']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        logger.info(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
        logger.info(f"Class distribution before SMOTE: {y_train.value_counts().to_dict()}")
        return X_train, X_test, y_train, y_test

    def apply_smote(self, X_train, y_train):
        """Apply SMOTE oversampling"""
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X_train, y_train)
        logger.info(f"Class distribution after SMOTE: {pd.Series(y_res).value_counts().to_dict()}")
        return X_res, y_res

    def scale_data(self, X_train, X_test):
        """Scale features"""
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        return X_train, X_test
    
    
    def save_dataset(self, X_train, X_test, y_train, y_test):
        """
        Saves the processed train and test datasets as joblib files
        """
        logger.info("ðŸ’¾ Saving train and test datasets as joblib files")
        pickle_save = self.config.pickle_save
        os.makedirs(pickle_save, exist_ok=True)

        # Paths
        x_train_path = os.path.join(pickle_save, 'X_train.joblib')
        x_test_path = os.path.join(pickle_save, 'X_test.joblib')
        y_train_path = os.path.join(pickle_save, 'y_train.joblib')
        y_test_path = os.path.join(pickle_save, 'y_test.joblib')

        # Save
        joblib.dump(X_train, x_train_path)
        joblib.dump(X_test, x_test_path)
        joblib.dump(y_train, y_train_path)
        joblib.dump(y_test, y_test_path)

        logger.info(f" Datasets saved to {pickle_save}")
        return X_train, X_test, y_train, y_test


In [17]:
try:
    config = ConfigurationManager()
    data_preprocessing_config = config.get_data_preprocessing_config()

    # Initialize class
    data_preprocessing = DataPreprocessing(config=data_preprocessing_config)

    
    df = data_preprocessing.load_data(data_preprocessing_config.raw_data_path)
    df = data_preprocessing.validate_duration(df)
    df = data_preprocessing.encode_target(df)
    df = data_preprocessing.drop_columns(df)
    X_train, X_test, y_train, y_test = data_preprocessing.split_data(df)
    X_train, y_train = data_preprocessing.apply_smote(X_train, y_train)
    X_train, X_test = data_preprocessing.scale_data(X_train, X_test)
    data_preprocessing.save_dataset(X_train, X_test, y_train, y_test)

    logger.info("Data Preprocessing pipeline executed successfully")

except Exception as e:
    logger.exception("Error occurred during Data Preprocessing stage")
    raise e

[2025-10-07 16:39:43,641: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-07 16:39:43,652: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-07 16:39:43,652: INFO: common: created directory at: artifacts]
[2025-10-07 16:39:43,656: INFO: 1942427899: Loading data from artifacts/data_ingestion/data.csv]
[2025-10-07 16:39:43,756: INFO: 1942427899: Shape: (50000, 11)]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   project_id             50000 non-null  object 
 1   start_date             50000 non-null  object 
 2   end_date               50000 non-null  object 
 3   planned_duration_days  50000 non-null  int64  
 4   actual_duration_days   50000 non-null  int64  
 5   team_size              50000 non-null  int64  
 6   num_bugs               50000 non-null  int64  
 7   num_chang

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Arya\anaconda3\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\Arya\anaconda3\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f4be' in position 44: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\IT-service_delivery_risk_predictor\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\IT-service_delivery_risk_predictor\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\IT-service_delivery_risk_predictor\venv\Lib\site-packages\ipykernel\kerne