In [None]:
import os 
os.chdir("../")

In [None]:
from dataclasses import dataclass
from pathlib import Path
from Foodtimepredictor.constant import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from Foodtimepredictor.utils.common import read_yaml, create_directories, save_obj
from Foodtimepredictor import logger
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
import os

In [None]:
# Data Transformation Configuration Class
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path  # Path to the dataset (e.g., 'finalTrain.csv')
    data_transformation_preprocessing_obj: Path
    data_transformation_dir: Path  # Directory to save transformed data

In [None]:
# Configuration Manager Class
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([Path(config.root_dir), Path(config.data_transformation_dir)])
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            data_transformation_preprocessing_obj=Path(config.data_transformation_preprocessing_obj),
            data_transformation_dir=Path(config.data_transformation_dir)
        )


In [None]:
# Feature Engineering Class
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        logger.info("Feature Engineering started")

    def distance_numpy(self, df, lat1, lon1, lat2, lon2):
        # Calculate distance based on latitude and longitude
        p = np.pi / 180
        a = 0.5 - np.cos((df[lat2] - df[lat1]) * p) / 2 + \
            np.cos(df[lat1] * p) * np.cos(df[lat2] * p) * \
            (1 - np.cos((df[lon2] - df[lon1]) * p)) / 2
        return 12734 * np.arcsin(np.sqrt(a))

    def transform_data(self, df):
        df['distance'] = self.distance_numpy(
            df, 
            'Restaurant_latitude', 'Restaurant_longitude',
            'Delivery_location_latitude', 'Delivery_location_longitude'
        )

        columns_to_drop = ['Delivery_person_ID', 'Restaurant_latitude', 'Restaurant_longitude',
                           'Delivery_location_latitude', 'Delivery_location_longitude',
                           'Order_Date', 'Time_Orderd', 'Time_Order_picked']
        df.drop(columns_to_drop, axis=1, inplace=True)
        logger.info("Dropped unnecessary columns")

        return df

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame, y=None):
        transformed_df = self.transform_data(X)
        return transformed_df

# Data Transformation Class
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def get_data_transformation_obj(self):
        # Define the categories and pipelines for transformation
        Road_traffic_density = ['Low', 'Medium', 'High', 'Jam']
        Weather_conditions = ['Sunny', 'Cloudy', 'Fog', 'Sandstorms', 'Windy', 'Stormy']
        categorical_columns = ['Type_of_order', 'Type_of_vehicle', 'Festival', 'City']
        ordinal_encoder = ['Road_traffic_density', 'Weather_conditions']
        numerical_column = ['Delivery_person_Age', 'Delivery_person_Ratings', 
                            'Vehicle_condition', 'multiple_deliveries', 'distance']

        # Pipelines
        numerical_pipeline = Pipeline([
            ('impute', SimpleImputer(strategy='constant', fill_value=0)),
            ('scaler', StandardScaler())
        ])

        categorical_pipeline = Pipeline([
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ('scaler', StandardScaler())
        ])

        ordinal_pipeline = Pipeline([
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder(categories=[Road_traffic_density, Weather_conditions])),
            ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer([
            ('numerical_pipeline', numerical_pipeline, numerical_column),
            ('categorical_pipeline', categorical_pipeline, categorical_columns),
            ('ordinal_pipeline', ordinal_pipeline, ordinal_encoder)
        ])

        logger.info("Pipeline steps completed")
        return preprocessor

    def initiate_data_transformation(self, dataset: pd.DataFrame):
        # Read train and test datasets
        train_df = dataset.iloc[:len(dataset)//2]
        test_df = dataset.iloc[len(dataset)//2:]
        logger.info("Read train and test data")

        # Apply Feature Engineering
        fe_obj = FeatureEngineering()
        train_df = fe_obj.transform(train_df)
        test_df = fe_obj.transform(test_df)

        # Saving transformed data
        transformed_train_path = self.config.data_transformation_train_dir
        transformed_test_path = self.config.data_transformation_test_dir
        train_df.to_csv(transformed_train_path, index=False)
        test_df.to_csv(transformed_test_path, index=False)
        logger.info("Saved transformed train and test data")

        # Apply Data Transformation
        transformation_obj = self.get_data_transformation_obj()
        target_column = "Time_taken (min)"
        X_train = train_df.drop(columns=[target_column])
        y_train = train_df[target_column]
        X_test = test_df.drop(columns=[target_column])
        y_test = test_df[target_column]

        X_train_transformed = transformation_obj.fit_transform(X_train)
        X_test_transformed = transformation_obj.transform(X_test)

        # Save the transformation objects
        save_obj(file_path=self.config.data_transformation_preprocessing_obj, obj=transformation_obj)
        save_obj(file_path=self.config.data_transformation_dir / "fe_obj.pkl", obj=fe_obj)
        logger.info("Saved transformation objects")

        # Return the transformed data and paths
        return train_df, test_df

In [6]:
# Main execution block
try:
    # Initialize configuration manager and load configuration
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()
    
    # Initialize DataTransformation class
    data_transformation = DataTransformation(config=data_transformation_config)

    # Load the dataset
    dataset_path = data_transformation_config.data_path
    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset file not found at {dataset_path}")

    dataset = pd.read_csv(dataset_path)
    logger.info("Dataset loaded for transformation")

    # Apply Feature Engineering
    fe = FeatureEngineering()
    dataset_transformed = fe.transform(dataset)
    logger.info("Feature engineering applied")

    # Apply Data Transformation
    train_df, test_df = data_transformation.initiate_data_transformation(dataset_transformed)
    logger.info("Data transformation applied")

    # Save the transformed data
    os.makedirs(data_transformation_config.data_transformation_dir, exist_ok=True)
    train_df.to_csv(data_transformation_config.data_transformation_dir / "transformed_train.csv", index=False)
    test_df.to_csv(data_transformation_config.data_transformation_dir / "transformed_test.csv", index=False)
    logger.info("Transformed data saved")

    # Save the transformation objects (if necessary)
    # ...
    # Code for saving transformation objects
    # ...

except Exception as e:
    logger.error(f"Error in data transformation: {e}")
    raise e


[2024-01-13 13:42:34,903:INFO:common:yaml file: config\config.yaml loaded successfully]
[2024-01-13 13:42:34,905:INFO:common:yaml file: params.yaml loaded successfully]
[2024-01-13 13:42:34,907:INFO:common:yaml file: schema.yaml loaded successfully]
[2024-01-13 13:42:34,908:INFO:common:created directory at: artifacts]
[2024-01-13 13:42:34,909:INFO:common:created directory at: artifacts\data_transformation]
[2024-01-13 13:42:34,910:INFO:common:created directory at: artifacts\data_transformation\transformation]
[2024-01-13 13:42:34,985:INFO:2547875897:Dataset loaded for transformation]
[2024-01-13 13:42:34,985:INFO:1195955226:Feature Engineering started]
[2024-01-13 13:42:34,992:INFO:1195955226:Dropped unnecessary columns]
[2024-01-13 13:42:34,992:INFO:2547875897:Feature engineering applied]
[2024-01-13 13:42:34,994:INFO:1195955226:Read train and test data]
[2024-01-13 13:42:34,994:INFO:1195955226:Feature Engineering started]
[2024-01-13 13:42:34,996:ERROR:2547875897:Error in data transf

KeyError: 'Delivery_location_latitude'