# **Feature Engineering**

In [1]:
import logging
import pandas as pd
import os
import sys

# Make the project's 'scripts' directory importable. The path is resolved
# relative to the current working directory, so the notebook is expected to be
# run from its own folder.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_dir not in sys.path:
    # Guard so that re-running this cell does not pile up duplicate entries.
    sys.path.append(scripts_dir)

# Import the data-loading helper. A failure is printed here and recorded in
# `logger_initialized`; despite its name, that flag tracks whether `load_data`
# was imported, and a later cell logs the outcome.
try:
    from data_loader import load_data
except ImportError as e:
    logger_initialized = False
    print(f"Error importing 'load_data': {e}")
else:
    logger_initialized = True

# Raise pandas display limits so wide/long frames are shown with up to
# 200 columns and 200 rows before truncation.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [2]:
# Configure logging
def setup_logger(name: str = 'my_logger') -> logging.Logger:
    """
    Return the named logger, set to INFO level with its own StreamHandler.

    The handler is attached only if the logger has no handlers of its own, so
    calling this function repeatedly (for example when a notebook cell is
    re-run) does not stack duplicate handlers.

    Parameters:
    -----------
    name : str
        The name of the logger.

    Returns:
    --------
    logging.Logger
        Configured logger instance.

    Notes:
    ------
    Propagation is left at its current setting. With the logging default
    (propagate=True), records are also passed to any handlers configured on
    ancestor loggers such as the root logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Inspect this logger's own handler list. `hasHandlers()` would also report
    # handlers attached to ancestor loggers (e.g. a configured root logger),
    # which would skip attaching the handler and formatter below.
    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(stream_handler)

    return logger

# Build the notebook-wide logger and note that the setup cells have run.
logger = setup_logger()
logger.info("Imported necessary libraries.")

# Report the outcome of the guarded 'load_data' import from the first cell.
if not logger_initialized:
    logger.warning("'load_data' module could not be imported. Check the 'scripts' directory and file availability.")
else:
    logger.info("'load_data' module imported successfully.")

2025-01-24 14:16:53,186 - INFO - Imported necessary libraries.
2025-01-24 14:16:53,187 - INFO - 'load_data' module imported successfully.


In [3]:
# Read the raw dataset and log whether any rows came back.
logger.info("🟢 Starting the data loading process...")
df = load_data('../data/data.csv')
if df.empty:
    logger.warning("⚠️ Data loading completed, but the dataset is empty.")
else:
    logger.info(f"✅ Data loaded successfully! The dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

2025-01-24 14:16:53,195 - INFO - 🟢 Starting the data loading process...
2025-01-24 14:16:53,427 - INFO - ✅ Data loaded successfully! The dataset contains 95662 rows and 15 columns.


Data successfully loaded from '../data/data.csv' with 95662 rows and 15 columns.


In [4]:
# Import the feature-engineering helper class.
# NOTE(review): `feature_engineering` is a project module; presumably it is
# resolved through the 'scripts' directory added to sys.path in the first
# cell — confirm.
from feature_engineering import FeatureEngineering

# Instantiate the FeatureEngineering class (no constructor arguments);
# the pipeline cell below calls its methods on this instance.
feature_engineer = FeatureEngineering()

In [5]:
# Identify columns to exclude and categorical columns to encode
cols_to_drop = ['ProductId', 'BatchId', 'AccountId', 'ProviderId', 'SubscriptionId', 'Value', 'CountryCode', 'CurrencyCode']
cat_features = ['ProductCategory', 'ChannelId']

# Drop unnecessary columns.
# `df` is rebound (rather than mutated with inplace=True) and only columns that
# are still present are dropped, so re-running this cell is a no-op instead of
# raising KeyError for columns removed by the previous run.
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

In [12]:
if __name__ == '__main__':
    # Run the feature-engineering stages in sequence; each stage consumes the
    # frame produced by the previous one.
    print("🟢 Starting feature engineering process...")
    print("===============================================")

    # Create aggregate features, working on a copy of `df` with its index
    # moved into a column by reset_index().
    df_copy = df.copy().reset_index()
    agg_features = feature_engineer.create_aggregate_features(df_copy)
    print("Aggregate features created.")
    print("===============================================")
    # Extract time features
    df_with_time_features = feature_engineer.extract_time_features(agg_features)
    print("Time features extracted.")
    print("===============================================")
    # Encode categorical features
    df_encoded = feature_engineer.encode_categorical_features(df_with_time_features, cat_features)
    print("Categorical features encoded.")
    print("===============================================")
    # Handle missing values
    df_cleaned = feature_engineer.handle_missing_values(df_encoded)
    print("Missing values handled.")
    print("===============================================")
    # Normalize numerical features.
    # Both the column selection and the normalization use `df_cleaned`, the
    # output of the missing-value step; using `df_encoded` here would discard
    # that step whenever `handle_missing_values` returns a new frame.
    numeric_cols = df_cleaned.select_dtypes(include='number').columns
    exclude_cols = ['Amount', 'FraudResult']  # numeric columns left on their original scale
    numeric_cols = numeric_cols.difference(exclude_cols)

    df_normalized = feature_engineer.normalize_numerical_features(df_cleaned, numeric_cols, method='standardize')
    print("✅ Numerical features normalized.")
    print("===============================================")


🟢 Starting feature engineering process...
Aggregate features created.
Time features extracted.
Categorical features encoded.
Missing values handled.
✅ Numerical features normalized.


In [7]:
# Display the normalized feature frame. As the cell's last expression it is
# rendered as a table, truncated by pandas according to the display options.
df_normalized

Unnamed: 0_level_0,CustomerId,ProductCategory,ChannelId,Amount,TransactionStartTime,PricingStrategy,FraudResult,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Hour,Transaction_Day,Transaction_Month,Transaction_Year
TransactionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TransactionId_76871,CustomerId_4406,-0.799047,0.746738,1000.0,2018-11-15 02:18:49+00:00,-0.349252,0.0,0.170118,-0.067623,-0.311831,-0.168551,-2.155530,-0.100739,0.848684,-0.994246
TransactionId_73770,CustomerId_4406,0.491064,-1.175144,-20.0,2018-11-15 02:19:08+00:00,-0.349252,0.0,0.170118,-0.067623,-0.311831,-0.168551,-2.155530,-0.100739,0.848684,-0.994246
TransactionId_26203,CustomerId_4683,-0.799047,0.746738,500.0,2018-11-15 02:44:21+00:00,-0.349252,0.0,0.165122,-0.072568,-0.444993,-0.202748,-2.155530,-0.100739,0.848684,-0.994246
TransactionId_380,CustomerId_988,4.361398,0.746738,20000.0,2018-11-15 03:32:55+00:00,-0.349252,0.0,0.175567,-0.008155,-0.404020,-0.009754,-1.949214,-0.100739,0.848684,-0.994246
TransactionId_28195,CustomerId_988,0.491064,-1.175144,-644.0,2018-11-15 03:34:21+00:00,-0.349252,0.0,0.175567,-0.008155,-0.404020,-0.009754,-1.949214,-0.100739,0.848684,-0.994246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TransactionId_89881,CustomerId_3078,0.491064,-1.175144,-1000.0,2019-02-13 09:54:09+00:00,-0.349252,0.0,0.276904,-0.028743,0.204885,0.050775,-0.711319,-0.323884,-0.874040,1.005787
TransactionId_91597,CustomerId_3874,-0.799047,0.746738,1000.0,2019-02-13 09:54:25+00:00,-0.349252,0.0,0.167759,-0.062526,-0.398330,-0.177178,-0.711319,-0.323884,-0.874040,1.005787
TransactionId_82501,CustomerId_3874,0.491064,-1.175144,-20.0,2019-02-13 09:54:35+00:00,-0.349252,0.0,0.167759,-0.062526,-0.398330,-0.177178,-0.711319,-0.323884,-0.874040,1.005787
TransactionId_136354,CustomerId_1709,3.716343,0.746738,3000.0,2019-02-13 10:01:10+00:00,-0.349252,0.0,0.204153,-0.059427,0.149116,-0.166689,-0.505004,-0.323884,-0.874040,1.005787


In [8]:
# List the column labels of the normalized frame to confirm which features it contains
df_normalized.columns

Index(['CustomerId', 'ProductCategory', 'ChannelId', 'Amount',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'Total_Transaction_Amount', 'Average_Transaction_Amount',
       'Transaction_Count', 'Std_Transaction_Amount', 'Transaction_Hour',
       'Transaction_Day', 'Transaction_Month', 'Transaction_Year'],
      dtype='object')

In [13]:
# Write the normalized feature table to CSV.
# NOTE(review): index=False leaves the frame's index out of the file; the
# displayed frame appears to be indexed by TransactionId, so that identifier
# is not written — confirm this is intended.
df_normalized.to_csv(path_or_buf='../data/extracted_features.csv', index=False)
print("✅ Extracted features saved to '../data/extracted_features.csv'.")

✅ Extracted features saved to '../data/extracted_features.csv'.


### **Feature Engineering Process**

The feature engineering process involves several key steps to prepare the dataset for analysis and model training:

#### **Steps Involved**

1. **Encoding Categorical Variables**:
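   - Categorical variables (`ProductCategory`, `ChannelId`) were converted into a numerical format suitable for machine learning algorithms. Note that each of them remains a single numeric column in the output above, which is consistent with label (ordinal) encoding rather than one-hot encoding; one-hot encoding would have produced one column per category.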

2. **Standardizing Numerical Features**:
   - Numerical features were standardized using the `StandardScaler`. This ensures consistency in scale across features, which is crucial for many machine learning models.

3. **Handling Missing Values**:
   - During the feature engineering process, the new feature `Std_Transaction_Amount` was found to have **712 missing values**. To ensure data completeness, these missing values were imputed with the mean of the feature.

#### **Summary**

These steps improve the quality of the dataset, making it more suitable for further analysis and predictive modeling. Proper encoding, scaling, and handling of missing values are essential for building effective machine learning models.