# **Modelling**

### **Prepare the Data**
+ Combine all datasets to enhance predictive power and insights.

In [1]:
import logging
import pandas as pd
import os
import sys

# Make the project's 'scripts' directory importable. The membership check keeps
# this cell idempotent: re-running it must not append the same path again.
SCRIPTS_DIR = os.path.abspath(os.path.join('..', 'scripts'))
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

# Import the project-local `load_data` helper from the data_loader module.
# NOTE: despite its name, `logger_initialized` records whether this import
# succeeded; the name is kept because a later cell reads it to log the outcome.
# The logger does not exist yet at this point, hence the plain print().
try:
    from data_loader import load_data
    logger_initialized = True
except ImportError as e:
    logger_initialized = False
    print(f"Error importing 'load_data': {e}")

# Set pandas display options for better visibility of wide/long frames
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [2]:
# Configure logging
def setup_logger(name: str = 'my_logger') -> logging.Logger:
    """
    Set up a logger with INFO level and a single formatted StreamHandler.

    Calling this function repeatedly with the same name (e.g. when the cell is
    re-run) returns the same logger without stacking additional handlers.

    Parameters:
    -----------
    name : str
        The name of the logger.

    Returns:
    --------
    logging.Logger
        Configured logger instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Prevent duplicate handlers. Inspect this logger's own handler list:
    # `hasHandlers()` also returns True when only an ancestor (e.g. the root
    # logger configured by the host environment) has handlers, which would
    # silently skip attaching the handler and formatter below.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # This logger emits through its own handler; do not also hand records to
    # ancestor handlers, otherwise every message is printed twice whenever the
    # root logger is configured.
    logger.propagate = False

    return logger

# Create the notebook-wide logger and confirm the setup cells ran
logger = setup_logger()
logger.info("Imported necessary libraries.")

# Report the outcome of the guarded 'load_data' import from the first cell
if not logger_initialized:
    logger.warning("'load_data' module could not be imported. Check the 'scripts' directory and file availability.")
else:
    logger.info("'load_data' module imported successfully.")

2025-01-27 16:56:20,344 - INFO - Imported necessary libraries.
2025-01-27 16:56:20,345 - INFO - 'load_data' module imported successfully.


In [3]:
# Load the engineered feature table through the project's load_data helper.
# The emoji in the messages below were mis-encoded (UTF-8 bytes decoded with a
# legacy code page); they are restored to the intended characters.
logger.info("🟢 Starting the data loading process...")
df = load_data('../data/extracted_features.csv')

# Guard against a None result as well as an empty frame, so a failed load is
# reported as a warning instead of raising AttributeError on `df.empty`.
if df is not None and not df.empty:
    logger.info(f"✅ Data loaded successfully! The dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
else:
    logger.warning("⚠️ Data loading completed, but the dataset is empty.")

2025-01-27 16:56:20,354 - INFO - 🟢 Starting the data loading process...
2025-01-27 16:56:20,623 - INFO - ✅ Data loaded successfully! The dataset contains 95662 rows and 14 columns.


Data successfully loaded from '../data/extracted_features.csv' with 95662 rows and 14 columns.


In [4]:
# Read the same CSV directly with pandas; the following cells work on this
# frame rather than on `df` returned by load_data in the previous cell.
# NOTE(review): the load_data log reports 14 columns while the column listing
# of this frame shows 15 — presumably load_data consumes one column (e.g. as
# the index); confirm, and consider keeping a single load of the file.
df_normalized = pd.read_csv('../data/extracted_features.csv')

In [5]:
# List the columns of the frame that is passed to the RFM step and merged later
df_normalized.columns

Index(['CustomerId', 'ProductCategory', 'ChannelId', 'Amount',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'Total_Transaction_Amount', 'Average_Transaction_Amount',
       'Transaction_Count', 'Std_Transaction_Amount', 'Transaction_Hour',
       'Transaction_Day', 'Transaction_Month', 'Transaction_Year'],
      dtype='object')

In [18]:
# CreditScoreRFM is a project-local class; credit_scoring_model is presumably
# found via the 'scripts' directory added to sys.path earlier — TODO confirm.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from credit_scoring_model import CreditScoreRFM

# Initialize the CreditScoreRFM object with the normalized DataFrame.
# reset_index() turns the current index into a regular column, so the object
# receives one more column than df_normalized itself has.
rfm = CreditScoreRFM(df_normalized.reset_index())

In [7]:
# Run the RFM computation on the object built in the previous cell
rfm_df = rfm.calculate_rfm()

# Print a short banner, then show the customer id and the three RFM columns
# for the first five rows as the cell's rich output
print(
    "Calculated RFM metrics for the first five customers:",
    "=========================================================",
    sep="\n",
)
rfm_df.loc[:, ['CustomerId', 'Recency', 'Frequency', 'Monetary']].head()

Calculated RFM metrics for the first five customers:


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_4406,0,119,109921.75
2,CustomerId_4683,81,2,1000.0
3,CustomerId_988,5,38,228727.2
5,CustomerId_1432,90,1,2000.0
6,CustomerId_2858,5,29,93400.0


In [None]:
# Merge the transaction-derived features with the RFM metrics, one row per customer.

# Mapping from the textual risk label to its binary encoding
risk_label_mapping = {'Good': 1, 'Bad': 0}

# Convert Risk_Label to binary values. Values that are not keys of the mapping
# (i.e. labels already encoded by a previous run of this cell) are passed
# through unchanged, so re-running the cell no longer turns every label into
# NaN the way a plain `.map(risk_label_mapping)` does on its second run.
rfm_df['Risk_Label'] = rfm_df['Risk_Label'].map(
    lambda label: risk_label_mapping.get(label, label)
)

# RFM columns to attach; de-duplicated per customer so the merge below cannot
# fan out rows if rfm_df carries more than one row for a CustomerId.
rfm_data = (
    rfm_df[['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Risk_Label']]
    .drop_duplicates(subset='CustomerId', keep='first')
)

# Extract unique rows: keep the first record seen for each customer
df_normalized_unique = df_normalized.drop_duplicates(subset='CustomerId', keep='first')

# Left-merge, index by CustomerId and drop the raw timestamp column. Built as a
# single chain instead of in-place mutation so the cell is safe to re-run.
features = (
    pd.merge(df_normalized_unique, rfm_data, on='CustomerId', how='left')
    .set_index('CustomerId')
    .drop(columns=['TransactionStartTime'])
)

# Both merge inputs are unique per customer, so the index must be unique too.
assert features.index.is_unique, "expected exactly one feature row per CustomerId"

# Take the target from the merged frame so it shares the CustomerId index and
# row order of `features`; rfm_df['Risk_Label'] carries rfm_df's own index and
# is not guaranteed to line up with `features` row for row.
target = features['Risk_Label']

In [None]:
# Preview the first ten rows of the merged, CustomerId-indexed feature table
features.head(10)

Unnamed: 0_level_0,ProductCategory,ChannelId,Amount,PricingStrategy,FraudResult,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Hour,Transaction_Day,Transaction_Month,Transaction_Year,Recency,Frequency,Monetary,Risk_Label
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
CustomerId_4406,-0.799047,0.746738,1000.0,-0.349252,0.0,0.170118,-0.067623,-0.311831,-0.168551,-2.15553,-0.100739,0.848684,-0.994246,2176,119,109921.75,1
CustomerId_4683,-0.799047,0.746738,500.0,-0.349252,0.0,0.165122,-0.072568,-0.444993,-0.202748,-2.15553,-0.100739,0.848684,-0.994246,2256,2,1000.0,0
CustomerId_988,4.361398,0.746738,20000.0,-0.349252,0.0,0.175567,-0.008155,-0.40402,-0.009754,-1.949214,-0.100739,0.848684,-0.994246,2180,38,228727.2,1
CustomerId_1432,-0.799047,0.746738,2000.0,-0.349252,0.0,0.165168,-0.055062,-0.446132,0.0,-1.949214,-0.100739,0.848684,-0.994246,2265,1,2000.0,0
CustomerId_2858,-0.799047,0.746738,10000.0,2.379557,0.0,0.16936,-0.040815,-0.414264,-0.140992,-1.949214,-0.100739,0.848684,-0.994246,2180,29,93400.0,1
CustomerId_598,-0.799047,0.746738,500.0,-0.349252,0.0,0.165446,-0.054887,-0.442717,-0.165175,-1.742898,-0.100739,0.848684,-0.994246,2263,4,8060.0,0
CustomerId_1053,0.491064,0.746738,600.0,-0.349252,0.0,0.165709,-0.055415,-0.439303,-0.181262,-1.742898,-0.100739,0.848684,-0.994246,2264,7,13788.0,0
CustomerId_3052,-0.799047,0.746738,890.0,-0.349252,0.0,0.165567,-0.067997,-0.433612,-0.194972,-1.742898,-0.100739,0.848684,-0.994246,2236,12,10700.0,1
CustomerId_3105,0.491064,0.746738,500.0,-0.349252,0.0,0.172394,-0.069046,-0.22078,-0.194733,-1.742898,-0.100739,0.848684,-0.994246,2175,199,159548.0,1
CustomerId_3507,-0.799047,0.746738,500.0,2.379557,0.0,0.165368,-0.071666,-0.43475,-0.193976,-1.742898,-0.100739,0.848684,-0.994246,2215,11,6350.0,1


**Correlation analysis to select the best features**

In [None]:
# Correlation-based feature selection against the binary target column.
# NOTE(review): the selection uses every row, i.e. it happens before any
# train/test split — confirm this is acceptable or move it after the split.
# NOTE(review): Transaction_Count and Frequency show identical correlations in
# the ranking output, which suggests they duplicate each other — consider
# keeping only one of them.
CORRELATION_THRESHOLD = 0.1   # minimum absolute correlation with the target to keep a feature
TARGET_COLUMN = 'Risk_Label'

# This cell replaces `features` with the selected subset, which no longer
# contains the target. Running it a second time would therefore fail; raise
# the same exception type with a message that explains how to recover.
if TARGET_COLUMN not in features.columns:
    raise KeyError(
        f"'{TARGET_COLUMN}' is not a column of `features`; this cell has probably "
        "already been run. Re-run the merge cell to rebuild `features` first."
    )

# Calculate the correlation matrix (restricted to numeric columns so a stray
# non-numeric column cannot make the computation fail)
correlation_matrix = features.corr(numeric_only=True)

# Get the absolute correlation of every column with the target column
corr_with_target = correlation_matrix[TARGET_COLUMN].abs()

# Select features whose correlation exceeds the threshold. The target's own
# entry is dropped by label, which also works when its self-correlation is NaN
# (where list.remove would raise ValueError because it was never selected).
feature_correlations = corr_with_target.drop(labels=TARGET_COLUMN)
selected_features = feature_correlations[
    feature_correlations > CORRELATION_THRESHOLD
].index.tolist()

# Keep only the selected feature columns (the target is excluded)
features = features[selected_features]

In [None]:
# Rank all columns by absolute correlation with Risk_Label (strongest first);
# the target itself appears in the listing with its self-correlation.
corr_with_target.sort_values(ascending=False).reset_index(name='Correlation with Risk_Label')

Unnamed: 0,index,Correlation with Risk_Label
0,Risk_Label,1.0
1,Recency,0.35363
2,Transaction_Count,0.217522
3,Frequency,0.217522
4,Transaction_Year,0.154506
5,Transaction_Month,0.140093
6,ProductCategory,0.130532
7,PricingStrategy,0.108829
8,Transaction_Day,0.093316
9,Amount,0.088088


In [None]:
# Show which feature columns remain after the correlation-based selection
features.columns

Index(['ProductCategory', 'PricingStrategy', 'Transaction_Count',
       'Transaction_Month', 'Transaction_Year', 'Recency', 'Frequency'],
      dtype='object')