In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import scorecardpy as sc

In [2]:
# Load your dataset (replace 'your_file.csv' with your actual file path)
df = pd.read_csv('C:/Users/elbet/OneDrive/Desktop/Ten/week-6/github-notebook/Credit-Scoring-Model-/data/data/cleaned_data.csv')

## 1. Create Aggregate Features

In [3]:
# Total Transaction Amount per customer
df['TotalTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('sum')

# Average Transaction Amount per customer
df['AverageTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('mean')

# Transaction Count per customer
df['TransactionCount'] = df.groupby('CustomerId')['TransactionId'].transform('count')

# Standard Deviation of Transaction Amounts per customer
df['TransactionAmountStd'] = df.groupby('CustomerId')['Amount'].transform('std').fillna(0)

## 2. Extract Time-Based Features

In [4]:
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')
df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year

## 3. Encode Categorical Variables using WOE

In [5]:
# 3. Encode Categorical Variables using scorecardpy's WOE
# ---------------------
# Assuming 'FraudResult' is the target variable and 'ProductCategory', 'ProviderId', 'ChannelId' are the features
features = ['ProductCategory', 'ProviderId', 'ChannelId']

# Calculate the WOE and IV for each feature
bins = sc.woebin(df, y='FraudResult', x=features)

# Apply the WOE transformation to the dataset
df_woe = sc.woebin_ply(df, bins)

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols 

[INFO] converting into woe values ...


In [8]:
print(df_woe.columns)

Index(['AverageTransactionAmount', 'CountryCode', 'TransactionYear',
       'TransactionStartTime', 'TransactionAmountStd',
       'TotalTransactionAmount', 'TransactionMonth', 'CustomerId', 'Amount',
       'FraudResult', 'TransactionId', 'TransactionCount', 'Value',
       'TransactionDay', 'ProductId', 'SubscriptionId', 'CurrencyCode',
       'PricingStrategy', 'AccountId', 'BatchId', 'TransactionHour',
       'ProviderId_woe', 'ProductCategory_woe', 'ChannelId_woe'],
      dtype='object')


## 4. Handle Missing Values

In [9]:
# Fill missing numerical columns with median
for col in ['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']:
    df_woe[col].fillna(df_woe[col].median(), inplace=True)

# Handle missing values for categorical WOE columns
for col in ['ProductCategory_woe', 'CurrencyCode', 'ProviderId_woe', 'ChannelId_woe']:
    df_woe[col].fillna(df_woe[col].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_woe[col].fillna(df_woe[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_woe[col].fillna(df_woe[col].mode()[0], inplace=True)


## 5. Normalize/Standardize Numerical Features

In [10]:
# Standardize (mean=0, std=1) the numerical features
scaler = StandardScaler()
df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']] = scaler.fit_transform(
    df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']]
)

In [11]:
# Final check of the dataset
print(df_woe.head())

   AverageTransactionAmount  CountryCode  TransactionYear  \
0                 -0.067623          256             2018   
1                 -0.067623          256             2018   
2                 -0.072568          256             2018   
3                 -0.008155          256             2018   
4                 -0.008155          256             2018   

       TransactionStartTime  TransactionAmountStd  TotalTransactionAmount  \
0 2018-11-15 02:18:49+00:00             -0.167016                0.170118   
1 2018-11-15 02:19:08+00:00             -0.167016                0.170118   
2 2018-11-15 02:44:21+00:00             -0.201209                0.165122   
3 2018-11-15 03:32:55+00:00             -0.008243                0.175567   
4 2018-11-15 03:34:21+00:00             -0.008243                0.175567   

   TransactionMonth       CustomerId    Amount  FraudResult  ...  \
0                11  CustomerId_4406 -0.046371            0  ...   
1                11  CustomerId_44

In [12]:
# Save the processed dataset to CSV (optional)
df_woe.to_csv('C:/Users/elbet/OneDrive/Desktop/Ten/week-6/github-notebook/Credit-Scoring-Model-/data/data/processed_data.csv', index=False)