In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from xverse.transformer import WOE
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw credit-scoring transactions into the working frame `df`.
# The path is relative, so it is resolved against the notebook's working directory.
raw_csv_path = '../data/credit_scoring_data.csv'
df = pd.read_csv(raw_csv_path)


In [None]:
# Per-customer aggregates of TransactionAmount. `transform` broadcasts each
# group's statistic back onto every row of that group, so the frame keeps its
# original number of rows.
amount_by_customer = df.groupby('CustomerID')['TransactionAmount']
df['Total_Transaction_Amount'] = amount_by_customer.transform('sum')
df['Avg_Transaction_Amount'] = amount_by_customer.transform('mean')
df['Transaction_Count'] = amount_by_customer.transform('count')
# The sample standard deviation is NaN for a customer with a single amount;
# those customers are given a spread of 0 instead.
df['Std_Transaction_Amount'] = amount_by_customer.transform('std').fillna(0)


# Extract Features


In [None]:
# Extract calendar features (hour, day of month, month, year) from the
# transaction timestamp.
# TransactionDate is parsed once and the parsed Series is reused; parsing the
# full column separately for each feature repeated the same work four times.
transaction_ts = pd.to_datetime(df['TransactionDate'])
df['Transaction_Hour'] = transaction_ts.dt.hour
df['Transaction_Day'] = transaction_ts.dt.day
df['Transaction_Month'] = transaction_ts.dt.month
df['Transaction_Year'] = transaction_ts.dt.year

# Encode Categorical Variables


In [None]:
# Encode Categorical Variables
# 1) Integer label for each TransactionCategory value.
label_enc = LabelEncoder()
df['Category_Label'] = label_enc.fit_transform(df['TransactionCategory'])

# 2) One-hot indicator columns for the same variable (dense array output).
ohe = OneHotEncoder(sparse_output=False)
categorical_encoded = ohe.fit_transform(df[['TransactionCategory']])
# Build the indicator frame on df's own index. pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign the indicators (NaN rows
# and extra rows) whenever df does not carry a default RangeIndex.
categorical_encoded_df = pd.DataFrame(
    categorical_encoded,
    columns=ohe.get_feature_names_out(),
    index=df.index,
)
df = pd.concat([df, categorical_encoded_df], axis=1)

In [None]:
# Handle Missing Values
# Impute missing values in numeric columns with the column mean.
# numeric_only=True is required here: df still contains string columns
# (e.g. TransactionDate, TransactionCategory), and DataFrame.mean() raises a
# TypeError on such columns in pandas >= 2.0 instead of silently skipping them.
# Columns without an entry in the means Series are left untouched by fillna.
# Reassigning instead of inplace=True keeps the step explicit about producing
# the imputed frame.
df = df.fillna(df.mean(numeric_only=True))


In [None]:
# Min-max rescale the continuous per-customer aggregates. The scaler is fitted
# on these three columns and the scaled values overwrite them in place.
scaler = MinMaxScaler()
columns_to_scale = ['Total_Transaction_Amount', 'Avg_Transaction_Amount', 'Std_Transaction_Amount']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Weight of Evidence (WoE) Binning


In [None]:
# Weight of Evidence (WoE) Binning
# Fit the WoE transformer on every column except the target, then apply it to
# the same set of predictor columns. Each call receives its own freshly
# dropped copy of the predictors.
target_column = 'Default'
woe = WOE()
woe.fit(df.drop(target_column, axis=1), df[target_column])
df_woe = woe.transform(df.drop(target_column, axis=1))

# Construct Default Estimator (Proxy)


In [None]:

# Construct Default Estimator (Proxy)
# RFMS proxy score: total transaction amount multiplied by transaction count.
df['RFMS_Score'] = df['Total_Transaction_Amount'] * df['Transaction_Count']
# Compute the median once. Evaluating df['RFMS_Score'].median() inside a
# per-row lambda recomputed it for every row, making the labelling quadratic.
rfms_median = df['RFMS_Score'].median()
# Scores strictly above the median are 'Good'; everything else (ties and NaN
# scores included, since NaN > x is False) is 'Bad'.
df['Default_Label'] = np.where(df['RFMS_Score'] > rfms_median, 'Good', 'Bad')

# Visualizing RFMS Distribution


In [None]:
# Visualizing RFMS Distribution
# Histogram (30 bins) of the RFMS score with a KDE curve overlaid, drawn on an
# explicitly created 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['RFMS_Score'], kde=True, bins=30, ax=ax)
ax.set_title('RFMS Score Distribution')
plt.show()

print("Feature Engineering completed successfully!")