In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from xverse.transformer import WOE
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:

# Load dataset
file_path = "../data/credit_scoring_data.csv"
df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
print(df.head())


         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId   Amount  Value  TransactionStart

In [11]:
# Aggregate features for each customer
agg_features = df.groupby("CustomerId").agg(
    total_transaction_amount=("Amount", "sum"),
    avg_transaction_amount=("Amount", "mean"),
    transaction_count=("Amount", "count"),
    std_transaction_amount=("Amount", "std")
).reset_index()

# Merge back with the original DataFrame if necessary
df = df.merge(agg_features, on="CustomerId", how="left")

print(agg_features.head())


        CustomerId  total_transaction_amount  avg_transaction_amount  \
0     CustomerId_1                  -10000.0           -10000.000000   
1    CustomerId_10                  -10000.0           -10000.000000   
2  CustomerId_1001                   20000.0             4000.000000   
3  CustomerId_1002                    4225.0              384.090909   
4  CustomerId_1003                   20000.0             3333.333333   

   transaction_count  std_transaction_amount  
0                  1                     NaN  
1                  1                     NaN  
2                  5             6558.963333  
3                 11              560.498966  
4                  6             6030.478146  


# Extract Features


In [12]:
# Ensure TransactionStartTime is in datetime format
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])

# Extract features
df["transaction_hour"] = df["TransactionStartTime"].dt.hour
df["transaction_day"] = df["TransactionStartTime"].dt.day
df["transaction_month"] = df["TransactionStartTime"].dt.month
df["transaction_year"] = df["TransactionStartTime"].dt.year

print(df[["TransactionStartTime", "transaction_hour", "transaction_day", "transaction_month", "transaction_year"]].head())


       TransactionStartTime  transaction_hour  transaction_day  \
0 2018-11-15 02:18:49+00:00                 2               15   
1 2018-11-15 02:19:08+00:00                 2               15   
2 2018-11-15 02:44:21+00:00                 2               15   
3 2018-11-15 03:32:55+00:00                 3               15   
4 2018-11-15 03:34:21+00:00                 3               15   

   transaction_month  transaction_year  
0                 11              2018  
1                 11              2018  
2                 11              2018  
3                 11              2018  
4                 11              2018  


# Encode Categorical Variables


In [13]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=["ProductCategory", "ChannelId", "PricingStrategy"], drop_first=True)

# For demonstration: Label Encoding (if required)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["CurrencyCode_encoded"] = le.fit_transform(df["CurrencyCode"])



In [14]:
from datetime import datetime
import pandas as pd

# Ensure TransactionStartTime is in datetime format and timezone-naive
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"]).dt.tz_localize(None)

# Recency: Days since the last transaction
current_date = pd.Timestamp.now(tz=None)  # Ensure current_date is timezone-naive
df["transaction_date"] = df["TransactionStartTime"]  # Keep it as a datetime object
last_transaction = df.groupby("CustomerId")["transaction_date"].max().reset_index()

# Calculate recency in days (difference between current date and last transaction date)
last_transaction["recency"] = (current_date - last_transaction["transaction_date"]).dt.days

# Frequency, Monetary, Size
rfms = df.groupby("CustomerId").agg(
    frequency=("TransactionId", "count"),  # Count of transactions
    monetary=("Amount", "sum"),           # Sum of transaction amounts
    size=("Amount", "mean")               # Average transaction amount
).reset_index()

# Merge recency with RFMS data
rfms = rfms.merge(last_transaction[["CustomerId", "recency"]], on="CustomerId", how="left")

# Display the result
print(rfms.head())


        CustomerId  frequency  monetary          size  recency
0     CustomerId_1          1  -10000.0 -10000.000000     2255
1    CustomerId_10          1  -10000.0 -10000.000000     2255
2  CustomerId_1001          5   20000.0   4000.000000     2261
3  CustomerId_1002         11    4225.0    384.090909     2198
4  CustomerId_1003          6   20000.0   3333.333333     2184


In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

# Visualize RFMS
sns.pairplot(rfms, diag_kind="kde")
plt.show()

# Define thresholds for high/low RFMS scores (adjust thresholds based on visualization)
rfms["rfms_score"] = rfms.apply(lambda x: "good" if (x["recency"] < 30 and x["frequency"] > 5 and x["monetary"] > 1000) else "bad", axis=1)


  plt.show()


In [16]:
from woe import WoE

# Example WoE binning for ProductCategory
woe = WoE()
woe.fit(df["ProductCategory_airtime"], df["FraudResult"])
df["ProductCategory_airtime_woe"] = woe.transform(df["ProductCategory_airtime"])
print(woe.summary())


ImportError: cannot import name 'WoE' from 'woe' (/home/azazh/Documents/10-acadamy/w-6/CreditScoring/envcredit/lib/python3.10/site-packages/woe/__init__.py)

In [None]:
# # Impute missing values (example with std deviation)
# df['Std_Dev_Transaction_Amount'] = df['Std_Dev_Transaction_Amount'].fillna(df['Std_Dev_Transaction_Amount'].mean())


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization
scaler = MinMaxScaler()
df['Amount_Normalized'] = scaler.fit_transform(df[['Amount']])

# Standardization
standardizer = StandardScaler()
df['Total_Transaction_Amount_Standardized'] = standardizer.fit_transform(df[['Total_Transaction_Amount']])

print(df[['Amount', 'Amount_Normalized', 'Total_Transaction_Amount', 'Total_Transaction_Amount_Standardized']])


# RFMS calculation


In [8]:
# RFMS calculation
df['Recency'] = (df['TransactionStartTime'].max() - df['TransactionStartTime']).dt.days
rfms = df.groupby('CustomerId').agg(
    Recency=('Recency', 'mean'),
    Frequency=('TransactionId', 'count'),
    Monetary=('Amount', 'sum'),
    Std_Dev=('Amount', 'std')
).reset_index()

# Example thresholds for RFMS classification
rfms['RFMS_Score'] = (
    0.3 * rfms['Recency'].rank(ascending=False) +
    0.3 * rfms['Frequency'].rank(ascending=True) +
    0.4 * rfms['Monetary'].rank(ascending=True)
)
rfms['Risk_Label'] = ['Good' if score >= rfms['RFMS_Score'].median() else 'Bad' for score in rfms['RFMS_Score']]
print(rfms)


In [None]:
from xverse.transformer import WOE

# Initialize WoE binning
woe = WOE()
df['FraudResult'] = [0, 0, 0]  # Sample FraudResult values
woe.fit(df, df['FraudResult'])
transformed_df = woe.transform(df)

print(transformed_df.head())


# Construct Default Estimator (Proxy)


In [9]:
# # Construct Default Estimator (Proxy)
# df['RFMS_Score'] = df['Total_Transaction_Amount'] * df['Transaction_Count']
# df['Default_Label'] = df['RFMS_Score'].apply(lambda x: 'Good' if x > df['RFMS_Score'].median() else 'Bad')


# Visualizing RFMS Distribution


In [None]:
# # Visualizing RFMS Distribution
# plt.figure(figsize=(10, 6))
# sns.histplot(df['RFMS_Score'], kde=True, bins=30)
# plt.title('RFMS Score Distribution')
# plt.show()

# print("Feature Engineering completed successfully!")