In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [24]:
df = pd.read_csv("../data/data.csv")

In [25]:
df.dtypes

TransactionId            object
BatchId                  object
AccountId                object
SubscriptionId           object
CustomerId               object
CurrencyCode             object
CountryCode               int64
ProviderId               object
ProductId                object
ProductCategory          object
ChannelId                object
Amount                  float64
Value                     int64
TransactionStartTime     object
PricingStrategy           int64
FraudResult               int64
dtype: object

In [26]:
# Grouping by CustomerId to create aggregate features
aggregated_features = df.groupby('CustomerId').agg({
    'Amount': ['sum', 'mean', 'max', 'std'],  # Spending patterns
    'TransactionStartTime': 'count'  # Number of transactions
})

# Rename columns for clarity
aggregated_features.columns = ['_'.join(col).strip() for col in aggregated_features.columns]
aggregated_features.reset_index(inplace=True)

aggregated_features

Unnamed: 0,CustomerId,Amount_sum,Amount_mean,Amount_max,Amount_std,TransactionStartTime_count
0,CustomerId_1,-10000.0,-10000.000000,-10000.0,,1
1,CustomerId_10,-10000.0,-10000.000000,-10000.0,,1
2,CustomerId_1001,20000.0,4000.000000,10000.0,6558.963333,5
3,CustomerId_1002,4225.0,384.090909,1500.0,560.498966,11
4,CustomerId_1003,20000.0,3333.333333,10000.0,6030.478146,6
...,...,...,...,...,...,...
3737,CustomerId_992,20000.0,3333.333333,10000.0,6088.240030,6
3738,CustomerId_993,20000.0,4000.000000,10000.0,6745.368782,5
3739,CustomerId_994,543873.0,5384.881188,90000.0,14800.656784,101
3740,CustomerId_996,139000.0,8176.470588,10000.0,4433.329648,17


In [27]:
import sys
sys.path.append('../scripts')

from feature_extraction import extract_transaction_features
# Extract features
df = extract_transaction_features(df, 'TransactionStartTime')
df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,Transaction Hour,Transaction Day,Transaction Month,Transaction Year
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,2,15,11,2018
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,2,15,11,2018
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,2,15,11,2018
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,3,15,11,2018
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,3,15,11,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-13 09:54:09+00:00,2,0,9,13,2,2019
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2019-02-13 09:54:25+00:00,2,0,9,13,2,2019
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-02-13 09:54:35+00:00,2,0,9,13,2,2019
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000,2019-02-13 10:01:10+00:00,2,0,10,13,2,2019


In [28]:
df.isnull().sum()

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
Transaction Hour        0
Transaction Day         0
Transaction Month       0
Transaction Year        0
dtype: int64

In [29]:
# Convert TransactionStartTime to datetime
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

In [30]:
# Encoding Categorical Variables
categorical_cols = ['CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId']
ohe = OneHotEncoder(drop='first', sparse_output=False)
df_encoded = pd.DataFrame(ohe.fit_transform(df[categorical_cols]))
df_encoded.columns = ohe.get_feature_names_out(categorical_cols)
df = df.drop(columns=categorical_cols).join(df_encoded)

# Normalize Numerical Features
scaler = StandardScaler()
numeric_cols = ['Amount', 'Value', 'Transaction Hour', 'Transaction Day', 'Transaction Month', 'Transaction Year']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Feature Engineering: RFMS (Recency, Frequency, Monetary, Stability)
df['Recency'] = df.groupby('CustomerId')['TransactionStartTime'].transform(lambda x: (pd.to_datetime(x).max() - pd.to_datetime(x)).dt.days)
df['Frequency'] = df.groupby('CustomerId')['TransactionId'].transform('count')
df['Monetary'] = df.groupby('CustomerId')['Amount'].transform('sum')
df['Stability'] = df.groupby('CustomerId')['Amount'].transform('std').fillna(0)

In [33]:
# Function to Calculate WoE and IV
def woe_iv(df, feature, target):
  df = df[[feature, target]].copy()
  df[feature] = pd.qcut(df[feature], q=10, duplicates='drop')  # Binning into deciles

  grouped = df.groupby(feature)[target].agg(['count', 'sum'])
  grouped.columns = ['Total', 'Bad']

  grouped['Good'] = grouped['Total'] - grouped['Bad']
  grouped['Bad_Dist'] = grouped['Bad'] / grouped['Bad'].sum()
  grouped['Good_Dist'] = grouped['Good'] / grouped['Good'].sum()
  grouped['WoE'] = np.log(grouped['Good_Dist'] / grouped['Bad_Dist']).replace({np.inf: 0, -np.inf: 0})
  grouped['IV'] = (grouped['Good_Dist'] - grouped['Bad_Dist']) * grouped['WoE']

  iv = grouped['IV'].sum()

  # Replace original values with WoE
  woe_map = grouped['WoE'].to_dict()
  df[feature] = df[feature].map(woe_map)

  return df[feature], iv

# Define Features to Apply WoE
rfms_features = ['Recency', 'Frequency', 'Monetary', 'Stability']
target_col = 'FraudResult'

# Apply WoE Transformation
iv_dict = {}  # Store IV values
for feature in rfms_features:
  df[feature], iv = woe_iv(df, feature, target_col)
  iv_dict[feature] = iv

print("Information Value (IV) of Features:", iv_dict)

Information Value (IV) of Features: {'Recency': np.float64(1.1103073712440035), 'Frequency': np.float64(0.6903273875475657), 'Monetary': np.float64(2.3330050493343535), 'Stability': np.float64(1.9624575764683714)}


In [34]:
# Define Target Variable (Binary Classification)
df['Creditworthiness'] = np.where(df['FraudResult'] == 0, 1, 0)  # Good = 1, Bad = 0
df.drop(columns=['FraudResult'], inplace=True)

In [39]:
# Split Data
X = df.drop(columns=['CustomerId', 'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId','TransactionStartTime', 'Creditworthiness'])
y = df['Creditworthiness']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Train Models
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [42]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [43]:
# Predictions
y_pred_logreg = logreg.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [44]:
from model_evaluation import evaluate_model
evaluate_model(y_test, y_pred_logreg, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")

Logistic Regression Performance:
Accuracy: 0.9984
Precision: 0.9985
Recall: 0.9999
F1 Score: 0.9992
ROC-AUC: 0.5972

Random Forest Performance:
Accuracy: 0.9998
Precision: 0.9999
Recall: 0.9998
F1 Score: 0.9999
ROC-AUC: 0.9860



In [46]:
import joblib

# Save the models
joblib.dump(logreg, "../models/logistic_regression.pkl")
joblib.dump(rf, "../models/random_forest.pkl")

['../models/random_forest.pkl']