## Import Libraries

In [2]:
import sys
sys.path.append('..')
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from src.data_processing import load_data, CreditRiskFeatureEngineer
from src.rmf_target import RFMBasedTargetBuilder
from src.train import Trainer
# Shared helper instances reused by the cells below:
#   trainer          - splits the data, fits / tunes models and evaluates them
#   feature_engineer - turns raw transactions into model features (fit_transform)
#   rfm_builder      - computes RFM values and assigns the is_high_risk proxy label
trainer = Trainer()
feature_engineer = CreditRiskFeatureEngineer()
rfm_builder = RFMBasedTargetBuilder()



## Load data

In [3]:
# Read the raw transaction table and preview its first five rows.
df = load_data('../data/raw/data.csv')
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


## Feature Engineering

In [4]:
# fit_transform returns two objects:
#   X_features  - the processed feature matrix that is later fed to the models
#   features_df - a readable feature table keyed by CustomerId (previewed below;
#                 it appears to hold one row per customer - confirm in
#                 CreditRiskFeatureEngineer)
X_features, features_df = feature_engineer.fit_transform(df)

features_df.head()

Unnamed: 0,CustomerId,total_amount,avg_amount,transaction_count,std_amount,transaction_hour,transaction_day,transaction_month,transaction_year,ProductCategory,ChannelId,ProviderId,PricingStrategy
0,CustomerId_1,-10000.0,-10000.0,1,,16.0,21.0,11.0,2018.0,airtime,ChannelId_2,ProviderId_4,4
1,CustomerId_10,-10000.0,-10000.0,1,,16.0,21.0,11.0,2018.0,airtime,ChannelId_2,ProviderId_4,4
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,7.8,16.0,11.0,2018.0,financial_services,ChannelId_3,ProviderId_5,2
3,CustomerId_1002,4225.0,384.090909,11,560.498966,13.454545,14.727273,8.636364,2018.272727,airtime,ChannelId_3,ProviderId_5,4
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,14.333333,1.0,2.0,2019.0,financial_services,ChannelId_3,ProviderId_5,2


## Processed features

In [5]:
# Show the processed feature matrix; the repr is abbreviated and ends with its shape.
X_features

array([[-0.06689056, -0.15336429, -0.25345907, ...,  0.        ,
         0.        ,  1.        ],
       [-0.06689056, -0.15336429, -0.25345907, ...,  0.        ,
         0.        ,  1.        ],
       [-0.05584873, -0.06987027, -0.21218649, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.13696844, -0.06161104,  0.77835538, ...,  0.        ,
         1.        ,  0.        ],
       [-0.01204948, -0.04496239, -0.08836876, ...,  0.        ,
         1.        ,  0.        ],
       [-0.02640386, -0.06661726, -0.03677804, ...,  0.        ,
         0.        ,  1.        ]], shape=(3742, 31))

## Proxy Target Engineering

In [6]:
# Build the proxy target in two steps: compute RFM values from the raw
# transactions, then derive the is_high_risk label from them.
# NOTE(review): the labelling rule lives in RFMBasedTargetBuilder and is not
# visible here - confirm how "high risk" is defined before interpreting results.
rfm_df = rfm_builder.compute_rfm(df)
target_df = rfm_builder.assign_high_risk_label(rfm_df)


# Preview the CustomerId -> is_high_risk mapping.
target_df.head()

Unnamed: 0,CustomerId,is_high_risk
0,CustomerId_1,1
1,CustomerId_10,1
2,CustomerId_1001,1
3,CustomerId_1002,0
4,CustomerId_1003,0


In [7]:
# Class balance of the proxy label (number of customers per is_high_risk value).
target_df["is_high_risk"].value_counts()

is_high_risk
0    2315
1    1427
Name: count, dtype: int64

## Final Dataset

In [8]:
# Attach the proxy label to every feature row; a left join keeps all rows of
# features_df in their original order.
final_dataset = features_df.merge(target_df, how="left", on="CustomerId")

final_dataset.head()

Unnamed: 0,CustomerId,total_amount,avg_amount,transaction_count,std_amount,transaction_hour,transaction_day,transaction_month,transaction_year,ProductCategory,ChannelId,ProviderId,PricingStrategy,is_high_risk
0,CustomerId_1,-10000.0,-10000.0,1,,16.0,21.0,11.0,2018.0,airtime,ChannelId_2,ProviderId_4,4,1
1,CustomerId_10,-10000.0,-10000.0,1,,16.0,21.0,11.0,2018.0,airtime,ChannelId_2,ProviderId_4,4,1
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,7.8,16.0,11.0,2018.0,financial_services,ChannelId_3,ProviderId_5,2,1
3,CustomerId_1002,4225.0,384.090909,11,560.498966,13.454545,14.727273,8.636364,2018.272727,airtime,ChannelId_3,ProviderId_5,4,0
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,14.333333,1.0,2.0,2019.0,financial_services,ChannelId_3,ProviderId_5,2,0


## Prepare Data


In [9]:
# Use X_features as the predictors and is_high_risk as the target.
# NOTE(review): this assumes the rows of X_features are in the same order as
# the rows of features_df (and therefore final_dataset) - confirm in
# CreditRiskFeatureEngineer.fit_transform.
X = X_features
y = final_dataset["is_high_risk"]

# final_dataset comes from a left merge, which can leave customers without a
# label (NaN) or duplicate rows if CustomerId is not unique in target_df.
# Either case would silently misalign X and y, so fail fast with a clear message.
assert len(X) == len(y), (
    f"Feature/target row mismatch: X has {len(X)} rows, y has {len(y)} rows"
)
assert y.notna().all(), "Some customers have no is_high_risk label after the merge"

trainer.split_data(X, y)

## Baseline Logistic Regression

In [10]:
# Fit an untuned Logistic Regression as the reference model and report its metrics.
baseline_logreg = LogisticRegression(max_iter=1000)
trainer.train_baseline(baseline_logreg)

baseline_logreg_metrics = trainer.evaluate_model()
print(f"Baseline Logistic Regression: {baseline_logreg_metrics}")

Baseline Logistic Regression: {'accuracy': 0.8878504672897196, 'precision': 0.8430034129692833, 'recall': 0.8666666666666667, 'f1_score': 0.8546712802768166, 'roc_auc': 0.9618345432546885}


In [12]:
# Search space for Logistic Regression: four values of C, lbfgs solver only.
param_grid = dict(C=[0.01, 0.1, 1, 10], solver=["lbfgs"])

# Tune through the shared trainer; it returns the best model and its parameters.
best_logreg, best_params = trainer.hyperparameter_tuning(
    param_grid=param_grid,
    model=LogisticRegression(max_iter=1000),
)

print("Best params:", best_params)
print("Tuned Logistic Regression:", trainer.evaluate_model())


Best params: {'C': 10, 'solver': 'lbfgs'}
Tuned Logistic Regression: {'accuracy': 0.8918558077436582, 'precision': 0.8566433566433567, 'recall': 0.8596491228070176, 'f1_score': 0.8581436077057794, 'roc_auc': 0.9675060496067756}


In [13]:
# Untuned Random Forest with a fixed seed so the run is repeatable.
rf = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)

# Fit it through the shared trainer and keep the metrics for later comparison.
trainer.train_baseline(rf)
rf_metrics = trainer.evaluate_model()

print("Random Forest:", rf_metrics)

Random Forest: {'accuracy': 0.9465954606141522, 'precision': 0.9959514170040485, 'recall': 0.8631578947368421, 'f1_score': 0.924812030075188, 'roc_auc': 0.9871748336358136}


In [15]:
# Search space for the Random Forest; class_weight is fixed to "balanced".
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 4, 6, 10],
    "min_samples_split": [2, 5, 10],
    "class_weight": ["balanced"]
}

# Store the results under Random-Forest-specific names so the tuned Logistic
# Regression (best_logreg / best_params) from the earlier cell is not overwritten.
# NOTE(review): n_estimators and max_depth are also part of the grid, so the
# values passed to the constructor are presumably replaced during the search -
# confirm in Trainer.hyperparameter_tuning.
best_rf, best_rf_params = trainer.hyperparameter_tuning(
    model=RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        random_state=42
    ),
    param_grid=rf_param_grid
)

print("Best params:", best_rf_params)
print("Tuned Random Forest:", trainer.evaluate_model())

Best params: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Tuned Logistic Regression: {'accuracy': 0.9746328437917223, 'precision': 0.9854014598540146, 'recall': 0.9473684210526315, 'f1_score': 0.9660107334525939, 'roc_auc': 0.9942264065335753}
