In [15]:
import pandas as pd
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Project-local helpers live in ../scripts; extend the path before importing them.
sys.path.append('../scripts')
from data_loader import *
from credit_score import *

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

In [11]:
# Read the preprocessed model input and count the missing values in each column
model_input_path = 'processed_model_input.parquet'
df = pd.read_parquet(model_input_path)
df.isnull().sum()

RFMS_Score                  0
RFMS_bin                    0
Amount                      0
Value                       0
FraudResult                 0
TotalTransactionAmount      0
AverageTransactionAmount    0
TransactionCount            0
StdTransactionAmount        0
TransactionHour             0
TransactionDay              0
TransactionMonth            0
TransactionYear             0
Recency                     0
Frequency                   0
Monetary                    0
StdDev                      0
Assessment_Binary           0
RFMS_bin_woe                0
dtype: int64

In [12]:
# Columns kept for modelling: the RFMS-related features plus the
# Assessment_Binary column, which is used as the target further down.
selected_features = [
    'RFMS_Score',
    'RFMS_bin',
    'Recency',
    'Frequency',
    'Monetary',
    'StdDev',
    'Assessment_Binary',
]
final_df = df.loc[:, selected_features]

In [13]:
# Sanity check: (rows, columns) of the modelling frame
final_df.shape

(94593, 7)

In [14]:
# Preview the first rows of the selected columns
final_df.head()

Unnamed: 0,RFMS_Score,RFMS_bin,Recency,Frequency,Monetary,StdDev,Assessment_Binary
0,-0.900552,0,-1.009137,-0.719608,0.766793,-0.323907,0
1,-0.907971,0,-1.009137,1.441846,-1.281614,-0.447545,0
2,-1.106788,0,-1.000266,-0.724095,0.767108,-0.623037,0
3,0.202991,1,-1.009137,-0.722413,0.770518,1.250644,0
4,-0.907971,0,-1.009137,1.441846,-1.281614,-0.447545,0


# Model 1 - model that assigns risk probability for a new customer

In [16]:
# Separate the label column from the predictors
y = final_df['Assessment_Binary']
X = final_df.drop(columns=['Assessment_Binary'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [17]:
# Logistic regression classifier with the library's default hyperparameters.
# It is only instantiated here; cross-validation and fitting happen in later cells.
assessment_model = LogisticRegression()

In [18]:
# Cross-validate on the training split only, scoring all four metrics in one pass.
# cross_validate fits the model once per fold (5 fits) instead of once per fold
# per metric (20 fits with four separate cross_val_score calls). The integer
# cv=5 produces the same deterministic folds, so the scores are the same.
# NOTE(review): scores this close to 1.0 deserve a leakage check — confirm that
# Assessment_Binary is not derived directly from RFMS_Score / RFMS_bin.
cv_results = cross_validate(
    assessment_model,
    X_train,
    y_train,
    cv=5,
    scoring=('accuracy', 'precision', 'recall', 'f1'),
)

# Per-fold score arrays, exposed under the names later cells may rely on
cv_accuracy = cv_results['test_accuracy']
cv_precision = cv_results['test_precision']
cv_recall = cv_results['test_recall']
cv_f1 = cv_results['test_f1']

print("Cross-Validation Accuracy:", cv_accuracy.mean())
print("Cross-Validation Precision:", cv_precision.mean())
print("Cross-Validation Recall:", cv_recall.mean())
print("Cross-Validation F1-score:", cv_f1.mean())

Cross-Validation Accuracy: 0.9999603567888998
Cross-Validation Precision: 0.9999471109348143
Cross-Validation Recall: 0.9999735554674072
Cross-Validation F1-score: 0.9999603305785124


## Model Training

In [19]:
# Fit the logistic regression on the whole training split
assessment_model.fit(X_train, y_train)

## Model Evaluation

In [20]:
# Evaluate the model on the testing set.
# Predict once and reuse the labels for every metric instead of re-running
# inference on X_test for each score.
test_predictions = assessment_model.predict(X_test)

test_accuracy = accuracy_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

print("Testing Accuracy:", test_accuracy)
print("Testing Precision:", test_precision)
print("Testing Recall:", test_recall)
print("Testing F1-score:", test_f1)

Testing Accuracy: 0.999894286167345
Testing Precision: 0.9997892296343134
Testing Recall: 1.0
Testing F1-score: 0.9998946037099494


## Predict on the test set

In [21]:
# Class-label predictions for the held-out test rows
y_pred = assessment_model.predict(X_test)

In [22]:
# Display the predicted labels
y_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int32)

# Model 2 - model that assigns credit score from risk probability estimates


In [5]:
# Path to the processed Parquet file. Each directory is passed to os.path.join
# as its own component so no separator is hard-coded.
filename = 'processed_model_input.parquet'
path = os.path.join('..', 'data', 'model_input', filename)

# Load the dataset and preview the first rows.
# NOTE(review): load_data is not defined in this notebook — presumably it comes
# from the data_loader star import; confirm it reads Parquet.
featured_df = load_data(path)
featured_df.head()

Unnamed: 0,CustomerId,CurrencyCode,CountryCode,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionHour,TransactionDay,TransactionMonth,TransactionYear,PricingStrategy,FraudResult,RFMS_Score,RFMS_bin,Assessment_Binary,RFMS_bin_woe,default_rate_per_bin,woe_per_bin
0,0.595592,0.0,0.0,-1.579657,-0.797775,0.748262,-0.099739,-0.292853,-2.155,-0.101329,0.848594,-0.994095,-0.352001,-0.014114,-1.281459,0,0,-10.367253,0.0,-10.367253
1,0.595592,0.0,0.0,0.913723,0.492978,-1.172748,-0.1595,-0.352663,-2.155,-0.101329,0.848594,-0.994095,-0.352001,-0.014114,-1.29237,0,0,-10.367253,0.0,-10.367253
2,0.808675,0.0,0.0,-1.718179,-0.797775,0.748262,-0.129034,-0.323368,-2.155,-0.101329,0.848594,-0.994095,-0.352001,-0.014114,-1.576665,0,0,-10.367253,0.0,-10.367253
3,1.694602,0.0,0.0,-0.194446,4.365238,0.748262,1.013448,0.976583,-1.948696,-0.101329,0.848594,-0.994095,-0.352001,-0.014114,0.298088,1,0,0.011755,0.502955,0.011755
4,1.694602,0.0,0.0,0.913723,0.492978,-1.172748,-0.196059,-0.314579,-1.948696,-0.101329,0.848594,-0.994095,-0.352001,-0.014114,-1.29237,0,0,-10.367253,0.0,-10.367253


## Calculate credit score

In [6]:
# Derive credit scores with the project helper. Judging by the preview below,
# the returned frame carries two extra columns: credit_score and fico_credit_score.
# NOTE(review): calculate_credit_score is not defined in this notebook
# (presumably from the credit_score star import) — confirm how the scores are
# computed and whether featured_df is modified in place.
credit_df = calculate_credit_score(featured_df)
credit_df.head()

Unnamed: 0,CustomerId,CurrencyCode,CountryCode,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionHour,TransactionDay,...,PricingStrategy,FraudResult,RFMS_Score,RFMS_bin,Assessment_Binary,RFMS_bin_woe,default_rate_per_bin,woe_per_bin,credit_score,fico_credit_score
0,0.595592,0.0,0.0,-1.579657,-0.797775,0.748262,-0.099739,-0.292853,-2.155,-0.101329,...,-0.352001,-0.014114,-1.281459,0,0,-10.367253,0.0,-10.367253,-38.417309,300.0
1,0.595592,0.0,0.0,0.913723,0.492978,-1.172748,-0.1595,-0.352663,-2.155,-0.101329,...,-0.352001,-0.014114,-1.29237,0,0,-10.367253,0.0,-10.367253,-38.626177,300.0
2,0.808675,0.0,0.0,-1.718179,-0.797775,0.748262,-0.129034,-0.323368,-2.155,-0.101329,...,-0.352001,-0.014114,-1.576665,0,0,-10.367253,0.0,-10.367253,-44.068405,300.0
3,1.694602,0.0,0.0,-0.194446,4.365238,0.748262,1.013448,0.976583,-1.948696,-0.101329,...,-0.352001,-0.014114,0.298088,1,0,0.011755,0.502955,0.011755,5.722004,848.205644
4,1.694602,0.0,0.0,0.913723,0.492978,-1.172748,-0.196059,-0.314579,-1.948696,-0.101329,...,-0.352001,-0.014114,-1.29237,0,0,-10.367253,0.0,-10.367253,-38.626177,300.0


In [7]:
# Predictors and regression target for the credit-score model
feature_columns = ['RFMS_Score', 'RFMS_bin_woe']
y = credit_df['fico_credit_score']
X = credit_df[feature_columns]

In [8]:
# Split into training and testing sets first, so the scaler never sees the test rows
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: learn the mean/std on the training split only, then apply
# the same transform to the test split (avoids leaking test-set statistics)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept under its existing name in case
# other cells use it
X_scaled = scaler.transform(X)

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest regressor on the training split (seeded for reproducibility)
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the RandomForestRegressor on the held-out test set
y_pred = model.predict(X_test)

# Compute every metric up front: MAE, MSE, RMSE (square root of the MSE) and R2
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Report each metric rounded to two decimals
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Absolute Error: 0.00
Mean Squared Error: 0.00
Root Mean Squared Error: 0.04
R-squared (R2) Score: 1.00


In [14]:
import pandas as pd

# Put the true and predicted scores side by side, indexed like y_test
results = pd.DataFrame({'Real_Value': y_test}).assign(Predicted_Value=y_pred)

# Show the first few rows
print(results.head())

       Real_Value  Predicted_Value
71514  850.000000       850.000000
29275  300.000000       300.000006
78507  850.000000       850.000000
24345  326.811625       326.625270
58924  850.000000       850.000000
