<a href="https://colab.research.google.com/github/Abirslab/coll_mate/blob/main/base_model_for%20_Ai_COLLMATE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: install tensorflow

!pip install tensorflow



In [None]:
import tensorflow as tf

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


def generate_synthetic_collections_data(num_samples=10000, random_state=None):
    """
    Generate synthetic data for customers whose loans are already 30+ days overdue.
    The target is whether the bank manages to collect in the next 30 days.

    Parameters
    ----------
    num_samples : int, default 10000
        Number of synthetic customers (rows) to generate. Must be >= 0.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState`` so the dataset can be
        reproduced exactly. When None, NumPy's global random state is used
        (the original behaviour), so ``np.random.seed`` still controls the output.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the integer columns ``DaysOverdue`` (30-120),
        ``OutstandingBalance`` (1,000-50,000), ``MonthlyIncome`` (2,000-40,000),
        ``CollectionAttempts`` (0-5), ``CreditScore`` (300-850) and the binary
        target ``RecoveredNextMonth`` (1 = recovered, 0 = not recovered).

    Raises
    ------
    ValueError
        If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")

    # Use an isolated generator when a seed is supplied; otherwise fall back to
    # the module-level functions so existing callers see unchanged behaviour.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # NOTE: the order of the draws below is part of the reproducibility
    # contract - reordering them changes the data produced for a given seed.

    # Days overdue: from 30 to 120
    days_overdue = rng.randint(30, 121, size=num_samples)

    # Outstanding balance: e.g., 1,000 to 50,000
    # (generated as a feature only; it does not enter the recovery probability)
    outstanding_balance = rng.randint(1000, 50001, size=num_samples)

    # Monthly income: from 2,000 to 40,000
    monthly_income = rng.randint(2000, 40001, size=num_samples)

    # Collection attempts so far: from 0 to 5
    collection_attempts = rng.randint(0, 6, size=num_samples)

    # Credit score: from 300 to 850
    credit_score = rng.randint(300, 851, size=num_samples)

    # Simulate likelihood of recovery next month
    # We'll combine multiple factors to produce a "base probability" of successful collection.
    base_prob = (
        # Higher credit score => higher chance of recovery
        (credit_score - 300) / 550 * 0.4 +
        # Higher income => better chance of paying
        (monthly_income / 40000) * 0.3 -
        # More days overdue => lower chance
        (days_overdue / 120) * 0.3 -
        # More collection attempts => possibly lower chance (debtor is resisting)
        (collection_attempts * 0.05)
    )

    # Clip the base probability to [0, 1]; the penalties frequently push the
    # raw score below zero, which is why the positive class is a minority.
    recovery_probability = np.clip(base_prob, 0, 1)

    # Sample the target using this probability (Bernoulli draw per customer)
    recovered_next_month = (rng.rand(num_samples) < recovery_probability).astype(int)

    data = pd.DataFrame({
        'DaysOverdue': days_overdue,
        'OutstandingBalance': outstanding_balance,
        'MonthlyIncome': monthly_income,
        'CollectionAttempts': collection_attempts,
        'CreditScore': credit_score,
        'RecoveredNextMonth': recovered_next_month
    })

    return data

# Fix the global NumPy seed so the synthetic dataset is identical on every
# "Restart & Run All" (the generator draws from NumPy's global random state).
SEED = 42
np.random.seed(SEED)

# Generate dataset
df = generate_synthetic_collections_data(10000)

# Split into train/test. The target is imbalanced, so stratify on it to keep
# the same share of recovered / not-recovered customers in both splits.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df['RecoveredNextMonth'],
)

print("Train set size:", train_df.shape)
print("Test set size:", test_df.shape)
print(train_df.head())

Train set size: (8000, 6)
Test set size: (2000, 6)
      DaysOverdue  OutstandingBalance  MonthlyIncome  CollectionAttempts  \
9254           64               38177          14621                   0   
1561          120                7737          37221                   2   
1670           97               33280          28591                   3   
6087           95               15591          23985                   3   
6669           50                8637          22446                   0   

      CreditScore  RecoveredNextMonth  
9254          330                   0  
1561          423                   0  
1670          364                   0  
6087          591                   0  
6669          815                   0  


In [None]:
# Name of the binary label column and the predictor columns fed to the model
target = 'RecoveredNextMonth'
features = [
    'DaysOverdue',
    'OutstandingBalance',
    'MonthlyIncome',
    'CollectionAttempts',
    'CreditScore',
]

# Separate predictors (X) from the label (y) for each split
X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics influence training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input with an explicit Input object: passing `input_shape`
    # to the first Dense layer makes Keras emit a UserWarning.
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # output layer for binary classification
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,  # use 20% of training set for validation
    epochs=20,
    batch_size=64,
    verbose=1
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.8505 - loss: 0.4708 - val_accuracy: 0.8944 - val_loss: 0.2792
Epoch 2/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9002 - loss: 0.2602 - val_accuracy: 0.8950 - val_loss: 0.2636
Epoch 3/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9056 - loss: 0.2484 - val_accuracy: 0.8944 - val_loss: 0.2578
Epoch 4/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9068 - loss: 0.2440 - val_accuracy: 0.8944 - val_loss: 0.2589
Epoch 5/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9065 - loss: 0.2442 - val_accuracy: 0.8944 - val_loss: 0.2578
Epoch 6/20
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9024 - loss: 0.2462 - val_accuracy: 0.8944 - val_loss: 0.2554
Epoch 7/20
[1m100/100[0m

In [None]:

# Score the held-out test split; verbose=0 suppresses the progress bar
loss, accuracy = model.evaluate(x=X_test_scaled, y=y_test, verbose=0)
print(f"Test Loss: {loss:.4f}", f"Test Accuracy: {accuracy:.4f}", sep="\n")

Test Loss: 0.2557
Test Accuracy: 0.9045


In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# Predicted probability of recovery for every test row, then hard labels
# obtained with a 0.5 cut-off.
# NOTE(review): with a rare positive class a 0.5 cut-off can yield no positive
# predictions at all; rely on the threshold-free AUC below and consider tuning
# the cut-off before reading too much into precision/recall.
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba > 0.5).astype(int)

# AUC is computed from the raw probabilities, so it does not depend on the cut-off
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score:.4f}")

print("Classification Report:")
# zero_division=0 reports 0.0 for precision/recall of a class that was never
# predicted instead of emitting an undefined-metric warning for each average.
print(classification_report(y_test, y_pred, zero_division=0))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
AUC Score: 0.8125
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1809
           1       0.00      0.00      0.00       191

    accuracy                           0.90      2000
   macro avg       0.45      0.50      0.47      2000
weighted avg       0.82      0.90      0.86      2000

Confusion Matrix:
[[1809    0]
 [ 191    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Score every test-set customer with the model's next-month recovery probability
predicted_probabilities = model.predict(X_test_scaled).flatten()

# Attach the scores to a copy of the test rows (test_df itself is not modified)
results_df = test_df.assign(PredictedProbability=predicted_probabilities)
results_df.head(10)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


Unnamed: 0,DaysOverdue,OutstandingBalance,MonthlyIncome,CollectionAttempts,CreditScore,RecoveredNextMonth,PredictedProbability
6252,89,42737,6985,5,428,0,1.03751e-07
4684,47,2087,16291,2,696,0,0.2418184
1731,90,15250,4516,4,753,0,0.001739747
4742,73,36174,37201,2,333,0,0.01392278
4521,69,36417,25594,0,758,0,0.3064003
6340,70,42479,29471,0,767,0,0.3053914
576,95,14491,36044,4,348,0,0.0001563758
5202,55,26435,33584,1,329,0,0.08063285
6363,52,6002,2347,5,595,0,0.0001140268
439,60,25395,9068,3,357,0,1.385832e-05


In [None]:
def assign_action(prob, high_threshold=0.70, medium_threshold=0.40):
    """
    Map a predicted recovery probability to a collections action plan.

    Parameters
    ----------
    prob : float
        Predicted probability that the debt is recovered next month.
    high_threshold : float, default 0.70
        Probabilities at or above this value get the light-touch plan.
    medium_threshold : float, default 0.40
        Probabilities at or above this value (but below ``high_threshold``)
        get the medium plan; everything else gets the intensive plan.

    Returns
    -------
    str
        Human-readable description of the recommended action tier.

    Raises
    ------
    ValueError
        If ``medium_threshold`` is greater than ``high_threshold``, which
        would make the medium tier unreachable.
    """
    if medium_threshold > high_threshold:
        raise ValueError(
            "medium_threshold must not exceed high_threshold "
            f"(got {medium_threshold} > {high_threshold})"
        )

    # Both bounds are inclusive: a probability exactly on a threshold is
    # placed in the higher tier.
    if prob >= high_threshold:
        return "High Probability -> Simple SMS or 1 phone call"
    elif prob >= medium_threshold:
        return "Medium Probability -> Phone call + possibly partial restructuring"
    else:
        return "Low Probability -> Intensive approach or specialized follow-up"

# Label each scored customer with the action tier for its predicted probability
results_df['ActionPlan'] = results_df['PredictedProbability'].map(assign_action)
results_df.head(10)

Unnamed: 0,DaysOverdue,OutstandingBalance,MonthlyIncome,CollectionAttempts,CreditScore,RecoveredNextMonth,PredictedProbability,ActionPlan
6252,89,42737,6985,5,428,0,1.03751e-07,Low Probability -> Intensive approach or speci...
4684,47,2087,16291,2,696,0,0.2418184,Low Probability -> Intensive approach or speci...
1731,90,15250,4516,4,753,0,0.001739747,Low Probability -> Intensive approach or speci...
4742,73,36174,37201,2,333,0,0.01392278,Low Probability -> Intensive approach or speci...
4521,69,36417,25594,0,758,0,0.3064003,Low Probability -> Intensive approach or speci...
6340,70,42479,29471,0,767,0,0.3053914,Low Probability -> Intensive approach or speci...
576,95,14491,36044,4,348,0,0.0001563758,Low Probability -> Intensive approach or speci...
5202,55,26435,33584,1,329,0,0.08063285,Low Probability -> Intensive approach or speci...
6363,52,6002,2347,5,595,0,0.0001140268,Low Probability -> Intensive approach or speci...
439,60,25395,9068,3,357,0,1.385832e-05,Low Probability -> Intensive approach or speci...
