In [1]:
# Import necessary libraries
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Step 1: Load and explore the data
# Load your dataset into a DataFrame, e.g., complaints = pd.read_csv('your_data.csv')
complaints = pd.read_csv('shared/complaints_25Nov21.csv')  # Replace 'your_data.csv' with your dataset path
# ...

# Step 2: Set your X and y variables
X = complaints[['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']]
y = complaints['Consumer disputed?']

# Use one-hot encoding to convert categorical features
categorical_features = ['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']
categorical_transformer = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_features)])

X = categorical_transformer.fit_transform(X)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# Step 3: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Step 4: Check and balance class proportions
undersampler = RandomUnderSampler(random_state=123)
X_train, y_train = undersampler.fit_resample(X_train, y_train)

# Step 5: Train the XGBoost Classifier
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)

# Rest of the code remains the same as in the previous answer


In [2]:
# Calculate the proportion of consumers who raised a dispute in the test set
proportion_disputed_test_set = (y_test == 1).mean()

# Print the result
print(f"Proportion of consumers who raised a dispute in the test set: {proportion_disputed_test_set:.6f}")

Proportion of consumers who raised a dispute in the test set: 0.215864


In [3]:
# Calculate the proportion of consumers who raised a dispute in the modified training dataset
proportion_disputed_training_set = (y_train == 1).mean()

# Print the result
print(f"Proportion of consumers who raised a dispute in the modified training dataset: {proportion_disputed_training_set:.6f}")


Proportion of consumers who raised a dispute in the modified training dataset: 0.500000


In [4]:
from sklearn.metrics import recall_score

# Make predictions on the test set using the trained XGBoost model
y_pred = model_xgb.predict(X_test)

# Calculate the recall for the category 'Consumer disputed?' = 'Yes'
recall_disputed_yes = recall_score(y_test, y_pred, pos_label=1)

# Print the recall
print(f"Recall for 'Consumer disputed?' = 'Yes' on the test set: {recall_disputed_yes:.2f}")

Recall for 'Consumer disputed?' = 'Yes' on the test set: 0.63


In [5]:
# Define the cost structure (in dollars)
cost_per_disputed = 600  # Cost for every disputed complaint
cost_per_non_disputed = 100  # Cost for every non-disputed complaint

# Calculate the total cost for the test set without using a model (base-case)
total_base_case_cost = (y_test == 1).sum() * cost_per_disputed + (y_test == 0).sum() * cost_per_non_disputed

# Print the total cost
print(f"Total Cost without Model (Base-Case): ${total_base_case_cost:,.2f}")

Total Cost without Model (Base-Case): $8,619,200.00


In [6]:
import numpy as np

# Calculate the total cost based on model results using the default threshold (0.5)
true_positives_default = np.sum((y_pred == 1) & (y_test == 1))
false_negatives_default = np.sum((y_pred == 0) & (y_test == 1))

# The banks decide to spend $90 on complaints predicted to be disputed
extra_diligence_cost = true_positives_default * 90

total_cost_with_model = (false_negatives_default * 100) + extra_diligence_cost

print(f"Total cost to the banks based on model results: ${total_cost_with_model}")

Total cost to the banks based on model results: $838340


In [7]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

# Assuming you have trained your model (model_xgb) and have the true labels in y_test
# ...

# Calculate the probability scores for the test set
y_prob = model_xgb.predict_proba(X_test)[:, 1]

# Define cost parameters
disputed_cost = 600  # Cost for a disputed complaint
non_disputed_cost = 100  # Cost for a non-disputed complaint
extra_diligence_cost = 90  # Cost for doing extra diligence

best_threshold = 0.5
best_cost = float("inf")

# Generate a range of threshold values to test
thresholds = np.arange(0.1, 1.0, 0.05)

for threshold in thresholds:
    y_pred_thresholded = (y_prob >= threshold).astype(int)
    
    # Calculate the total cost for this threshold
    true_positives = np.sum((y_pred_thresholded == 1) & (y_test == 1))
    false_negatives = np.sum((y_pred_thresholded == 0) & (y_test == 1))
    false_positives = np.sum((y_pred_thresholded == 1) & (y_test == 0))
    
    total_cost = (true_positives * extra_diligence_cost) + (false_negatives * disputed_cost) + (false_positives * non_disputed_cost)
    
    # Update the best threshold and cost if the current threshold yields a lower cost
    if total_cost < best_cost:
        best_cost = total_cost
        best_threshold = threshold

print(f"Optimal threshold: {best_threshold}")
print(f"Lowest total cost: {best_cost}")

Optimal threshold: 0.40000000000000013
Lowest total cost: 3815120
