# Consumer Financial Protection Bureau (CFPB) Consumer Complaints Modeling

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import preprocessing

import numpy as np

In [11]:
df = pd.read_csv('shared/complaints_25Nov21.csv')
df

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2016-10-26,Money transfers,International money transfer,Other transaction issues,,"To whom it concerns, I would like to file a fo...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",,,,Consent provided,Web,2016-10-29,Closed with explanation,Yes,No,2180490
1,2015-03-27,Bank account or service,Other bank product/service,"Account opening, closing, or management",,My name is XXXX XXXX XXXX and huband name is X...,Company chooses not to provide a public response,"CITIBANK, N.A.",PA,151XX,Older American,Consent provided,Web,2015-03-27,Closed with explanation,Yes,No,1305453
2,2015-04-20,Bank account or service,Other bank product/service,"Making/receiving payments, sending money",,XXXX 2015 : I called to make a payment on XXXX...,Company chooses not to provide a public response,U.S. BANCORP,PA,152XX,,Consent provided,Web,2015-04-22,Closed with monetary relief,Yes,No,1337613
3,2013-04-29,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,,,JPMORGAN CHASE & CO.,VA,22406,Servicemember,,Phone,2013-04-30,Closed with explanation,Yes,Yes,393900
4,2013-05-29,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",GA,30044,,,Referral,2013-05-31,Closed with explanation,Yes,No,418647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207255,2015-05-24,Debt collection,Credit card,Taking/threatening an illegal action,Sued w/o proper notification of suit,,,JPMORGAN CHASE & CO.,FL,33133,,Consent not provided,Web,2015-05-24,Closed with explanation,Yes,No,1390395
207256,2012-01-10,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,,,JPMORGAN CHASE & CO.,NY,10312,,,Referral,2012-01-11,Closed without relief,Yes,Yes,12192
207257,2012-07-17,Student loan,Non-federal student loan,Repaying your loan,,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",NH,032XX,,,Web,2012-07-18,Closed with explanation,Yes,No,118351
207258,2016-09-29,Bank account or service,Checking account,"Account opening, closing, or management",,Near the end of XXXX 2016 I opened a Citigold ...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",CA,900XX,,Consent provided,Web,2016-09-29,Closed with non-monetary relief,Yes,No,2138969


In [12]:
selected_features = ['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via',
                     'Company response to consumer', 'Timely response?']

In [19]:
X = df[selected_features]
y = df['Consumer disputed?']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


In [21]:
from sklearn.compose import ColumnTransformer
categorical_features = X.select_dtypes(include=['object']).columns
transformers = [
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features)
]
preprocessor = ColumnTransformer(transformers)
X = preprocessor.fit_transform(X)




In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 123)

In [24]:
proportion_disputed = np.sum(y_train) / len(y_train)
proportion_disputed

0.21684719675769565

In [25]:
from imblearn.under_sampling import RandomUnderSampler
undersampler = RandomUnderSampler(random_state=123)
X_train, y_train = undersampler.fit_resample(X, y)


In [26]:
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)


In [27]:
y_pred = model_xgb.predict(X_test)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [28]:
report

'              precision    recall  f1-score   support\n\n           0       0.85      0.53      0.66     32504\n           1       0.28      0.65      0.39      8948\n\n    accuracy                           0.56     41452\n   macro avg       0.56      0.59      0.52     41452\nweighted avg       0.73      0.56      0.60     41452\n'

In [29]:
conf_matrix


array([[17353, 15151],
       [ 3104,  5844]])

In [50]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5596111164720641

In [30]:
total_cost = 0
for i in range(len(y_test)):
    if y_test[i] == 1:  
        total_cost += 600
    else:  
        total_cost += 100

In [31]:
total_cost

8619200

In [33]:
proportion_dispute_test = sum(y_test) / len(y_test)
proportion_dispute_test

0.21586413200810575

In [34]:
proportion_dispute_train = sum(y_train) / len(y_train)
proportion_dispute_train

0.5

In [35]:
report = classification_report(y_test, y_pred, target_names=['No', 'Yes'])
recall_disputed_yes = float(report.split()[5])

In [36]:
recall_disputed_yes

0.85

In [47]:
total_cost_with_model = 0

for i in range(len(y_test)):
    if y_pred[i] == 1:  
        total_cost_with_model += 90
    else:
        if y_test[i] == 1:  
            total_cost_with_model += 600
        else:  
            total_cost_with_model += 100



In [48]:
total_cost_with_model

5487250

In [43]:
best_threshold = 0.5
min_total_cost = float('inf')

for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    y_pred_threshold = (y_pred >= threshold).astype(int)
    
    total_cost = 0
    for i in range(len(y_test)):
        if y_pred_threshold[i] == 1:  
            total_cost += 90
        else:
            if y_test[i] == 1:  
                total_cost += 600
            else:  
                total_cost += 100

    if total_cost < min_total_cost:
        min_total_cost = total_cost
        best_threshold = threshold


In [45]:
best_threshold


0.1

In [46]:
min_total_cost

5487250