In [1]:
# data manipulation
import pandas as pd
import numpy as np
import itertools

# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# nlp
import unicodedata
import re
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk.sentiment
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn import naive_bayes as nb
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,recall_score

# local
import wrangle as w
import model as m

imports loaded successfully, awaiting commands...


# Wrangle

In [2]:
# Load the complaints DataFrame through the project-local `wrangle` module.
# NOTE(review): the cell output suggests a cached parquet file is read when one
# exists — confirm the caching behaviour in wrangle.py.
df = w.wrangle_complaints()

parquet file found and loaded


In [3]:
# Company responses that define the two classes of the modeling target.
relief = ['Closed with monetary relief', 'Closed with non-monetary relief']
no_relief = ['Closed with explanation']

# Drop the two outcomes that are excluded from modeling in a single pass.
# .copy() makes the result an independent frame, so adding a column below does
# not raise SettingWithCopyWarning (the original assigned onto a filtered slice).
df = df[~df['company_response_to_consumer'].isin(["Untimely response", "Closed"])].copy()

# Collapse the company response into the target label.
# NOTE(review): any response that is in neither list is labelled '' — verify
# that no such rows remain before modeling, or they become a third class.
df['response'] = np.where(
    df['company_response_to_consumer'].isin(no_relief),
    'no_relief',
    np.where(df['company_response_to_consumer'].isin(relief), 'relief', ''),
)
df = df.drop(columns='company_response_to_consumer')

In [4]:
# Split the prepared frame into train / validate / test sets.
# 'response' is the target column built in the previous cell
# (presumably used for stratification — confirm in wrangle.split_data).
train, val, test = w.split_data(df,'response')

Prepared DF: (1238536, 7)
Train: (743121, 7)
Validate: (247707, 7)
Test: (247708, 7)


# Modeling

In [6]:
# Build the feature (X_*) and target (y_*) objects for each split via the
# project-local `model` module.
# NOTE(review): what process_data_modeling2 does to the columns is not visible
# in this notebook — confirm in model.py.
X_train, y_train, X_val, y_val, X_test, y_test = m.process_data_modeling2(train, val, test)

### Vectorizing

In [7]:
# Vectorize the three splits with the project-local helper, then display the
# resulting training frame.
# NOTE(review): the frame displayed below has ~148.6k rows while `train` has
# 743,121, so a sub-sample is apparently taken somewhere in the model helpers —
# confirm where and whether it is seeded.
X_train_tfe, X_val_tfe, X_test_tfe = m.process_vector_merge(X_train,X_val,X_test)
X_train_tfe

Unnamed: 0,Older American,"Older American, Servicemember",Servicemember,credit_card,credit_report,debt_collection,loans,money_service,mortgage_x,ability,...,year,year old,yes,yesterday,yet,yet feel,yet feel like,yet receive,yet still,zero
0,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1,0,0,1,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
2,0,0,0,0,1,0,0,0,0,0.0,...,0.110518,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
3,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.23154,0.0,0.0,0.384389,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148620,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
148621,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
148622,0,0,1,0,0,1,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
148623,0,0,0,0,1,0,0,0,0,0.0,...,0.054731,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0


In [16]:
def _accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true label."""
    return float(np.mean(np.asarray(y_true).ravel() == np.asarray(y_pred).ravel()))


def _fit_and_score(estimator, model_name, params, Xtr, ytr, Xv, yv, pos_label='no_relief'):
    """Fit one estimator and return a single row of train/validate metrics.

    Parameters
    ----------
    estimator : unfitted classifier exposing ``fit`` and ``predict``.
    model_name : str stored under the ``'model'`` key.
    params : str describing the hyper-parameters, stored under ``'params'``.
    Xtr, ytr : training features and labels.
    Xv, yv : validation features and labels.
    pos_label : label treated as the positive class for recall.

    Returns
    -------
    dict with keys ``model, params, tr_acc, v_acc, tr_rec, v_rec``.
    """
    estimator.fit(Xtr, ytr)
    # Predict each split exactly once and derive both metrics from the same
    # predictions; calling ``estimator.score`` would run ``predict`` again.
    y_pred_train = estimator.predict(Xtr)
    y_pred_val = estimator.predict(Xv)
    return {
        'model': model_name,
        'params': params,
        'tr_acc': _accuracy(ytr, y_pred_train),
        'v_acc': _accuracy(yv, y_pred_val),
        'tr_rec': recall_score(ytr, y_pred_train, pos_label=pos_label),
        'v_rec': recall_score(yv, y_pred_val, pos_label=pos_label),
    }


def tree_models(Xtr, ytr, Xv, yv, pos_label='no_relief'):
    """Grid-search decision trees and tabulate accuracy/recall per setting.

    The grid covers criterion x max_depth x min_samples_leaf x max_features,
    always with ``random_state=123``.

    Returns a DataFrame with one row per combination and the columns
    ``model, params, tr_acc, v_acc, tr_rec, v_rec``.
    """
    # NOTE: scikit-learn documents 'entropy' and 'log_loss' as the same Shannon
    # information-gain criterion, so those rows are expected to duplicate each
    # other; both are kept so the result table is unchanged.
    grid = itertools.product(
        ['gini', 'entropy', 'log_loss'],  # criterion
        [7, 9],                           # max_depth
        range(7, 16),                     # min_samples_leaf
        ["sqrt", "log2", None],           # max_features
    )
    rows = []
    for criterion, depth, leaf, feats in grid:
        tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth,
                                      min_samples_leaf=leaf, max_features=feats,
                                      random_state=123)
        params = (f"criterion={criterion},max_depth={depth},min_samples_leaf={leaf},"
                  f"max_features={feats},random_state=123")
        rows.append(_fit_and_score(tree, 'Decision Tree', params, Xtr, ytr, Xv, yv, pos_label))
    return pd.DataFrame(rows)


def svc_models(Xtr, ytr, Xv, yv, pos_label='no_relief'):
    """Grid-search LinearSVC over penalty x C and tabulate accuracy/recall.

    Every model uses ``dual=False`` and ``random_state=123``.

    Returns a DataFrame with one row per combination and the columns
    ``model, params, tr_acc, v_acc, tr_rec, v_rec``.
    """
    rows = []
    for penalty, c in itertools.product(['l1', 'l2'], [.01, .1, 1, 10, 100]):
        sv = LinearSVC(penalty=penalty, C=c, dual=False, random_state=123)
        params = f"penalty={penalty},C={c},dual=False,random_state=123"
        rows.append(_fit_and_score(sv, 'SVC', params, Xtr, ytr, Xv, yv, pos_label))
    return pd.DataFrame(rows)


def mlp_models(Xtr, ytr, Xv, yv, pos_label='no_relief'):
    """Grid-search MLPClassifier over activation x early_stopping.

    Every model uses ``solver='adam'`` and ``random_state=123``.

    Returns a DataFrame with one row per combination and the columns
    ``model, params, tr_acc, v_acc, tr_rec, v_rec``.
    """
    rows = []
    for activation, early_stop in itertools.product(['relu', 'identity', 'logistic', 'tanh'],
                                                    [True, False]):
        ml = MLPClassifier(activation=activation, solver='adam', random_state=123,
                           early_stopping=early_stop)
        params = f"activation={activation},solver='adam',random_state=123,early_stopping={early_stop}"
        rows.append(_fit_and_score(ml, 'MLP', params, Xtr, ytr, Xv, yv, pos_label))
    return pd.DataFrame(rows)


def log_models(Xtr, ytr, Xv, yv, pos_label='no_relief'):
    """Grid-search LogisticRegression over C x solver and tabulate metrics.

    Every model uses ``random_state=123``, ``max_iter=500`` and ``n_jobs=-1``.

    Returns a DataFrame with one row per combination and the columns
    ``model, params, tr_acc, v_acc, tr_rec, v_rec``.
    """
    rows = []
    for c, solver in itertools.product([.01, .1, 1, 10, 100, 1000],
                                       ['newton-cholesky', 'sag', 'saga']):
        lr = LogisticRegression(C=c, solver=solver, random_state=123, max_iter=500, n_jobs=-1)
        params = f"C={c},solver={solver},random_state=123,max_iter=500,n_jobs=-1"
        rows.append(_fit_and_score(lr, 'LogReg', params, Xtr, ytr, Xv, yv, pos_label))
    return pd.DataFrame(rows)

In [9]:
# Baseline accuracy: the share of the most frequent class in the training target.
# value_counts() sorts by frequency (descending), so the first entry is the
# majority class. It is selected by position with .iloc, because plain [0] on a
# label-indexed Series relies on a deprecated positional fallback.
baseline = y_train.value_counts(normalize=True).iloc[0]
baseline

0.7931034482758621

In [10]:
# Run the decision-tree grid on the vectorized train/validate features and show the metrics table.
tree_tfe_results = tree_models(Xtr=X_train_tfe, ytr=y_train, Xv=X_val_tfe, yv=y_val)
tree_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=7,...",0.794489,0.792802,0.994935,0.994299
1,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=7,...",0.793285,0.793024,0.999864,0.999567
2,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=7,...",0.795465,0.793125,0.993290,0.991881
3,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=8,...",0.794786,0.793004,0.994367,0.993332
4,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=8,...",0.793305,0.792984,0.999822,0.999516
...,...,...,...,...,...,...
157,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.793588,0.792923,0.997370,0.996971
158,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.796104,0.792903,0.990660,0.989285
159,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.794873,0.793105,0.995156,0.994426
160,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.793588,0.792923,0.997370,0.996971


In [11]:
# Run the LinearSVC grid on the vectorized train/validate features and show the metrics table.
svc_tfe_results = svc_models(Xtr=X_train_tfe, ytr=y_train, Xv=X_val_tfe, yv=y_val)
svc_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,SVC,"penalty=l1,C=0.01,dual=False,random_state=123",0.793056,0.793044,0.999533,0.99944
1,SVC,"penalty=l1,C=0.1,dual=False,random_state=123",0.794691,0.794154,0.99435,0.994045
2,SVC,"penalty=l1,C=1,dual=False,random_state=123",0.795876,0.794498,0.989345,0.988496
3,SVC,"penalty=l1,C=10,dual=False,random_state=123",0.796528,0.793186,0.988081,0.985824
4,SVC,"penalty=l1,C=100,dual=False,random_state=123",0.796582,0.793186,0.987902,0.985595
5,SVC,"penalty=l2,C=0.01,dual=False,random_state=123",0.794375,0.793993,0.997328,0.997099
6,SVC,"penalty=l2,C=0.1,dual=False,random_state=123",0.79527,0.794578,0.992161,0.99155
7,SVC,"penalty=l2,C=1,dual=False,random_state=123",0.796084,0.793852,0.989158,0.987631
8,SVC,"penalty=l2,C=10,dual=False,random_state=123",0.796521,0.793367,0.988191,0.986028
9,SVC,"penalty=l2,C=100,dual=False,random_state=123",0.796582,0.793307,0.988004,0.985697


In [12]:
# Run the MLP grid on the vectorized train/validate features and show the metrics table.
mlp_tfe_results = mlp_models(Xtr=X_train_tfe, ytr=y_train, Xv=X_val_tfe, yv=y_val)
mlp_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,MLP,"activation=relu,solver='adam',random_state=123...",0.800458,0.79587,0.983559,0.981319
1,MLP,"activation=relu,solver='adam',random_state=123...",0.955754,0.736083,0.978817,0.853558
2,MLP,"activation=identity,solver='adam',random_state...",0.795976,0.793347,0.990091,0.987962
3,MLP,"activation=identity,solver='adam',random_state...",0.796259,0.792964,0.981981,0.97997
4,MLP,"activation=logistic,solver='adam',random_state...",0.795088,0.79478,0.988496,0.988471
5,MLP,"activation=logistic,solver='adam',random_state...",0.796212,0.794356,0.987767,0.986409
6,MLP,"activation=tanh,solver='adam',random_state=123...",0.795156,0.793872,0.994867,0.99407
7,MLP,"activation=tanh,solver='adam',random_state=123...",0.873514,0.737072,0.946214,0.853125


In [17]:
# Run the logistic-regression grid on the vectorized train/validate features and show the metrics table.
log_tfe_results = log_models(Xtr=X_train_tfe, ytr=y_train, Xv=X_val_tfe, yv=y_val)
log_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,LogReg,"C=0.01,solver=newton-cholesky,random_state=123,max_iter=500,n_jobs=-1",0.793103,0.793105,1.0,1.0
1,LogReg,"C=0.01,solver=sag,random_state=123,max_iter=500,n_jobs=-1",0.793103,0.793105,1.0,1.0
2,LogReg,"C=0.01,solver=saga,random_state=123,max_iter=500,n_jobs=-1",0.793103,0.793105,1.0,1.0
3,LogReg,"C=0.1,solver=newton-cholesky,random_state=123,max_iter=500,n_jobs=-1",0.794563,0.794114,0.995334,0.995063
4,LogReg,"C=0.1,solver=sag,random_state=123,max_iter=500,n_jobs=-1",0.79457,0.794114,0.995334,0.995063
5,LogReg,"C=0.1,solver=saga,random_state=123,max_iter=500,n_jobs=-1",0.794563,0.794114,0.995334,0.995063
6,LogReg,"C=1,solver=newton-cholesky,random_state=123,max_iter=500,n_jobs=-1",0.795795,0.794154,0.986596,0.985697
7,LogReg,"C=1,solver=sag,random_state=123,max_iter=500,n_jobs=-1",0.795795,0.794154,0.986596,0.985697
8,LogReg,"C=1,solver=saga,random_state=123,max_iter=500,n_jobs=-1",0.795808,0.794154,0.986613,0.985697
9,LogReg,"C=10,solver=newton-cholesky,random_state=123,max_iter=500,n_jobs=-1",0.796602,0.792661,0.981955,0.978927


In [18]:
# Stack the per-model metric tables into one frame (original row indices are kept).
results = pd.concat((tree_tfe_results, svc_tfe_results, mlp_tfe_results, log_tfe_results))

In [19]:
# Keep only the runs whose validation accuracy beats the baseline, best first.
new_results = (
    results.loc[results['v_acc'] > baseline]
    .sort_values(by='v_acc', ascending=False)
    .reset_index(drop=True)
)
new_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,MLP,"activation=relu,solver='adam',random_state=123,early_stopping=True",0.800458,0.795870,0.983559,0.981319
1,MLP,"activation=logistic,solver='adam',random_state=123,early_stopping=True",0.795088,0.794780,0.988496,0.988471
2,SVC,"penalty=l2,C=0.1,dual=False,random_state=123",0.795270,0.794578,0.992161,0.991550
3,SVC,"penalty=l1,C=1,dual=False,random_state=123",0.795876,0.794498,0.989345,0.988496
4,MLP,"activation=logistic,solver='adam',random_state=123,early_stopping=False",0.796212,0.794356,0.987767,0.986409
...,...,...,...,...,...,...
98,LogReg,"C=0.01,solver=saga,random_state=123,max_iter=500,n_jobs=-1",0.793103,0.793105,1.000000,1.000000
99,Decision Tree,"criterion=entropy,max_depth=7,min_samples_leaf=12,max_features=None,random_state=123",0.794685,0.793105,0.992365,0.991525
100,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=11,max_features=log2,random_state=123",0.793292,0.793105,0.999822,0.999618
101,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=10,max_features=log2,random_state=123",0.793285,0.793105,0.999881,0.999669


In [20]:
# Widen string columns so the full hyper-parameter description is readable,
# then show the 25 best-scoring runs.
pd.options.display.max_colwidth = 250
new_results.head(n=25)

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,MLP,"activation=relu,solver='adam',random_state=123,early_stopping=True",0.800458,0.79587,0.983559,0.981319
1,MLP,"activation=logistic,solver='adam',random_state=123,early_stopping=True",0.795088,0.79478,0.988496,0.988471
2,SVC,"penalty=l2,C=0.1,dual=False,random_state=123",0.79527,0.794578,0.992161,0.99155
3,SVC,"penalty=l1,C=1,dual=False,random_state=123",0.795876,0.794498,0.989345,0.988496
4,MLP,"activation=logistic,solver='adam',random_state=123,early_stopping=False",0.796212,0.794356,0.987767,0.986409
5,Decision Tree,"criterion=gini,max_depth=9,min_samples_leaf=11,max_features=sqrt,random_state=123",0.795472,0.794154,0.994078,0.993866
6,SVC,"penalty=l1,C=0.1,dual=False,random_state=123",0.794691,0.794154,0.99435,0.994045
7,LogReg,"C=1,solver=saga,random_state=123,max_iter=500,n_jobs=-1",0.795808,0.794154,0.986613,0.985697
8,LogReg,"C=1,solver=newton-cholesky,random_state=123,max_iter=500,n_jobs=-1",0.795795,0.794154,0.986596,0.985697
9,LogReg,"C=1,solver=sag,random_state=123,max_iter=500,n_jobs=-1",0.795795,0.794154,0.986596,0.985697
