In [1]:
# data manipulation
import pandas as pd
import numpy as np
import itertools

# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# nlp
from sklearn.feature_extraction.text import TfidfVectorizer

# modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import recall_score

# local
import wrangle as w
import model as m

imports loaded successfully, awaiting commands...


# Wrangle

In [2]:
# Load the complaints DataFrame through the project-local `wrangle` module.
df = w.wrangle_complaints()

parquet file found and loaded


In [3]:
# Collapse the company's response into a two-class target column, `response`.
relief = ['Closed with monetary relief', 'Closed with non-monetary relief']
no_relief = ['Closed with explanation']
# Outcomes excluded from the study. Filter once and take a copy so the column
# assignment below writes to an independent frame rather than to a slice of
# the original (avoids pandas' SettingWithCopyWarning).
excluded = ['Untimely response', 'Closed']
df = df[~df['company_response_to_consumer'].isin(excluded)].copy()
# Label every remaining row; any outcome not listed above keeps the empty
# string, exactly as before.
df['response'] = np.select(
    [
        df['company_response_to_consumer'].isin(relief).to_numpy(),
        df['company_response_to_consumer'].isin(no_relief).to_numpy(),
    ],
    ['relief', 'no_relief'],
    default='',
)
df = df.drop(columns='company_response_to_consumer')

In [4]:
# Split into train / validate / test frames. 'response' is passed as the second
# argument (presumably the stratification target; confirm in wrangle.split_data).
train, val, test = w.split_data(df,'response')

Prepared DF: (1238536, 7)
Train: (743121, 7)
Validate: (247707, 7)
Test: (247708, 7)


# Modeling

In [5]:
def encode(df):
    '''
    One-hot encode the `tags` and `product_bins` columns.

    The first level of each column is dropped, and the indicator columns are
    named after the bare category value (no prefix, no separator).

    :param df: DataFrame containing the `tags` and `product_bins` columns
    :return: a new DataFrame with every original column followed by the
    indicator columns
    '''
    categorical = ['tags', 'product_bins']
    indicators = pd.get_dummies(
        df[categorical],
        prefix='',
        prefix_sep='',
        drop_first=True,
    )
    # original columns first, indicators appended to the right
    frames = [df, indicators]
    return pd.concat(frames, axis=1)

def _stratified_downsample(frame, categories, fraction, seed):
    """Sample `fraction` of the rows of each listed response category and stack them with a fresh index."""
    samples = []
    for category in categories:
        subset = frame[frame.response == category]
        n_rows = int(round(len(subset) * fraction, 0))
        samples.append(subset.sample(n_rows, random_state=seed))
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and copied the accumulated frame on every call.
    return pd.concat(samples).reset_index(drop=True)


def process_data_modeling(train, validate, test):
    """
    Down-sample each split per response class, one-hot encode it, and separate
    the features from the target.

    For every split, 20% of the rows of each response category ('relief' and
    'no_relief') are drawn with a fixed seed; rows whose `response` is anything
    else are left out. Each sampled frame is then passed through `encode`, and
    the raw columns that are not model inputs are dropped.

    :param train: training DataFrame; must contain `response`, `tags`,
    `product_bins`, `date_received`, `state` and `company_name`
    :param validate: validation DataFrame with the same columns
    :param test: test DataFrame with the same columns
    :return: six variables: X_train, y_train, X_val, y_val, X_test, and y_test.
    """
    random_state = 123
    percent = .2
    response_categories = [
        'relief',
        'no_relief'
    ]
    # raw columns removed once the indicator columns have been added
    non_features = ['date_received','state','company_name','tags','product_bins', 'response']

    small_train = _stratified_downsample(train, response_categories, percent, random_state)
    small_val = _stratified_downsample(validate, response_categories, percent, random_state)
    small_test = _stratified_downsample(test, response_categories, percent, random_state)

    X_train = encode(small_train).drop(columns=non_features)
    y_train = small_train['response']
    X_val = encode(small_val).drop(columns=non_features)
    y_val = small_val['response']
    X_test = encode(small_test).drop(columns=non_features)
    y_test = small_test['response']

    return X_train, y_train, X_val, y_val, X_test, y_test

In [6]:
# Down-sample each split, encode it, and separate features (X_*) from the
# `response` target (y_*).
X_train, y_train, X_val, y_val, X_test, y_test = process_data_modeling(train, val, test)

### vectorizing

In [7]:
def make_mbt_tfidf(Xtr,Xv,Xt,max_f=2900):
    """
    The function `make_mbt_tfidf` takes in three sets of data (train, validation, and test) and applies
    TF-IDF vectorization to the `lemon` text column, using n-grams up to trigrams and keeping single
    characters. It then returns the transformed data as pandas DataFrames.

    The vectorizer is fitted on the training text only; validation and test are transformed with that
    fitted vocabulary.

    :param Xtr: training DataFrame; its `lemon` column holds the text to vectorize
    :param Xv: validation DataFrame with a `lemon` column
    :param Xt: test DataFrame with a `lemon` column
    :param max_f: value passed to the vectorizer's `max_features`
    :return: three dataframes: Xtr_tfidf, Xv_tfidf, and Xt_tfidf, each indexed like its input and with
    one column per vocabulary term.
    """
    #make my bag of words up to trigrams tfidf and keep single characters
    tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b',lowercase=False, max_features=max_f, ngram_range=(1,3))
    # fit and transform train
    Xtr_bow_tfidf = tfidf.fit_transform(Xtr.lemon.astype(str))
    # transform val and test
    Xv_bow_tfidf = tfidf.transform(Xv.lemon.astype(str))
    Xt_bow_tfidf = tfidf.transform(Xt.lemon.astype(str))
    # the vocabulary is fixed after fitting, so look the names up once
    terms = tfidf.get_feature_names_out()
    # make dfs; toarray() gives a plain ndarray where todense() would give np.matrix
    Xtr_tfidf = pd.DataFrame(Xtr_bow_tfidf.toarray(),columns=terms,index=Xtr.index)
    Xv_tfidf = pd.DataFrame(Xv_bow_tfidf.toarray(),columns=terms,index=Xv.index)
    Xt_tfidf = pd.DataFrame(Xt_bow_tfidf.toarray(),columns=terms,index=Xt.index)
    return Xtr_tfidf,Xv_tfidf,Xt_tfidf

def process_vector_merge(X_train,X_val,X_test,max_f=2900):
    """
    Vectorize the `lemon` text column of each split with TF-IDF and join the
    result onto that split's remaining (encoded) columns.

    :param X_train: training features, including the `lemon` text column
    :param X_val: validation features, including the `lemon` text column
    :param X_test: test features, including the `lemon` text column
    :param max_f: maximum number of TF-IDF features, forwarded to `make_mbt_tfidf`
    :return: X_train_tfe, X_val_tfe, X_test_tfe: the non-text columns followed
    by the TF-IDF columns, aligned on the row index
    """
    X_train_tf, X_val_tf, X_test_tf = make_mbt_tfidf(X_train[['lemon']], X_val[['lemon']], X_test[['lemon']],max_f)

    # Drop the raw text by name rather than by position, so the result does
    # not depend on `lemon` being the first column.
    encoded_train = X_train.drop(columns=['lemon'])
    encoded_val = X_val.drop(columns=['lemon'])
    encoded_test = X_test.drop(columns=['lemon'])

    # Index-aligned join. When an encoded column and a TF-IDF term share a
    # name, pandas appends `_x` / `_y` suffixes to tell them apart.
    X_train_tfe = encoded_train.merge(X_train_tf, left_index=True, right_index=True)
    X_val_tfe = encoded_val.merge(X_val_tf, left_index=True, right_index=True)
    X_test_tfe = encoded_test.merge(X_test_tf, left_index=True, right_index=True)

    return X_train_tfe, X_val_tfe, X_test_tfe

In [8]:
# Vectorize the text with the default max_f (2900) and merge it with the
# encoded columns; the bare expression on the last line displays the train matrix.
X_train_tfe, X_val_tfe, X_test_tfe = process_vector_merge(X_train,X_val,X_test)
X_train_tfe

Unnamed: 0,Older American,"Older American, Servicemember",Servicemember,credit_card,credit_report,debt_collection,loans,money_service,mortgage_x,ability,...,year,year old,yes,yesterday,yet,yet feel,yet feel like,yet receive,yet still,zero
0,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1,0,0,1,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
2,0,0,0,0,1,0,0,0,0,0.0,...,0.110518,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
3,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.23154,0.0,0.0,0.384389,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148620,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
148621,0,0,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
148622,0,0,1,0,0,1,0,0,0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
148623,0,0,0,0,1,0,0,0,0,0.0,...,0.054731,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0


In [9]:
def tree_multigram_model(Xtr,ytr,Xv,yv):
    """
    Fit a decision tree on the training features and print train/validation
    accuracy and recall.

    :param Xtr: training feature matrix
    :param ytr: training labels; recall treats 'no_relief' as the positive class
    :param Xv: validation feature matrix
    :param yv: validation labels
    :return: None; the four metrics are printed as percentages with two decimals
    """
    # decision tree
    tree = DecisionTreeClassifier(criterion='log_loss',max_depth=7,min_samples_leaf=15,max_features=None,random_state=123)
    tree.fit(Xtr,ytr)
    # predictions
    y_pred_train = tree.predict(Xtr)
    y_pred_val = tree.predict(Xv)
    # recall score
    tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
    v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
    # accuracies, taken from the predictions above (same value as
    # tree.score(X, y) without predicting a second time)
    ytr_acc = np.mean(y_pred_train == np.asarray(ytr))
    yv_acc = np.mean(y_pred_val == np.asarray(yv))
    # print results; the percent format avoids float artifacts such as
    # 99.35000000000001 that round(x, 4) * 100 can produce
    print('Decision Tree Unigrams, Bigrams, Trigrams')
    print(f'Train Accuracy:      {ytr_acc:.2%}')
    print(f'Validation Accuracy: {yv_acc:.2%}')
    print(f'Train Recall:        {tr_rec:.2%}')
    print(f'Validation Recall:   {v_rec:.2%}')

def svc_multigram_model(Xtr,ytr,Xv,yv):
    """
    Fit a linear support vector classifier on the training features and print
    train/validation accuracy and recall.

    :param Xtr: training feature matrix
    :param ytr: training labels; recall treats 'no_relief' as the positive class
    :param Xv: validation feature matrix
    :param yv: validation labels
    :return: None; the four metrics are printed as percentages with two decimals
    """
    # Linear svc
    sv = LinearSVC(penalty='l2',C=1,dual=False,random_state=123,max_iter=500)
    sv.fit(Xtr,ytr)
    # predictions
    y_pred_train = sv.predict(Xtr)
    y_pred_val = sv.predict(Xv)
    # recall score
    tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
    v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
    # accuracies, taken from the predictions above (same value as
    # sv.score(X, y) without predicting a second time)
    ytr_acc = np.mean(y_pred_train == np.asarray(ytr))
    yv_acc = np.mean(y_pred_val == np.asarray(yv))
    # print results; the percent format avoids float artifacts such as
    # 99.35000000000001 that round(x, 4) * 100 can produce
    print('Linear SVC Unigrams, Bigrams, Trigrams')
    print(f'Train Accuracy:      {ytr_acc:.2%}')
    print(f'Validation Accuracy: {yv_acc:.2%}')
    print(f'Train Recall:        {tr_rec:.2%}')
    print(f'Validation Recall:   {v_rec:.2%}')

def mlp_multigram_model(Xtr,ytr,Xv,yv):
    """
    Fit a multi-layer perceptron on the training features and print
    train/validation accuracy and recall.

    :param Xtr: training feature matrix
    :param ytr: training labels; recall treats 'no_relief' as the positive class
    :param Xv: validation feature matrix
    :param yv: validation labels
    :return: None; the four metrics are printed as percentages with two decimals
    """
    # mlp
    ml = MLPClassifier(activation='relu',solver='adam',random_state=123,early_stopping=True)
    ml.fit(Xtr,ytr)
    # predictions
    y_pred_train = ml.predict(Xtr)
    y_pred_val = ml.predict(Xv)
    # recall score
    tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
    v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
    # accuracies, taken from the predictions above (same value as
    # ml.score(X, y) without predicting a second time)
    ytr_acc = np.mean(y_pred_train == np.asarray(ytr))
    yv_acc = np.mean(y_pred_val == np.asarray(yv))
    # print results; the percent format avoids float artifacts such as
    # 99.35000000000001 that round(x, 4) * 100 can produce
    print('Multi-Layer Perceptron Unigrams, Bigrams, Trigrams')
    print(f'Train Accuracy:      {ytr_acc:.2%}')
    print(f'Validation Accuracy: {yv_acc:.2%}')
    print(f'Train Recall:        {tr_rec:.2%}')
    print(f'Validation Recall:   {v_rec:.2%}')


In [10]:
# Baseline accuracy: the share of the most frequent class in the training
# target. value_counts sorts descending, so the first entry is the majority
# class; .iloc makes the positional lookup explicit, because `[0]` on a
# label-indexed Series is deprecated positional indexing in pandas 2.x.
baseline = y_train.value_counts(normalize=True).iloc[0]
baseline

0.7931034482758621

In [11]:
# tree_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)

In [12]:
# svc_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)

In [13]:
# mlp_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)

In [14]:
# train_acc	    val_acc	        train_recall    val_recall

# Tree	criterion='log_loss',max_depth=7,min_samples_leaf=15,max_features=None,random_state=123	
# 0.7989100084	0.7974445925	0.9954273595	0.9944008959

# SVC	penalty='l2',C=1,dual=False,random_state=123,max_iter=500
# 0.8007939445	0.7976464414	0.98326193	    0.9813702535

# MLP	activation='relu',solver='adam',random_state=123,early_stopping=True
# 0.810079058	0.8000484437	0.9726150583	0.966990736

In [15]:
# Rebuild the merged matrices with a larger TF-IDF vocabulary (max_f=10000).
# This rebinds the notebook-level X_train_tfe / X_val_tfe / X_test_tfe.
X_train_tfe, X_val_tfe, X_test_tfe = process_vector_merge(X_train,X_val,X_test,10000)

In [16]:
# Fit and report the decision tree on whatever X_train_tfe / X_val_tfe currently hold.
tree_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)

Decision Tree Unigrams, Bigrams, Trigrams
Train Accuracy:      79.47%
Validation Accuracy: 79.29%
Train Recall:        99.29%
Validation Recall:   99.18%


In [19]:
# Sweep the TF-IDF vocabulary size from 1000 to 2900 in steps of 100 and refit
# all three models at each size. NOTE: every iteration rebinds the
# notebook-level X_train_tfe / X_val_tfe / X_test_tfe, so after the loop they
# hold the matrices for the last size tried.
for i in np.arange(1000,3000,100):
    print('---')
    X_train_tfe, X_val_tfe, X_test_tfe = process_vector_merge(X_train,X_val,X_test,i)
    print(f'max_features={i}')
    tree_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)
    svc_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)
    mlp_multigram_model(X_train_tfe,y_train,X_val_tfe,y_val)


---
max_features=1000
Decision Tree Unigrams, Bigrams, Trigrams
Train Accuracy:      79.47%
Validation Accuracy: 79.25%
Train Recall:        99.21%
Validation Recall:   99.09%
Linear SVC Unigrams, Bigrams, Trigrams
Train Accuracy:      79.43%
Validation Accuracy: 79.38%
Train Recall:        99.35000000000001%
Validation Recall:   99.31%
Multi-Layer Perceptron Unigrams, Bigrams, Trigrams
Train Accuracy:      79.91%
Validation Accuracy: 79.44%
Train Recall:        97.65%
Validation Recall:   97.38%
---
max_features=1100
Decision Tree Unigrams, Bigrams, Trigrams
Train Accuracy:      79.47%
Validation Accuracy: 79.24%
Train Recall:        99.22%
Validation Recall:   99.08%
Linear SVC Unigrams, Bigrams, Trigrams
Train Accuracy:      79.42%
Validation Accuracy: 79.38%
Train Recall:        99.31%
Validation Recall:   99.29%
Multi-Layer Perceptron Unigrams, Bigrams, Trigrams
Train Accuracy:      79.5%
Validation Accuracy: 79.47%
Train Recall:        99.44%
Validation Recall:   99.42%
---
max_f