In [1]:
# Import packages
import pandas as pd
import numpy as np 

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from bs4 import BeautifulSoup
import re

from tqdm import tqdm


In [None]:
# Import dataset
data = pd.read_csv("Consumer_Complaints.csv")

In [None]:
# Drop NA
data = data.rename(columns={'Consumer complaint narrative':'Narrative'})
data = data.dropna(subset=['Narrative'])

In [None]:
### Apply VaderSentiment to each complaint

In [None]:
# Function for cleaning the narratives for Vader sentiment analysis
def vader_input(text):
        
    # remove html content
    text = BeautifulSoup(text).get_text()
     
    # remove all XX words:
    text = re.sub("[XX$]"," ", text)
        
    # remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    return(text)

In [None]:
data['vader_input'] = data['Narrative'].apply(vader_input)

In [None]:
# Apply the sentiment analysis function for each narrative and return sentiment scores from the function

analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    # Only extract the negative score of each complaint
    return pd.Series([score['pos'], score['neg']])

data[['positive_score', 'negative_score']] = data['vader_input'].apply(sentiment_analyzer_scores)

### Process text data

In [None]:
# Function for cleaning the narratives, remove stopwords and stem them, this is stricter than vader_input
def clean_sentences(text):
        
    #remove html content
    text = BeautifulSoup(text).get_text()
        
    #remove non-alphabetic characters
    text = re.sub("[^a-zA-Z]"," ", text)
        
    #remove all XX words:
    text = re.sub("[XX$]"," ", text)
        
    # remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
        
    #remove stop words
    stopword = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in (stopword)])
        
    #stem words
    porter_stemmer = PorterStemmer()
    text = porter_stemmer.stem(text)
            

    return(text)

In [None]:
data['clean_sentences'] = data['Narrative'].apply(clean_sentences)

### Save to new data

In [None]:
data.to_csv('cleaned_narrative.csv', index = False)

### Build the classification model

In [3]:
data = pd.read_csv('cluster_issue.csv')

In [10]:
data.Issue.value_counts().to_dict()

{'Incorrect information on your report': 8672,
 "Credit reporting company's investigation": 3856,
 'Attempts to collect debt not owed': 2543,
 'Account opening, closing, or management': 2292,
 'Improper use of your report': 2006,
 "Cont'd attempts collect debt not owed": 1698,
 'Dealing with your lender or servicer': 1603,
 'Loan servicing, payments, escrow account': 1515,
 'Problem when making payments': 1502,
 'Communication tactics': 1243,
 'Written notification about debt': 1119,
 'Loan modification,collection,foreclosure': 1094,
 'False statements or representation': 977,
 'Struggling to pay your loan': 833,
 'Struggling to pay mortgage': 786,
 'Disclosure verification of debt': 753,
 'Managing the loan or lease': 738,
 'Fees or interest': 723,
 'Took or threatened to take negative or legal action': 697,
 'Problem with a purchase shown on your statement': 598,
 'Unable to get your credit report or credit score': 497,
 'Application, originator, mortgage broker': 401,
 'Other featur

In [11]:
data.cluster_Issue.value_counts().to_dict()

{'Incorrect information on your report': 12528,
 'Attempts to collect debt not owed': 9809,
 'Loan servicing, payments, escrow account': 6176,
 'Account opening, closing, or management': 3495,
 'Dealing with your lender or servicer': 2436,
 'Improper use of your report': 2006,
 'Managing the loan or lease': 1292,
 'Problem with a purchase shown on your statement': 971,
 'Other features, terms, or problems': 804,
 'Fees or interest': 723,
 'Unable to get your credit report or credit score': 497,
 'Fraud or scam': 320,
 'Credit monitoring or identity theft protection services': 310,
 'Problem with fraud alerts or security freezes': 274,
 'Money was not available when promised': 269,
 "Can't contact lender or servicer": 256,
 'Getting a credit card': 236,
 'Other': 180,
 'Identity theft / Fraud / Embezzlement': 178,
 'Customer service / Customer relations': 115,
 'Trouble using your card': 113,
 'Unauthorized transactions/trans. issues': 91,
 'Getting a loan or lease': 85,
 'APR or intere

In [12]:
# Select top10 companies for classification
data['Company'] = data['Company'].replace(data['Company'].value_counts().index[10:], "Other")

In [4]:
# Select top10 states for classification
data['State'] = data['State'].replace(data['State'].value_counts().index[10:], "Other")

In [5]:
# Merge same products
data.loc[((data.Product == 'Credit card') |
          (data.Product == 'Credit card or prepaid card') | 
          (data.Product == 'Prepaid card')),
         'Product'] = 'Credit card or prepaid card'

In [6]:
data.loc[((data.Product == 'Credit reporting') |
          (data.Product == 'Credit reporting, credit repair services, or other personal consumer reports')),
         'Product'] = 'Credit reporting, credit repair services, or other personal consumer reports'

In [7]:
data.loc[((data.Product == 'Money transfer, virtual currency, or money service') |
          (data.Product == 'Money transfers') |
          (data.Product == 'Virtual currency')),
         'Product'] = 'Money transfer, virtual currency, or money service'

In [8]:
data.loc[((data.Product == 'Payday loan') |
          (data.Product == 'Payday loan, title loan, or personal loan')),
         'Product'] = 'Payday loan, title loan, or personal loan'

In [9]:
data.loc[((data.Product == 'Credit card') |
          (data.Product == 'Credit card or prepaid card')),
         'Product'] = 'Credit card or prepaid card'

In [10]:
data.loc[((data.Product == 'Credit card') |
          (data.Product == 'Credit card or prepaid card')),
         'Product'] = 'Credit card or prepaid card'

In [11]:
data.loc[((data.Product == 'Credit card') |
          (data.Product == 'Credit card or prepaid card')),
         'Product'] = 'Credit card or prepaid card'

In [12]:
# One hot encoding
df = data[['Product', 'State', 'positive_score', 'negative_score', 'clean_sentences', 'Issue', 'cluster_Issue']]
lst = ['Product', 'State']
df = pd.get_dummies(df, columns=lst, drop_first=True)
df.columns

Index(['positive_score', 'negative_score', 'clean_sentences', 'Issue',
       'cluster_Issue', 'Product_Checking or savings account',
       'Product_Consumer Loan', 'Product_Credit card or prepaid card',
       'Product_Credit reporting, credit repair services, or other personal consumer reports',
       'Product_Debt collection',
       'Product_Money transfer, virtual currency, or money service',
       'Product_Mortgage', 'Product_Other financial service',
       'Product_Payday loan, title loan, or personal loan',
       'Product_Student loan', 'Product_Vehicle loan or lease', 'State_FL',
       'State_GA', 'State_IL', 'State_NC', 'State_NJ', 'State_NY', 'State_OH',
       'State_Other', 'State_PA', 'State_TX'],
      dtype='object')

In [13]:
# Build the pipeline for TFIDF and LinearSVC model
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

X_train, X_test, y_train, y_test = train_test_split(df.drop(['cluster_Issue', 'Issue'],axis=1), 
                                                    df['cluster_Issue'], random_state = 0, train_size = 0.7)

get_numeric_data = FunctionTransformer(lambda x: x.drop(['clean_sentences'],axis=1), validate=False)
get_text_data = FunctionTransformer(lambda x: x['clean_sentences'], validate=False)

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data)
            ])),
             ('word_features', Pipeline([ 
                ('selector', get_text_data),
                ('tfidf', TfidfVectorizer(ngram_range=(1,2), norm='l2')),
            ])),
         ])),
           ('lsvc', LinearSVC()) 
     ])


# GridSearch the best parameter set, using precision for optimization
# Precision is the number of correct results divided by the number of all returned results
#y_train = y_train.factorize()[0]
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('numeric_features',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  FunctionTransformer(accept_sparse=False,
                                                                                      check_inverse=True,
                                                                                      func=<function <lambda> at 0x1291f4170>,
                                                                                      inv_kw_args=None,
                                                                                      inverse_func=None,
                                                                                      kw_args=None,
                                       

In [14]:
# Predict on test set and evaluate predictions

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score, accuracy_score)
from sklearn.metrics import precision_recall_fscore_support as score
y_pred = pipeline.predict(X_test)

print('precision: {}'.format(precision_score(y_test, y_pred, average='weighted')))
print('recall: {}'.format(recall_score(y_test, y_pred, average='weighted')))
print('fscore: {}'.format(f1_score(y_test, y_pred, average='weighted')))

precision: 0.8520371452489824
recall: 0.8678651685393258
fscore: 0.852764442562402


In [15]:
import dill
with open("pipeline.pkl", 'wb') as pickle_file:
    dill.dump(pipeline, pickle_file)

In [16]:
# Simulate user input in dash
product = 1
state = 22

In [17]:
index_dict = {'Product_Checking or savings account': 1,
 'Product_Consumer Loan': 2,
 'Product_Credit card or prepaid card': 3,
 'Product_Credit reporting, credit repair services, or other personal consumer reports': 4,
 'Product_Debt collection': 5,
 'Product_Money transfer, virtual currency, or money service': 6,
 'Product_Mortgage': 7,
 'Product_Other financial service': 8,
 'Product_Payday loan, title loan, or personal loan': 9,
 'Product_Student loan': 10,
 'Product_Vehicle loan or lease': 11,
 'State_FL': 12,
 'State_GA': 13,
 'State_IL': 14,
 'State_NC': 15,
 'State_NJ': 16,
 'State_NY': 17,
 'State_OH': 18,
 'State_PA': 19,
 'State_TX': 20,
 'State_Other': 21}

In [19]:
def dummy(index_dict, pro, stat): 
    for key, value in index_dict.items():
        if pro == value:
            index_dict[key] = 100
        if stat == value:
            index_dict[key] = 100
    for key, value in index_dict.items():
        if value < 100:
            index_dict[key] = 0
        if value == 100:
            index_dict[key] = 1    
    return index_dict

attribute_index = dummy(index_dict=index_dict, pro=product, stat=state)

In [20]:
attribute_index['positive_score'] = 0.1
attribute_index['negative_score'] = 0.7
attribute_index['clean_sentences'] = 'I hate you'
input_data = pd.DataFrame(attribute_index, index=[0])
input_data

Unnamed: 0,Product_Checking or savings account,Product_Consumer Loan,Product_Credit card or prepaid card,"Product_Credit reporting, credit repair services, or other personal consumer reports",Product_Debt collection,"Product_Money transfer, virtual currency, or money service",Product_Mortgage,Product_Other financial service,"Product_Payday loan, title loan, or personal loan",Product_Student loan,...,State_NC,State_NJ,State_NY,State_OH,State_PA,State_TX,State_Other,positive_score,negative_score,clean_sentences
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.1,0.7,I hate you


In [21]:
with open('pipeline.pkl', 'rb') as file:
    model = dill.load(file)

In [22]:
model.predict(input_data)[0]

'Account opening, closing, or management'