In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph. 
import matplotlib.pyplot as plt
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Data: consumer complaints received about financial products and services
# Source: https://catalog.data.gov/dataset/consumer-complaint-database
# Every complaint carries a product label, which makes this a supervised
# text classification problem.

# Load the full complaints export into memory.
df = pd.read_csv('complaints.csv')

# Report (rows, columns) as a sanity check on the load.
print(df.shape)
# (2062907, 18)

### Feature Engineering

In [None]:
# Peek at the first three raw rows to see which columns are available.
df.head(n=3)

In [None]:
# Keep just the label ('Product') and the free-text narrative, discard rows
# without a narrative, and give the text column a shorter name.
df1 = (
    df[['Product', 'Consumer complaint narrative']]
    .dropna(subset=['Consumer complaint narrative'])
    .rename(columns={'Consumer complaint narrative': 'Consumer_complaints'})
)
df1.shape
# (705199, 2)

In [None]:
# Share (in percent) of all complaints in `df` that come with a narrative.
# `df1` was already filtered to rows with text, so this counts its rows.
total = df1['Consumer_complaints'].notna().sum()
pct_with_text = total / len(df) * 100
round(pct_with_text, 2)
# 34.18

In [None]:
# Distinct product labels present in the full dataset.
#
# Output recorded from an earlier run:
#   Debt collection
#   Credit reporting, credit repair services, or other personal consumer reports
#   Checking or savings account
#   Credit card or prepaid card
#   Money transfer, virtual currency, or money service
#   Vehicle loan or lease
#   Mortgage
#   Student loan
#   Payday loan, title loan, or personal loan
#   Credit card
#   Consumer Loan
#   Payday loan
#   Bank account or service
#   Credit reporting
#   Other financial service
#   Money transfers
#   Prepaid card
#   Virtual currency
#
# The expression is kept as the last statement of the cell so that the
# notebook renders the live array (a trailing string literal would be shown
# instead of the computed value).
pd.DataFrame(df.Product.unique()).values

In [None]:
# Work on a fixed-seed random subset of 10,000 complaints so the remaining
# cells run in a reasonable time.
df2 = df1.sample(n=10000, random_state=1).copy()

In [None]:
# Collapse near-duplicate product labels into a single category each, so the
# classifier does not have to separate classes that mean the same thing.
product_aliases = {
    'Credit reporting, credit repair services, or other personal consumer reports':
        'Credit reporting, repair, or other',
    'Credit reporting': 'Credit reporting, repair, or other',
    'Credit card': 'Credit card or prepaid card',
    'Prepaid card': 'Credit card or prepaid card',
    'Payday loan': 'Payday loan, title loan, or personal loan',
    # The dataset spells this label 'Money transfers' (plural); the singular
    # key alone never matched, leaving the category unmerged. The singular
    # spelling is kept as well in case it ever appears.
    'Money transfers': 'Money transfer, virtual currency, or money service',
    'Money transfer': 'Money transfer, virtual currency, or money service',
    'Virtual currency': 'Money transfer, virtual currency, or money service',
}
# Assign the result back instead of using inplace=True; re-running the cell
# is a no-op because the target names are not keys of the mapping.
df2['Product'] = df2['Product'].replace(product_aliases)

In [None]:
# Product labels that remain in the sample after merging similar categories.
pd.DataFrame(df2['Product'].unique())

In [None]:
# Product = the class to predict | Consumer_complaints = the text the prediction is based on

In [None]:
# Encode every product label as an integer id (ids follow order of first
# appearance in the sample).
df2['category_id'] = pd.factorize(df2['Product'])[0]

# One row per (Product, category_id) pair, i.e. the 1-to-1 lookup table.
category_id_df = df2[['Product', 'category_id']].drop_duplicates()

In [None]:
# Horizontal bar chart of the number of complaints per product category,
# sorted ascending so the largest categories end up at the top.
complaint_counts = df2.groupby('Product').Consumer_complaints.count().sort_values()

# Highlight the three largest categories in dark blue and draw the rest in
# grey. The colour list is derived from the actual number of categories so it
# stays correct if the category merging above changes.
n_highlight = min(3, len(complaint_counts))
colors = ['grey'] * (len(complaint_counts) - n_highlight) + ['darkblue'] * n_highlight

fig, ax = plt.subplots(figsize=(8, 6))
complaint_counts.plot.barh(
    ax=ax, ylim=0, color=colors, title= 'complaint count per category')

### Text Preprocessing

In [None]:
# Turn every complaint text into a TF-IDF vector.
#
# TfidfVectorizer options used below:
#   sublinear_tf=True     scale the term frequency logarithmically
#   min_df=5              drop terms that occur in fewer than 5 documents
#   ngram_range=(1, 2)    consider both unigrams and bigrams
#   stop_words='english'  use the built-in English stop-word list
# Left at their defaults:
#   max_df                would drop terms that occur in more than `max_df` documents
#   use_idf               weight terms by inverse document frequency
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)

# NOTE(review): .toarray() materialises the sparse matrix as a dense array;
# at the shape recorded below that is presumably around 2 GB - confirm the
# kernel has enough memory before scaling up the sample.
features = tfidf.fit_transform(df2['Consumer_complaints']).toarray()
labels = df2['category_id']

# One row per complaint, one column per retained unigram/bigram.
print(features.shape)
# (10000, 27973)

In [None]:
# Show only the strictly positive TF-IDF weights of a single complaint
# (the row at position 1).
one = features[1]
positive_mask = one > 0
one[positive_mask]

In [None]:
# Lookup tables in both directions between product label and integer id,
# built from the rows of the de-duplicated (Product, category_id) table.
category_to_id = {
    product: cat_id
    for product, cat_id in category_id_df[['Product', 'category_id']].to_numpy()
}
id_to_category = {
    cat_id: product
    for cat_id, product in category_id_df[['category_id', 'Product']].to_numpy()
}

In [None]:
# Display the (product label, category id) pairs held in the lookup dict.
category_to_id.items()

In [None]:
# Show a handful of entries from the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back otherwise.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
[str(name) for name in get_names()[25:30]]
# ['00 account', '00 accounts', '00 added', '00 addition', '00 additional']

In [None]:
# For every product category, measure how strongly each TF-IDF feature is
# associated with that category (chi-squared, one-vs-rest) and print the
# most associated unigrams and bigrams.
N = 3

# The vocabulary does not change between categories, so build the name array
# once. get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back otherwise.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.array(get_names())

# .items() yields (product label, category id) tuples; sorted() orders them by label.
for Product, category_id in sorted(category_to_id.items()):
    # chi2 returns (statistics, p-values); the target is "belongs to this category".
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort by chi2 statistic, so the strongest features come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]  # single-word features
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]   # two-word features
    print(Product)
    # Top N unigrams and bigrams for this label.
    print(', '.join(unigrams[-N:]))
    print(', '.join(bigrams[-N:]))

### Classification Models
* Random forest
* SVM
* Multinomial Naive Bayes
* LogisticRegression

In [None]:
# Raw complaint text is the input, the product label is the target.
X, y = df2['Consumer_complaints'], df2['Product']

# Hold out 25% of the sample for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Candidate classifiers to compare.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# Number of cross-validation folds.
CV = 5

# Collect one (model name, fold index, accuracy) record per model and fold,
# then build the results frame once from the complete list.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Average the per-fold accuracies for each model.
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()

# Values recorded from an earlier run:
#   LinearSVC               0.779103
#   LogisticRegression      0.758114
#   MultinomialNB           0.647500
#   RandomForestClassifier  0.387005

# Keep the computed series as the last expression so the notebook shows the
# live result rather than a hard-coded string.
mean_accuracy

In [None]:
# LinearSVC scored best in cross-validation, so fit it on a fresh 75/25 split
# of the TF-IDF features. The DataFrame index is split alongside so that
# predictions can be traced back to the original rows.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features,
    labels,
    df2.index,
    test_size=0.25,
    random_state=1,
)

# fit() returns the estimator itself, so it can be chained.
model = LinearSVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

### Precision, Recall, F1-Score

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# category_id was produced by factorize(), which numbers labels 0..k-1 in
# order of first appearance - the same order unique() returns - so position i
# of `class_names` is the name of class id i.
class_names = df2['Product'].unique()
# Pass the full id range explicitly: without `labels`, a class that is absent
# from both y_test and y_pred makes the number of target names disagree with
# the number of classes found.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

### Predictions

In [None]:
# Split the raw complaint texts (X) and product names (y) again, this time so
# that the vectorizer is fitted on the training texts only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Same TF-IDF configuration as used for the feature matrix earlier.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)

# fit() returns the vectorizer itself; keep a handle for transforming new text.
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

# Train the final classifier; it predicts product names directly.
model = LinearSVC()
model.fit(tfidf_vectorizer_vectors, y_train)

In [None]:
# A complaint text to classify with the fitted vectorizer and model.
new_complaint = """I have been enrolled back at XXXX XXXX University in the XX/XX/XXXX. Recently, i have been harassed by \
Navient for the last month. I have faxed in paperwork providing them with everything they needed. And yet I am still getting \
phone calls for payments. Furthermore, Navient is now reporting to the credit bureaus that I am late. At this point, \
Navient needs to get their act together to avoid me taking further action. I have been enrolled the entire time and my \
deferment should be valid with my planned graduation date being the XX/XX/XXXX."""

# Vectorize with the already-fitted vocabulary, then print the predicted product.
new_complaint_vector = fitted_vectorizer.transform([new_complaint])
print(model.predict(new_complaint_vector))

In [None]:
# Look up the sampled row(s), if any, whose text equals the new complaint, to
# compare the prediction with the recorded product.
# The text column is named 'Consumer_complaints' (plural); the singular
# spelling raised a KeyError.
df2[df2['Consumer_complaints'] == new_complaint]