In [1]:
from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedShuffleSplit

In [2]:
# Load the raw complaints export into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [3]:
def create_df_text(df, start=1, stop=10001):
    """
    Take input df and get 'Consumer complaint narrative' (text features) for text
    modeling.  Also create the labels column 'Company response to consumer',
    collapsed to a binary target: 1 when the complaint was closed with monetary
    or non-monetary relief, 0 for every other known response.

    INPUT - df: dataframe holding the 'Consumer complaint narrative' and
                'Company response to consumer' columns
            start, stop: positional row window to return; the defaults
                reproduce the original hard-coded slice [1:10001]
                (pass None for an open end)
    OUTPUT - 'df_text' dataframe for use in modeling text features
    RAISES - KeyError if a response category is missing from the label mapping
    """

    print("Creating df_text for text feature modeling...\n")

    #  Numerical label for every known 'Company response to consumer' value
    cust_resp_dict = {'Closed': 0,
                      'Untimely response': 0,
                      'Closed with explanation': 0,
                      'Closed with non-monetary relief': 1,
                      'Closed with monetary relief': 1}

    responses = df['Company response to consumer']
    labels = responses.map(cust_resp_dict)

    # .map() leaves unknown categories as NaN; fail loudly (same exception type
    # the dict lookup used to raise) and report which values were unexpected.
    unmapped = responses[labels.isnull()]
    if len(unmapped) > 0:
        raise KeyError("Unmapped 'Company response to consumer' values: %r"
                       % (list(unmapped.unique()),))

    df_text = pd.DataFrame(
        {'Consumer complaint narrative': df['Consumer complaint narrative'],
         'Company response to consumer': labels.astype('int64')},
        columns=['Consumer complaint narrative', 'Company response to consumer'])

    print("Successfully created df_text for text feature modeling!!!\n")

    return df_text.iloc[start:stop]


In [4]:
# NOTE(review): this rebinds `df` from the raw frame to the processed one, so
# the cell is not idempotent - running it a second time feeds the 0/1 labels
# back into create_df_text, whose label lookup only knows the raw category
# strings and therefore raises KeyError. Re-run the read_csv cell first.
df = create_df_text(df)
df.head()

Creating df_text for text feature modeling...

Successfully created df_text for non-text feature modeling!!!



Unnamed: 0,Consumer complaint narrative,Company response to consumer
1,I do n't know how they got my cell number. I t...,0
2,I 'm a longtime member of Charter One Bank/RBS...,0
3,"After looking at my credit report, I saw a col...",1
4,I received a call from a XXXX XXXX from XXXX @...,0
5,Was not contacted 4 years later about some pri...,0


In [5]:
def get_X_y(df):
    """
    Pull the modeling inputs out of the text dataframe.

    INPUT - dataframe with 'Consumer complaint narrative' and
            'Company response to consumer' columns
    OUTPUT - (X, y): list of narratives and list of labels, both in row order
    """
    text_col = 'Consumer complaint narrative'
    label_col = 'Company response to consumer'

    narratives, labels = (df[col].tolist() for col in (text_col, label_col))
    return narratives, labels

In [6]:
X, y = get_X_y(df)
# Word2Vec and the identity-analyzer vectorizers used later consume each
# document as a sequence of tokens. A raw string would be iterated character
# by character, so split every narrative into words first (the narratives are
# already space-separated, e.g. "do n't", so whitespace splitting is enough).
X = [narrative.split() for narrative in X]

In [7]:
# Train word2vec on all the texts - both training and test set.
# We're not using test labels, just texts, so this is fine.
model = Word2Vec(X, size=100, window=5, min_count=5, workers=2)

# Lookup table word -> vector, pairing model.index2word[i] with model.syn0[i].
w2v = dict(zip(model.index2word, model.syn0))

In [8]:
def _passthrough(doc):
    """Identity analyzer: hand the document to the vectorizer unchanged."""
    return doc


# Naive Bayes baselines on raw counts and on tf-idf weights
mult_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_passthrough)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_passthrough)),
    ("bernoulli nb", BernoulliNB()),
])
mult_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_passthrough)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_passthrough)),
    ("bernoulli nb", BernoulliNB()),
])

# SVM - which is supposed to be more or less state of the art
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_passthrough)),
    ("linear svc", SVC(kernel="linear")),
])
svc_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_passthrough)),
    ("linear svc", SVC(kernel="linear")),
])

In [9]:
class MeanEmbeddingVectorizer(object):
    """
    Represent each document by the plain average of its word vectors.

    word2vec - non-empty mapping from word to a 1-D vector; the embedding
               size is read from the first entry, so all vectors are
               assumed to share that length
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # next(iter(...)) rather than .itervalues().next() so the class works
        # on both Python 2 and Python 3.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Nothing is learned here; returns self so the object chains like an estimator."""
        return self

    def transform(self, X):
        """
        INPUT - X: iterable of documents, each an iterable of tokens
        OUTPUT - numpy array with one row per document: the mean of the
                 vectors of its known words, or all zeros when none of its
                 words is in the embedding
        """
        return np.array([
            # `or` swaps in a single zero vector when no word was found, so
            # np.mean never sees an empty list.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
# and a tf-idf version of the same

class TfidfEmbeddingVectorizer(object):
    """
    Represent each document by the average of its word vectors, each vector
    first scaled by the idf weight of its word.

    word2vec - non-empty mapping from word to a 1-D vector; the embedding
               size is read from the first entry, so all vectors are
               assumed to share that length
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled in by fit(): word -> idf weight
        self.word2weight = None
        # next(iter(...)) rather than .itervalues().next() so the class works
        # on both Python 2 and Python 3.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """
        Learn one idf weight per word from the tokenized documents in X.

        INPUT - X: iterable of documents, each an iterable of tokens
                y: ignored
        OUTPUT - self
        """
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """
        INPUT - X: iterable of documents, each an iterable of tokens
        OUTPUT - numpy array with one row per document: the mean of the
                 idf-scaled vectors of its known words, or all zeros when
                 none of its words is in the embedding

        fit() must have been called first so that word2weight is populated.
        """
        return np.array([
            # `or` swaps in a single zero vector when no word was found, so
            # np.mean never sees an empty list.
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [10]:
# Extra Trees classifier is almost universally great, let's stack it with our
# embeddings: once on plain averaged word vectors, once on tf-idf weighted ones.
# NOTE: the GloVe-based variants (etree_glove_small, etree_glove_small_tfidf,
# etree_glove_big, etree_glove_big_tfidf) are disabled; they would be built the
# same way from glove_small / glove_big, which this notebook does not define
# in the cells above.
etree_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200)),
])
etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200)),
])

In [None]:
# Candidate pipelines to compare; uncomment an entry to include it in the run.
all_models = [
#     ("mult_nb", mult_nb),
#     ("mult_nb_tfidf", mult_nb_tfidf),
#     ("bern_nb", bern_nb),
#     ("bern_nb_tfidf", bern_nb_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
#     ("glove_small", etree_glove_small),
#     ("glove_small_tfidf", etree_glove_small_tfidf),
#     ("glove_big", etree_glove_big),
#     ("glove_big_tfidf", etree_glove_big_tfidf),
#     ("w2v", etree_w2v),
#     ("w2v_tfidf", etree_w2v_tfidf),
]

# Mean 5-fold cross-validation score per pipeline. The loop variable is named
# `pipeline` rather than `model` so it cannot overwrite the Word2Vec `model`
# defined earlier (comprehension variables leak into the cell scope on Python 2).
scores = [(model_name, cross_val_score(pipeline, X, y, cv=5).mean())
          for model_name, pipeline in all_models]

# Best score first. A plain one-argument key replaces the tuple-unpacking
# lambda, which is a syntax error on Python 3.
scores.sort(key=lambda pair: pair[1], reverse=True)

print(tabulate(scores, floatfmt=".4f", headers=("model", 'score')))