In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel


In [2]:
df = pd.read_csv('../data/Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [4]:
def create_df_text(df):
    df_text = pd.DataFrame()  # Create empty df to fill
    
    df_text['Consumer complaint narrative'] = df['Consumer complaint narrative']
    
    cust_resp_dict ={'Closed':0,
                 'Untimely response':0,
                 'Closed with explanation':1,
                 'Closed with non-monetary relief':2,
                 'Closed with monetary relief':2}
    
    df_text['Company response to consumer'] = df['Company response to consumer'].apply(lambda x: cust_resp_dict[x])
    
    return df_text

In [7]:
df_text = create_df_text(df).head()

In [8]:
tokenized = [word_tokenize(content.lower()) for content in df_text['Consumer complaint narrative']]

In [9]:
stop = set(stopwords.words('english'))

In [10]:
docs = [[word for word in words if word not in stop] for words in tokenized]

In [11]:
docs

[['received',
  'capital',
  'one',
  'charge',
  'card',
  'offer',
  'xxxx',
  '.',
  'applied',
  ',',
  'accepted',
  '(',
  '{',
  '$',
  '500.00',
  '}',
  'limit',
  ')',
  ',',
  'activated',
  'card',
  'used',
  'xxxx',
  'presents',
  '.',
  'charge',
  'card',
  '#',
  'xxxx',
  '.',
  'right',
  'activating',
  'card',
  '...',
  'capital',
  'one',
  'sent',
  'another',
  'card',
  '{',
  '$',
  '500.00',
  '}',
  'limit',
  '...',
  'never',
  'activated',
  '...',
  'never',
  'used',
  'card',
  '.',
  'first',
  'bill',
  'card',
  '#',
  'came',
  'due',
  'xxxx',
  'minimum',
  'payment',
  'due',
  '{',
  '$',
  '15.00',
  '}',
  '.',
  'sent',
  '{',
  '$',
  '20.00',
  '}',
  'via',
  'uspmo',
  'sent',
  'due',
  'date',
  '.',
  'xxxx',
  'non-activated',
  ',',
  'non',
  'used',
  'credit',
  'card',
  '...',
  '..they',
  'also',
  'sent',
  'bill',
  'yearly',
  'fees',
  'never',
  'even',
  'activated',
  'card',
  '.',
  'called',
  '...',
  '...',
  '.

In [15]:
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

docs_porter = [[porter.stem(word) for word in words] for words in docs]
docs_snowball = [[snowball.stem(word) for word in words] for words in docs]
docs_wordnet = [[wordnet.lemmatize(word) for word in words] for words in docs]

In [17]:
print "%16s %16s %16s %16s" % ("word", "porter", "snowball", "lemmatizer")
for i in xrange(min(len(docs_porter[0]), len(docs_snowball[0]), len(docs_wordnet[0]))):
    p, s, w = docs_porter[0][i], docs_snowball[0][i], docs_wordnet[0][i]
    if len(set((p, s, w))) != 1:
        print "%16s %16s %16s %16s" % (docs[0][i], p, s, w)

            word           porter         snowball       lemmatizer
        received           receiv           receiv         received
         capital            capit            capit          capital
          charge            charg            charg           charge
         applied            appli            appli          applied
        accepted           accept           accept         accepted
       activated            activ            activ        activated
            used              use              use             used
          charge            charg            charg           charge
      activating            activ            activ       activating
         capital            capit            capit          capital
         another            anoth            anoth          another
       activated            activ            activ        activated
            used              use              use             used
   non-activated        non-activ        non-act

In [20]:
my_docs = docs_snowball

In [21]:
vocab_set = set()
[[vocab_set.add(token) for token in tokens] for tokens in my_docs]
vocab = list(vocab_set)


In [22]:
vocab_dict = {word: i for i, word in enumerate(vocab)}

In [25]:
import numpy as np

word_counts = np.zeros((len(docs), len(vocab)))
for doc_id, words in enumerate(my_docs):
    for word in words:
        word_id = vocab_dict[word]
        word_counts[doc_id][word_id] += 1

In [26]:
df = np.sum(word_counts > 0, axis=0)

In [27]:
tf_norm = np.sqrt((word_counts ** 2).sum(axis=1))
tf_norm[tf_norm == 0] = 1
tf = word_counts / tf_norm.reshape(len(my_docs), 1)

In [28]:
idf = np.log((len(my_docs) + 1.) / (1. + df)) + 1.
tfidf = tf * idf

In [29]:
tfidf_norm = np.sqrt((tfidf ** 2).sum(axis=1))
tfidf_norm[tfidf_norm == 0] = 1
tfidf_normed = tfidf / tfidf_norm.reshape(len(my_docs), 1)

### SKLEARN

In [30]:
def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Tokenize and stem/lemmatize the document.
    '''
    return [snowball.stem(word) for word in word_tokenize(doc.lower())]

In [33]:
countvect = CountVectorizer(stop_words='english', tokenizer=tokenize)
count_vectorized = countvect.fit_transform(df_text)

In [37]:
words = countvect.get_feature_names()
print "sklearn count of 'card':", count_vectorized[0, words.index('card')]
print "my count of 'card':", word_counts[0, vocab_dict['card']]

 sklearn count of 'card':

ValueError: 'card' is not in list

In [39]:
tfidfvect = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tfidf_vectorized = tfidfvect.fit_transform(df_text)




In [40]:
words_tfidf = tfidfvect.get_feature_names()
print "sklearn tfidf of 'dinner':", tfidf_vectorized[0, words_tfidf.index('dinner')]
print "my tfidf of 'dinner':", tfidf[0, vocab_dict['dinner']]

sklearn tfidf of 'dinner':

ValueError: 'dinner' is not in list