# Chatbot Conversations From Customer Service Scripts

In [52]:
import numpy as np
import pandas as pd
import sys, os, re, itertools, collections, string, time
from io import BytesIO
from collections import Counter
from time import time
import datetime
   


In [53]:
# https://catalog.data.gov/dataset/consumer-complaint-database 
complaints_df_raw = pd.read_csv("consumer_complaints.csv", 
                usecols=('Product','Consumer complaint narrative', 'Sub-issue'),
                dtype={'consumer_complaint_narrative': object})
# Only interested in data with consumer complaints
complaints_df_raw=complaints_df_raw[complaints_df_raw['Consumer complaint narrative'].notnull()]
complaints_df_raw=complaints_df_raw[complaints_df_raw['Product'].notnull()]

# remove XXXX from narratives
complaints_df_raw['Consumer complaint narrative'] =  complaints_df_raw['Consumer complaint narrative'].replace({'X':''}, regex=True)

# always seed your random generators for reporducilibity 
complaints_df_raw = complaints_df_raw.sample(200000, replace=False, random_state=1)

# basic sentence prep
# set to lower
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.lower()
# remove special characters
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.replace('\W', ' ')

# remove elements with no text
complaints_df_raw= complaints_df_raw[complaints_df_raw['Consumer complaint narrative'] != '']

# any dups
complaints_df_raw = complaints_df_raw.drop_duplicates(subset=['Consumer complaint narrative'])

  interactivity=interactivity, compiler=compiler, result=result)


In [54]:
complaints_df_raw.head()

Unnamed: 0,Product,Sub-issue,Consumer complaint narrative
377311,"Credit reporting, credit repair services, or o...",Old information reappears or never goes away,fl this account was over and c...
156975,Debt collection,Debt is not yours,on xx xx xxxx a friend family member was searc...
732081,Mortgage,,this is a transfer of servicing issue my mort...
438555,Mortgage,,i knew we were behind and i ve been working ...
430892,"Credit reporting, credit repair services, or o...",Their investigation did not fix an error on yo...,i have a collection account on my three 3 ...


# Clean Up Data

In [55]:
complaints_df = complaints_df_raw.copy()

In [56]:
word_similarity=complaints_df['Consumer complaint narrative'].str.split(' ').map(Counter)
word_similarity_ratio = []
complaints_df.shape 

(194141, 3)

In [57]:
for wu in word_similarity:
    word_similarity_ratio.append(np.sum([x[1] for x in wu.items()])/np.float(len(wu)))
    
complaints_df['narrative_similarity_ratio'] = word_similarity_ratio
complaints_df['narrative_similarity_ratio'].describe()

count    194141.000000
mean          2.101858
std           1.096282
min           1.000000
25%           1.620000
50%           1.969231
75%           2.409836
max         229.681818
Name: narrative_similarity_ratio, dtype: float64

In [58]:
# thin out some entries that contain too much duplicated lines within
complaints_df = complaints_df[complaints_df['narrative_similarity_ratio'] <= 1.7]
complaints_df.reset_index(drop=True,inplace=True)
complaints_df.shape

(59635, 4)

In [59]:
list(complaints_df['Consumer complaint narrative'])[0:4]

['i never authorize this account to be open at no point and i have call the bank to fix this and they only give the run round everything  i contacted them 3 days ago and they said i dont have no account with a negative balance but in my xxxx xxxx it came up with a negative balance and this is not letting me open other banks account ',
 'deposited my personal checks in my account  chase shut my account and stole the xxxx xxxx dollars    after the funds cleared the account they shut my account and unwilling to return my funds claiming that they can not verify im working for this employee    ',
 'i suspect a scam to prolong my pmi payments  i was advised that an appraiser will contact me in a month  it took over 6 months to get an appraiser scheduled  then once completed  with a 25   debt to value ratio  i was denied  it is now being reviewed  and it will take another 2 weeks for the results  culprit loancare   escrow  xxxx  va ',
 'getting repeated harrassing phone calls stating there ar

# Get Key Verbs And Nouns

In [60]:
# find most common verbs and measure coverage 
import spacy
# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load('en')

# just load what we need to avoid taxing memory
nlp = spacy.load('en', parser=False, entity=False)


In [61]:
# create one big blob of text to process things a bit faster
blob_complaints = ''.join(list(complaints_df['Consumer complaint narrative']))

# Max text of length of 1000000
n = 900000
blog_chunks = [blob_complaints[i:i+n] for i in range(0, len(blob_complaints), n)]
len(blog_chunks)
#blog_chunks

22

In [62]:
just_verbs = []
just_nouns = []
counter_=len(blog_chunks)
for sentence in blog_chunks:
    counter_ -= 1
    if (counter_ % 10 == 0): print(counter_)
    print(counter_)
    doc = nlp(sentence.encode().decode('utf-8'))
    temp_verb = []
    temp_noun = []
    for token in doc: 
        if (token.pos_ == u'VERB'): 
            temp_verb.append(token.text)
        if (token.pos_ == u'NOUN'):
            temp_noun.append(token.text)
            

    just_verbs.append(' '.join(temp_verb).encode('utf-8'))
    just_nouns.append(' '.join(temp_noun).encode('utf-8'))
    
    

21
20
20
19
18
17
16
15
14
13
12
11
10
10
9
8
7
6
5
4
3
2
1
0
0


In [63]:
just_verbs[0].split()[0:10]

[b'authorize',
 b'be',
 b'have',
 b'call',
 b'fix',
 b'give',
 b'contacted',
 b'said',
 b'do',
 b'have']

In [64]:
just_nouns[0].split()[0:10]

[b'account',
 b'point',
 b'bank',
 b'run',
 b'everything',
 b'days',
 b'account',
 b'balance',
 b'xxxx',
 b'balance']

In [65]:
print('count just_verbs: %i' % len(just_verbs))
print('count just_nouns: %i' % len(just_nouns))
    

count just_verbs: 22
count just_nouns: 22


In [66]:
# pickle both objects so you don't have to re-run spacy 
import pickle
pickle_file = "verbs_nouns.p"

overwrite_old_pickle = True
if overwrite_old_pickle:
    with open(pickle_file, "wb") as f:
        pickle.dump([just_verbs, just_nouns], f)
    
# read in saved pickle
with open(pickle_file, "rb") as f:
    backup_pos = pickle.load(f)

## Sorting Out Verbs

In [67]:
all_verbs = backup_pos[0]
len(all_verbs)

# append all verbs together so we can run frequency counts
verbs = []
for verb_set in all_verbs:
    verbs.append(verb_set.split())
    #verbs = [verb for verb in verb_set[0].split()]

len(verbs)
verbs_master = [val for sublist in verbs for val in sublist]
len(verbs_master)

836837

In [68]:
# what is your upper and lower cut offs?
from collections import Counter
verbs_df = pd.DataFrame(Counter([verb for verb in verbs_master]).most_common(), columns = ['verb', 'count'])
verbs_df.head(20)

Unnamed: 0,verb,count
0,b'have',52734
1,b'is',42147
2,b'was',39680
3,b'xxxx',24119
4,b'has',17523
5,b'are',16991
6,b'been',16264
7,b'be',15060
8,b'do',12603
9,b'had',11272


In [69]:
len(verbs_df[verbs_df['count'] > 1000])
verbs_df = verbs_df[verbs_df['count'] > 1000]
len(verbs_df)

135

## Sorting Out Nouns

In [70]:
all_nouns = backup_pos[1]

# append all verbs together so we can run frequency counts
nouns = []
for noun_set in all_nouns:
    nouns.append(noun_set.split())

nouns_master = [val for sublist in nouns for val in sublist]
len(nouns_master)

881007

In [71]:
# what is your upper and lower cut offs?
from collections import Counter
nouns_df = pd.DataFrame(Counter([noun for noun in nouns_master]).most_common(), columns = ['noun', 'count'])
nouns_df.head()

Unnamed: 0,noun,count
0,b'xxxx',77833
1,b'credit',45882
2,b'account',26849
3,b'report',20325
4,b'debt',17253


In [72]:
len(nouns_df[nouns_df['count'] > 1000])
nouns_df = nouns_df[nouns_df['count'] > 1000]
len(nouns_df)

157

## Binarize DataFrame With Official Verb & Noun List

In [73]:
# create new data frame with key verbs and nouns as features
key_words = list(nouns_df['noun']) + list(verbs_df['verb'])
row_bools = []
counter_ = len(complaints_df['Consumer complaint narrative'])
for sentence in complaints_df['Consumer complaint narrative']:
    counter_ -= 1
    if (counter_ % 10000 == 0): print(counter_)
    row_bool = []
    words = sentence.split()
    for kw in key_words:
        row_bool.append(kw in words)
    row_bools.append(row_bool)
    
print('length:', len(row_bools))
row_bools = pd.DataFrame(row_bools, columns=key_words)    
row_bools = row_bools.astype(int)
row_bools.shape

    

50000
40000
30000
20000
10000
0
length: 59635


(59635, 292)

In [74]:
row_bools.head()

Unnamed: 0,b'xxxx',b'credit',b'account',b'report',b'debt',b'information',b'company',b'loan',b'payment',b'bank',...,b'feel',b'spoke',b'work',b're',b'delete',b'gave',b'ca',b'noticed',b'must',b'responded'
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Cluster of popular sentences

In [75]:
from sklearn.cluster import KMeans

TOTAL_CLUSTERS = 50

# Number of clusters
kmeans = KMeans(n_clusters=TOTAL_CLUSTERS)
# Fitting the input data
kmeans = kmeans.fit(row_bools)
# Getting the cluster labels
labels = kmeans.predict(row_bools)

# add cluster back to data frame 
row_bools['cluster'] = labels

row_bools['cluster'].value_counts().head()

  return_n_iter=True)


0    59635
Name: cluster, dtype: int64

In [76]:
row_bools['cluster'].value_counts().head()

0    59635
Name: cluster, dtype: int64

In [77]:

# add cluster number back to orginal corpus
complaints_df['Cluster'] = labels
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
import itertools
from collections import Counter
import nltk
from nltk.util import ngrams

unique_complaints_2grams = []
unique_complaints_3grams = []
unique_complaints_4grams = []
unique_complaints_5grams = []
unique_complaints_6grams = []
# loop through each cluster
for cluster_to_search in range(min(row_bools['cluster']), max(row_bools['cluster'])+1):
    # cluster-level research
    print('Cluster: %i' % cluster_to_search)
    df_tmp = complaints_df[complaints_df['Cluster']==cluster_to_search].copy()
    print('data cluster shape: %s' % len(df_tmp))
    
    bigrams = []
    trigrams = []
    fourgrams = []
    fivegrams = []
    sixgrams = []
    
    for index, row in df_tmp.iterrows(): 
        token = nltk.word_tokenize(row['Consumer complaint narrative'].encode().decode('utf-8'))
        bigrams.append([' '.join(pair) for pair in list(ngrams(token,2)) if len(set(pair))==2])
        trigrams.append([' '.join(pair) for pair in list(ngrams(token,3)) if len(set(pair))==3])
        fourgrams.append([' '.join(pair) for pair in list(ngrams(token,4)) if len(set(pair))==4])
        fivegrams.append([' '.join(pair) for pair in list(ngrams(token,5)) if len(set(pair))==5])
        sixgrams.append([' '.join(pair) for pair in list(ngrams(token,6)) if len(set(pair))==6])
        
    bigrams = [val for sublist in bigrams for val in sublist]
    trigrams = [val for sublist in trigrams for val in sublist]
    fourgrams = [val for sublist in fourgrams for val in sublist]
    fivegrams = [val for sublist in fivegrams for val in sublist]
    sixgrams = [val for sublist in sixgrams for val in sublist]
    
    # find top x most popular grams per size
    # 2 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in bigrams]).most_common(50), columns=['bigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_2grams.append(freqx)
    # 3 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in trigrams]).most_common(50), columns=['trigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_3grams.append(freqx)
    # 4 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_4grams.append(freqx)
    # 5 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fivegrams]).most_common(50), columns=['fivegrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_5grams.append(freqx)
    # 6 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in sixgrams]).most_common(50), columns=['sixgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_6grams.append(freqx)
 
  

Cluster: 0
data cluster shape: 59635


In [78]:
    df = pd.concat(unique_complaints_4grams)
    # freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
    df.head()

Unnamed: 0,fourgrams,frequency,Cluster
0,on my credit report,6051,0
1,from my credit report,1709,0
2,my credit report and,1465,0
3,i do n t,1361,0
4,my credit report i,1315,0


In [79]:
# find top x most popular grams per size
see_grams = 6


if see_grams==2:
    df = pd.concat(unique_complaints_2grams)
    df = df.drop_duplicates(subset=['bigrams'], keep=False)
elif see_grams==3:
    df = pd.concat(unique_complaints_3grams)
    df = df.drop_duplicates(subset=['trigrams'], keep=False)
elif see_grams==4:
    df = pd.concat(unique_complaints_4grams)
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
elif see_grams==5:
    df = pd.concat(unique_complaints_5grams)
    df = df.drop_duplicates(subset=['fivegrams'], keep=False)
elif see_grams==6:
    df = pd.concat(unique_complaints_6grams)
    df = df.drop_duplicates(subset=['sixgrams'], keep=False)
 
df = df.sort_values('Cluster')
df[df['frequency'] > 10]  



Unnamed: 0,sixgrams,frequency,Cluster
0,i am a victim of identity,484,0
27,when i reviewed my credit report,176,0
28,promptly delete all information which can,176,0
29,information which can not be verified,176,0
30,when i called the company they,175,0
31,has been non compliant with removing,174,0
32,been non compliant with removing the,174,0
33,non compliant with removing the unverified,174,0
34,611 5 a of the fcra,174,0
35,compliant with removing the unverified account,173,0


## Tie It Back To Complaint

In [80]:
# tie it back to look into a couple of actual complaints
keywords = "attempting to collect a debt from"
 
for index, row in complaints_df.iterrows():
    txt = row['Consumer complaint narrative'] 
    if (keywords in txt):
        print(txt)
        print('------')
    
 

this company is attempting to collect a debt from me that is not mine 
------
portfolio recovery associates of xxxx  va  has made numerous phone calls to my home attempting to collect a debt from a third party after confirmed receipt of a certified letter from me telling them the party they are attempting to contact does not live at my address  does not have my phone number  and not to contact me any more regarding this matter 
------
i have received a xxxx xxxx 2015 letter and various phone calls from central credit services  llc attempting to collect a debt from xxxx in xxxx  these are most likely about a xxxx services accrued in a bundled service  i have statements from xxxx showing that the statements are paid in full  i previously filed a complaint with the ftc regarding this manner and i thought it was taken care of  now i am receiving bills at my new location in xxxx 
------
i had forster garbus  llp contact me at my place of employment  which my employer does not allow  from ph

  is attempting to collect a debt from a company i know nothing about  she threatened me and stated   she was going to contact my employer and there was a warrant out for my arrest from check enforcement  i asked for her to submit a document to show i owe this debt but was unsuccessful  she calling from   not sure what to do becuase she harassing me at work 
------
i received a message from global credit and collection company who said it was attempting to collect a debt i owe  i called the number back who said it was a non working number for xxxx  this same company left a message on xxxx other phone numbers that have never been associated with any account i have  it was disclosed on answering machines voicemail that they were attempting to collect a debt from me 
------
collection services of athens is attempting to collect a debt from me and i never established a contract with this company  furthermore collection services of athens is currently reporting negative information onto my 