# Text Clustering

Peeraya Khantaruangsakul 63070501054

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Preprocess the text data
stop_words = set(stopwords.words('english'))

### Read data

In [3]:
import gzip
import pickle
import requests

# url = 'http://fastdata.in.th/CPE/CPE393/consumer_complaint_dataset.data'
# response = requests.get(url)

# with open('consumer_complaint_dataset.data', 'wb') as f:
#     f.write(response.content)

with gzip.open('consumer_complaint_dataset.data', 'rb') as f:
    df = pickle.load(f)

In [4]:
df

Unnamed: 0,topic,input
0,Debt collection,transworld systems inc. \nis trying to collect...
1,"Credit reporting, credit repair services, or o...",I would like to request the suppression of the...
2,Debt collection,"Over the past 2 weeks, I have been receiving e..."
3,"Credit reporting, credit repair services, or o...",I HAD FILED WITH CFPB ON XX/XX/XXXX19 TO HAVE ...
4,"Credit reporting, credit repair services, or o...",I have several accounts that the balance is in...
...,...,...
492250,Consumer Loan,I was on automatic payment for my car loan. In...
492251,Debt collection,I recieved a collections call from an unknown ...
492252,Mortgage,"On XXXX XXXX, 2015, I contacted XXXX XXXX, who..."
492253,Mortgage,I can not get from chase who services my mortg...


In [5]:
df.shape

(492255, 2)

In [6]:
pd.crosstab(df.topic, columns='Count').sort_values(by='Count', ascending=False)

col_0,Count
topic,Unnamed: 1_level_1
"Credit reporting, credit repair services, or other personal consumer reports",145090
Debt collection,106946
Mortgage,61581
Credit card or prepaid card,32144
Credit reporting,31588
Student loan,25083
Checking or savings account,19153
Credit card,18838
Bank account or service,14885
Consumer Loan,9473


In [7]:
df.loc[df['topic'] == 'Credit reporting', 'topic'] = 'Credit reporting, credit repair services, or other personal consumer reports'
df.loc[df['topic'] == 'Credit card', 'topic'] = 'Credit card or prepaid card'
df.loc[df['topic'] == 'Prepaid card', 'topic'] = 'Credit card or prepaid card'
df.loc[df['topic'] == 'Payday loan', 'topic'] = 'Payday loan, title loan, or personal loan'
df.loc[df['topic'] == 'Virtual currency', 'topic'] = 'Money transfer, virtual currency, or money service'
df.loc[df['topic'] == 'Money transfers', 'topic'] = 'Money transfer, virtual currency, or money service'
df = df.loc[df['topic'] != 'Other financial service']

In [8]:
pd.crosstab(df.topic, columns='Count').sort_values(by='Count', ascending=False)

col_0,Count
topic,Unnamed: 1_level_1
"Credit reporting, credit repair services, or other personal consumer reports",176678
Debt collection,106946
Mortgage,61581
Credit card or prepaid card,52432
Student loan,25083
Checking or savings account,19153
Bank account or service,14885
Consumer Loan,9473
"Money transfer, virtual currency, or money service",9378
Vehicle loan or lease,8204


### Clean it

In [9]:
import re

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [re.sub(r'x{2,}', '', token) for token in tokens]
    return " ".join(tokens)

In [10]:
# clean data
df = df.dropna()
# duplicate data
df = df.drop_duplicates()
df.shape

(464245, 2)

In [11]:
# preprocess text
df['clean_input'] = df['input'].apply(preprocess_text)

In [12]:
df

Unnamed: 0,topic,input,clean_input
0,Debt collection,transworld systems inc. \nis trying to collect...,transworld systems trying collect debt mine ow...
1,"Credit reporting, credit repair services, or o...",I would like to request the suppression of the...,would like request suppression following items...
2,Debt collection,"Over the past 2 weeks, I have been receiving e...",past weeks receiving excessive amounts telepho...
3,"Credit reporting, credit repair services, or o...",I HAD FILED WITH CFPB ON XX/XX/XXXX19 TO HAVE ...,filed cfpb listed accounts deleted stating acc...
4,"Credit reporting, credit repair services, or o...",I have several accounts that the balance is in...,several accounts balance incorrect couple limi...
...,...,...,...
492250,Consumer Loan,I was on automatic payment for my car loan. In...,automatic payment car loan fine print supposed...
492251,Debt collection,I recieved a collections call from an unknown ...,recieved collections call unknown company morn...
492252,Mortgage,"On XXXX XXXX, 2015, I contacted XXXX XXXX, who...",contacted branch manager gateway funding...
492253,Mortgage,I can not get from chase who services my mortg...,get chase services mortgage owns original loan...


In [13]:
df.loc[3, 'clean_input']

'filed cfpb listed accounts deleted stating accounts mine uploaded identity theft report prior disputed accounts supplied identity theft report received investigative results reports  trans union  third attempt dispute fraudulent accounts attached identity theft police report'

In [14]:
df.loc[3, 'input']

'I HAD FILED WITH CFPB ON XX/XX/XXXX19 TO HAVE THE LISTED ACCOUNTS DELETED BY STATING THAT THESE ACCOUNTS WERE " NOT MINE \'\' AND I UPLOADED MY IDENTITY THEFT REPORT. PRIOR TO XX/XX/2019 I HAD DISPUTED THESE SAME ACCOUNTS AND SUPPLIED AN IDENTITY THEFT REPORT BUT I HAVE NOT RECEIVED INVESTIGATIVE RESULTS REPORTS FROM XXXX, TRANS UNION OR XXXX. \n\nTHIS IS THE THIRD ATTEMPT TO DISPUTE THESE FRAUDULENT ACCOUNTS AND I HAVE ATTACHED MY IDENTITY THEFT POLICE REPORT.'

### represent it

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer( max_features=1000, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(df['clean_input'])

In [16]:
X

<464245x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 20465992 stored elements in Compressed Sparse Row format>

### Perform text clustering and interpret the results by KMEAN

In [17]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=11, random_state=42).fit(X)



In [18]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(11):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: credit report inquiry inquiries information experian reporting transunion removed remove
Cluster 1: accounts credit identity theft report information reporting victim fraudulent account
Cluster 2: loan mortgage loans payments interest modification payment home would student
Cluster 3: payment late payments account due made credit month would mortgage
Cluster 4: wells fargo account mortgage loan bank home would payment credit
Cluster 5: account credit report reporting balance closed collection paid information opened
Cluster 6: bank account money america check funds chase checking would deposit
Cluster 7: equifax credit information report breach reporting account security data dispute
Cluster 8: card credit account chase charges balance would bank charge one
Cluster 9: call phone called company told would number received calls said
Cluster 10: debt collection credit company report letter collect validation account owe


In [19]:
# predict cluster
df['cluster_kmean'] = km.predict(X)
df

Unnamed: 0,topic,input,clean_input,cluster_kmean
0,Debt collection,transworld systems inc. \nis trying to collect...,transworld systems trying collect debt mine ow...,10
1,"Credit reporting, credit repair services, or o...",I would like to request the suppression of the...,would like request suppression following items...,1
2,Debt collection,"Over the past 2 weeks, I have been receiving e...",past weeks receiving excessive amounts telepho...,9
3,"Credit reporting, credit repair services, or o...",I HAD FILED WITH CFPB ON XX/XX/XXXX19 TO HAVE ...,filed cfpb listed accounts deleted stating acc...,1
4,"Credit reporting, credit repair services, or o...",I have several accounts that the balance is in...,several accounts balance incorrect couple limi...,1
...,...,...,...,...
492250,Consumer Loan,I was on automatic payment for my car loan. In...,automatic payment car loan fine print supposed...,3
492251,Debt collection,I recieved a collections call from an unknown ...,recieved collections call unknown company morn...,9
492252,Mortgage,"On XXXX XXXX, 2015, I contacted XXXX XXXX, who...",contacted branch manager gateway funding...,2
492253,Mortgage,I can not get from chase who services my mortg...,get chase services mortgage owns original loan...,2


In [20]:
# most common topic in each cluster but not duplicate
pd.crosstab(df.cluster_kmean, df.topic).idxmax(axis=1)

cluster_kmean
0     Credit reporting, credit repair services, or o...
1     Credit reporting, credit repair services, or o...
2                                              Mortgage
3     Credit reporting, credit repair services, or o...
4                                              Mortgage
5     Credit reporting, credit repair services, or o...
6                           Checking or savings account
7     Credit reporting, credit repair services, or o...
8                           Credit card or prepaid card
9                                       Debt collection
10                                      Debt collection
dtype: object

### LDA

In [21]:
# LDAModel
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.05, min_df=2, stop_words='english')

# Create a corpus from a list of texts
common_dictionary = Dictionary([text.split() for text in df['clean_input']])
common_corpus = [common_dictionary.doc2bow(text.split()) for text in df['clean_input']]
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=11, id2word=common_dictionary)


In [22]:
# predict cluster most likely 3 topics only
df['cluster_lda'] = [lda.get_document_topics(common_dictionary.doc2bow(text.split())) for text in df['clean_input']]
df['cluster_lda'] = df['cluster_lda'].apply(lambda x: sorted(x, key=lambda y: y[1], reverse=True)[:3])
df['cluster_lda'] = df['cluster_lda'].apply(lambda x: [y[0] for y in x])

In [23]:
# print each cluster topic no prob
for i in range(11):
    print("Cluster %d:" % i, end='')
    for ind in lda.show_topic(i, 10):
        print(' %s' % ind[0], end='')
    print()

Cluster 0: identity theft information consumer fraud fraudulent report police name victim
Cluster 1: credit report information equifax accounts removed score experian remove card
Cluster 2: told called would call said back phone get could asked
Cluster 3: debt collection company received letter sent account agency amount never
Cluster 4: account credit late payment report reporting balance reported paid payments
Cluster 5: account bank card check money payment chase would received funds
Cluster 6: loan payment mortgage payments interest loans would home amount pay
Cluster 7: information reporting credit account request act provide letter fcra law
Cluster 8: insurance car vehicle company paid loan finance union lease financial
Cluster 9: inquiry unauthorized inquiries hard date usaa authorized authorize auto authorization
Cluster 10: wells fargo one offer capital us american bank bonus complaint


In [24]:
df

Unnamed: 0,topic,input,clean_input,cluster_kmean,cluster_lda
0,Debt collection,transworld systems inc. \nis trying to collect...,transworld systems trying collect debt mine ow...,10,"[3, 4, 1]"
1,"Credit reporting, credit repair services, or o...",I would like to request the suppression of the...,would like request suppression following items...,1,"[0, 1, 7]"
2,Debt collection,"Over the past 2 weeks, I have been receiving e...",past weeks receiving excessive amounts telepho...,9,"[2, 3]"
3,"Credit reporting, credit repair services, or o...",I HAD FILED WITH CFPB ON XX/XX/XXXX19 TO HAVE ...,filed cfpb listed accounts deleted stating acc...,1,"[0, 1, 7]"
4,"Credit reporting, credit repair services, or o...",I have several accounts that the balance is in...,several accounts balance incorrect couple limi...,1,"[4, 1, 2]"
...,...,...,...,...,...
492250,Consumer Loan,I was on automatic payment for my car loan. In...,automatic payment car loan fine print supposed...,3,"[4, 6, 8]"
492251,Debt collection,I recieved a collections call from an unknown ...,recieved collections call unknown company morn...,9,"[2, 3, 4]"
492252,Mortgage,"On XXXX XXXX, 2015, I contacted XXXX XXXX, who...",contacted branch manager gateway funding...,2,"[5, 6, 10]"
492253,Mortgage,I can not get from chase who services my mortg...,get chase services mortgage owns original loan...,2,"[6, 3, 5]"
