In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv('https://github.com/srivatsan88/YouTubeLI/blob/master/dataset/consumer_compliants.zip?raw=true', compression = 'zip', sep = ',', quotechar='"')

In [5]:
complaints_df = df[['Consumer complaint narrative', 'Product', 'Company']].rename(columns = {'Consumer complaint narrative': 'complaints'})

In [6]:
X_train, X_hold = train_test_split(complaints_df, test_size = 0.6, random_state = 111)

In [7]:
X_train['Product'].value_counts()

Debt collection                8720
Credit card or prepaid card    5297
Mortgage                       3809
Checking or savings account    2822
Student loan                   1236
Vehicle loan or lease          1097
Name: Product, dtype: int64

In [10]:
stemmer = PorterStemmer()

def tokenize(text):
  tokens = [word for word in nltk.word_tokenize(text) if len(word) > 3 and (len(word.strip('Xx/')) > 2)]
  tokens = [stemmer.stem(item) for item in tokens]
  return tokens

vectorizer_tf = TfidfVectorizer(tokenizer = tokenize, stop_words = 'english', max_df = 0.75, min_df = 50, max_features = 10000, use_idf = False, norm = None)
tf_vectors = vectorizer_tf.fit_transform(X_train.complaints)



In [16]:
tf_vectors.A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
lda = decomposition.LatentDirichletAllocation(n_components = 6, max_iter = 3, learning_method = 'online', learning_offset = 50, n_jobs = -1, random_state = 111)

W1 = lda.fit_transform(tf_vectors)
H1 = lda.components_

In [19]:
W1

array([[0.00114034, 0.11071407, 0.23730026, 0.00114375, 0.49695421,
        0.15274736],
       [0.32157986, 0.07933833, 0.00117156, 0.00116331, 0.00116719,
        0.59557976],
       [0.00522585, 0.00526942, 0.10523552, 0.00526173, 0.00529843,
        0.87370906],
       ...,
       [0.00304667, 0.27103466, 0.15027818, 0.00306514, 0.00305656,
        0.56951879],
       [0.01859061, 0.01865304, 0.37339021, 0.01872748, 0.01857241,
        0.55206625],
       [0.00133922, 0.00133128, 0.00133334, 0.21368028, 0.78098801,
        0.00132788]])

In [23]:
num_words = 15

vocab = np.array(vectorizer_tf.get_feature_names_out())

top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]
topic_words = ([top_words(t) for t in H1])
topics = [' '.join(t) for t in topic_words]

In [24]:
topics

['citi offer state provid applic requir purchas thi document term contract consum servic complaint sale',
 'thi told receiv said phone time ask number inform contact email becaus sent need compani',
 'credit thi charg account card disput report balanc payment receiv statement capit late inform issu',
 'thi debt report collect credit account compani inform letter agenc provid valid request receiv state',
 'payment loan mortgag thi month paid time make year servic insur receiv compani home late',
 'account bank card check thi close money open credit chase charg fund transact deposit fraud']

In [25]:
colnames = ['Topic' + str(i) for i in range(lda.n_components)]
docnames = ['Doc' + str(i) for i in range(len(X_train.complaints))]
df_doc_topic = pd.DataFrame(np.round(W1, 2), columns = colnames, index = docnames)
df_doc_topic['dominant_topic'] = np.argmax(df_doc_topic.values, axis = 1)

In [26]:
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.00,0.11,0.24,0.00,0.50,0.15,4
Doc1,0.32,0.08,0.00,0.00,0.00,0.60,5
Doc2,0.01,0.01,0.11,0.01,0.01,0.87,5
Doc3,0.04,0.51,0.02,0.43,0.00,0.00,1
Doc4,0.00,0.00,0.20,0.64,0.00,0.15,3
...,...,...,...,...,...,...,...
Doc22976,0.36,0.22,0.18,0.00,0.00,0.23,0
Doc22977,0.17,0.00,0.00,0.17,0.16,0.49,5
Doc22978,0.00,0.27,0.15,0.00,0.00,0.57,5
Doc22979,0.02,0.02,0.37,0.02,0.02,0.55,5


In [29]:
WHold = lda.transform(vectorizer_tf.transform(X_hold.complaints[:5]))

In [31]:
colnames = ['Topic' + str(i) for i in range(lda.n_components)]
docnames = ['Doc' + str(i) for i in range(len(X_hold.complaints[:5]))]
df_doc_topic = pd.DataFrame(np.round(WHold, 2), columns = colnames, index = docnames)
df_doc_topic['dominant_topic'] = np.argmax(df_doc_topic.values, axis = 1)

In [32]:
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.01,0.04,0.04,0.0,0.0,0.91,5
Doc1,0.0,0.0,0.0,0.5,0.49,0.0,3
Doc2,0.01,0.01,0.33,0.01,0.01,0.64,5
Doc3,0.03,0.12,0.0,0.01,0.84,0.0,4
Doc4,0.0,0.12,0.43,0.0,0.35,0.1,2
