In [1]:
# Libraries for data loading, data viz and EDA
import json 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Libraries for text preprocessing and analysis
import re,nltk,spacy,string
from nltk.corpus import stopwords
nlp=spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
import swifter
from sklearn.decomposition import NMF

In [3]:
# Libraries for model evaluation metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report,accuracy_score

In [4]:
# Silence library warnings so cell output stays readable.
# Note that this also hides deprecation notices from pandas / scikit-learn.
import warnings
warnings.filterwarnings('ignore')

# Lift every pandas display limit (rows, columns, total width, cell text
# width) so frames are always printed in full.
for display_option in ('display.max_rows', 'display.max_columns',
                       'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Read the raw complaints export. The context manager closes the file
# handle as soon as parsing finishes (a bare open() left it open for the
# lifetime of the kernel).
with open('complaints.json') as j:
    data = json.load(j)

# Flatten the nested JSON records into a frame with one column per
# (dot-separated) field path.
df = pd.json_normalize(data)

In [None]:
# Preview the first rows of the flattened complaints frame.
df.head()

In [None]:
# (rows, columns) of the flattened frame.
df.shape

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Column labels as produced by json_normalize (before any renaming).
df.columns

### Removing the leading `_` from column names

In [None]:
# Strip the leading underscore(s) from every column label in one rename;
# labels that do not start with "_" come back from lstrip unchanged.
df.rename(columns=lambda label: label.lstrip('_'), inplace=True)
print("df cols renamed",df.columns)

In [None]:
# Preview the frame again, now with the underscore-free column labels.
df.head()

### Inspecting the number of missing values percentage

In [None]:
# Percentage of missing values per column, relative to the frame's actual
# row count (the divisor used to be a hard-coded 78313, which silently
# gives wrong percentages as soon as the data set changes size).
round(df.isna().sum()*100/len(df),2)

### There are many blank rows in 'source.complaint_what_happened'. Converting them into NaN values

In [None]:
# Turn empty complaint narratives into NaN so dropna can remove them.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on a column selection operates on an
# intermediate object and is not guaranteed to update df (it is a no-op
# under pandas Copy-on-Write).
df['source.complaint_what_happened'] = df['source.complaint_what_happened'].replace("", np.nan)

## Dropping NaN rows from "source.complaint_what_happened"

In [None]:
# Discard, in place, every row whose complaint narrative is NaN.
df.dropna(subset=['source.complaint_what_happened'], inplace=True)

# (rows, columns) remaining after the drop.
df.shape

In [None]:
# Give the two columns used downstream shorter labels.
column_aliases = {
    'source.complaint_what_happened': 'complaints_what_happened',
    'source.product': 'tag',
}
df.rename(columns=column_aliases, inplace=True)

### Prepare the text for topic modeling
#### Once you have removed all the blank complaints, you need to:

- Make the text lowercase
- Remove text in square brackets
- Remove punctuation
- Remove words containing numbers
- Once you have done these cleaning operations you need to perform the following:

#### Lemmatize the texts
Use POS tags to get relevant words from the texts.

In [None]:
def clean_texts(text):
    """Normalize a raw complaint string before lemmatization.

    Applied in order: lowercase the text, drop every ``[...]`` bracketed
    span, strip all ASCII punctuation (``string.punctuation``), and remove
    every word that contains a digit. Whitespace is left as-is, so removed
    tokens may leave consecutive spaces behind.

    Parameters
    ----------
    text : str
        Raw complaint text.

    Returns
    -------
    str
        The cleaned text.
    """
    #Make the text lowercase
    text = text.lower()

    #Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    #Remove punctuation
    # The `%` formatting must happen outside the string literal; with it
    # inside the quotes the pattern was a literal that never matched, so
    # no punctuation was removed at all.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    #Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [None]:
def lemma_texts(text):
    """Return `text` with every token replaced by its lemma.

    The text is run through the module-level `nlp` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
# Store a cleaned copy of every complaint (see clean_texts) in a new
# 'complaints' column; the raw text column is left untouched.
df['complaints'] = df['complaints_what_happened'].swifter.apply(clean_texts)

In [None]:
# Add a column for lemmatized complaints to the dataframe.
# Lemmatize the *cleaned* text from the 'complaints' column. Previously the
# raw 'complaints_what_happened' text was passed, so the output of
# clean_texts was never used anywhere in the pipeline.
# NOTE(review): this changes the text that reaches the topic model, so
# re-inspect the topic -> name mapping after re-running the notebook.
df["lemmatized_complaint"] =  df.swifter.apply(lambda x: lemma_texts(x['complaints']), axis=1)

In [None]:
# Work on an explicit copy: columns are added to df_clean in later cells,
# and doing that on a selection of df triggers pandas'
# SettingWithCopyWarning (hidden here because warnings are suppressed).
df_clean=df[['complaints_what_happened','lemmatized_complaint']].copy()

####  Function to extract the POS tags

In [None]:
def get_pos_tags(text):
    """Return the lemmas of all tokens in `text` whose tag is ``NN``.

    The text is parsed with the module-level `nlp` pipeline; tokens with
    any other tag are dropped and the surviving lemmas are joined with
    single spaces.
    """
    nn_lemmas = [token.lemma_ for token in nlp(text) if token.tag_ == 'NN']
    return " ".join(nn_lemmas)

# Build the noun-only text column: get_pos_tags is applied row by row to the
# lemmatized complaint, so only lemmas of tokens tagged "NN" are kept.
df_clean["complaint_POS_removed"] =  df_clean.swifter.apply(lambda x: get_pos_tags(x['lemmatized_complaint']), axis=1)

In [None]:
# Preview the raw, lemmatized and noun-only text side by side.
df_clean.head()

### Exploratory data analysis

### Visualise the data according to the 'Complaint' character length

In [None]:
# Histogram of complaint lengths, measured in characters of the noun-only text.
doc_lens = list(map(len, df_clean.complaint_POS_removed))
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(doc_lens, bins=50)
hist_ax.set_ylabel('Number of Complaint')
hist_ax.set_xlabel('Complaint character length')
sns.despine();

### Word cloud of the most frequent words

In [None]:
stop_words = set(STOPWORDS)

# Feed the word cloud the complaint texts themselves. str(Series) yields the
# Series *repr* instead: index labels, the "Name: ..., dtype: object" footer
# and, whenever pandas display limits are in effect, only a truncated
# sample of the rows.
corpus_text = " ".join(df_clean['complaint_POS_removed'].astype(str))

word_cloud = WordCloud(
                          background_color='white',
                          stopwords=stop_words,
                          max_font_size=40,
                          max_words=50,
                          random_state=100
                         ).generate(corpus_text)

fig = plt.figure(figsize=(20,20))
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

### Top unigrams,bigrams and trigrams by frequency among all the complaints

### Unigram

In [None]:
def get_top_unigram(text, n=30):
    """Return the `n` most frequent unigrams in `text` with their counts.

    Parameters
    ----------
    text : iterable of str
        One document per element.
    n : int, default 30
        Number of (term, count) pairs to return.

    Returns
    -------
    list of (str, integer count)
        Sorted by descending corpus frequency. English stop words are
        excluded by the vectorizer.
    """
    vector = CountVectorizer(stop_words='english')
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass; fit(text) followed by transform(text) tokenized the corpus twice.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    word_freq = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    word_freq.sort(key=lambda pair: pair[1], reverse=True)
    return word_freq[:n]

In [None]:
# Rank unigrams over the noun-only complaints (cast to unicode strings) and
# tabulate them as (unigram, count) rows.
top_common_words = get_top_unigram(df_clean['complaint_POS_removed'].values.astype('U'))
df_unigram = pd.DataFrame(top_common_words, columns = ['unigram' , 'count'])
# Show the 15 most frequent entries.
df_unigram.head(15)

In [None]:
# Bar chart of the unigram counts collected in df_unigram.
unigram_fig, unigram_ax = plt.subplots(figsize=(20, 8))
sns.barplot(x='unigram', y='count', data=df_unigram, palette="Reds_d", ax=unigram_ax)
plt.setp(unigram_ax.get_xticklabels(), rotation=90)
unigram_ax.set_title("Top 30 unigrams in the Complaint text", fontsize=20)
plt.show()

### Bigram

In [None]:
def get_top_bigram(text, n=30):
    """Return the `n` most frequent bigrams in `text` with their counts.

    Parameters
    ----------
    text : iterable of str
        One document per element.
    n : int, default 30
        Number of (term, count) pairs to return.

    Returns
    -------
    list of (str, integer count)
        Sorted by descending corpus frequency. English stop words are
        removed by the vectorizer before the bigrams are formed.
    """
    vector = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass; fit(text) followed by transform(text) tokenized the corpus twice.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    word_freq = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    word_freq.sort(key=lambda pair: pair[1], reverse=True)
    return word_freq[:n]

In [None]:
# Rank bigrams over the noun-only complaints (cast to unicode strings) and
# tabulate them as (Bigram, count) rows.
top_common_words = get_top_bigram(df_clean['complaint_POS_removed'].values.astype('U'))
df_bigram = pd.DataFrame(top_common_words, columns = ['Bigram' , 'count'])
# Show the 10 most frequent entries.
df_bigram.head(10)

In [None]:
# Bar chart of the bigram counts collected in df_bigram.
bigram_fig, bigram_ax = plt.subplots(figsize=(20, 8))
sns.barplot(x='Bigram', y='count', data=df_bigram, palette="Blues_d", ax=bigram_ax)
plt.setp(bigram_ax.get_xticklabels(), rotation=90)
bigram_ax.set_title("Top 30 bigrams in the Complaint text", fontsize=20)
plt.show()

In [None]:
def get_top_trigram(text, n=30):
    """Return the `n` most frequent trigrams in `text` with their counts.

    Parameters
    ----------
    text : iterable of str
        One document per element.
    n : int, default 30
        Number of (term, count) pairs to return.

    Returns
    -------
    list of (str, integer count)
        Sorted by descending corpus frequency. English stop words are
        removed by the vectorizer before the trigrams are formed.
    """
    vector = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass; fit(text) followed by transform(text) tokenized the corpus twice.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    word_freq = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    word_freq.sort(key=lambda pair: pair[1], reverse=True)
    return word_freq[:n]

In [None]:
# Rank trigrams over the noun-only complaints (cast to unicode strings) and
# tabulate them as (trigram, count) rows.
top_common_words = get_top_trigram(df_clean['complaint_POS_removed'].values.astype('U'))
df_trigram = pd.DataFrame(top_common_words, columns = ['trigram' , 'count'])
# Show the 10 most frequent entries.
df_trigram.head(10)

In [None]:
# Bar chart of the trigram counts collected in df_trigram.
trigram_fig, trigram_ax = plt.subplots(figsize=(15, 6))
sns.barplot(x='trigram', y='count', data=df_trigram, palette="Greens_d", ax=trigram_ax)
plt.setp(trigram_ax.get_xticklabels(), rotation=90)
trigram_ax.set_title("Top 30 trigrams in the Complaint text", fontsize=20)
plt.show()

#### The customers' personal details have been masked in the dataset with xxxx. Let's remove the masked text, as it won't help us in the analysis

In [None]:
# Remove the privacy mask in both its lower-case and upper-case spelling.
df_clean['complaint_POS_removed'] = (
    df_clean['complaint_POS_removed']
    .str.replace('xxxx', '')
    .str.replace('XXXX', '')
)

In [None]:
# df_clean.to_excel("df_clean.xlsx")
# df_clean=pd.read_excel("df_clean.xlsx")

### Feature Extraction

##### Convert the raw texts to a matrix of TF-IDF features

- max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words" max_df = 0.95 means "ignore terms that appear in more than 95% of the complaints"

- min_df is used for removing terms that appear too infrequently min_df = 2 means "ignore terms that appear in less than 2 complaints"

In [None]:
# TF-IDF vectorizer: skip English stop words, terms found in more than 95%
# of the complaints (max_df) and terms found in fewer than 2 (min_df).
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

### Create a document term matrix using fit_transform

In [None]:
# Learn the vocabulary from the noun-only complaints and build the sparse
# document-term matrix in one step.
dtm = tfidf.fit_transform(df_clean['complaint_POS_removed'])

### Topic Modelling using NMF
- Non-Negative Matrix Factorization (NMF) is an unsupervised technique, so the model is not trained on any labelled topics. NMF decomposes (or factorizes) high-dimensional vectors into a lower-dimensional representation. These lower-dimensional vectors are non-negative, which also means their coefficients are non-negative.

#### Load the nmf_model with the n_components = 5

In [None]:
# Number of topics to extract; reused below when tabulating the topic words.
num_topics = 5

# Fixed random_state so the factorization is repeatable between runs.
nmf_model = NMF(random_state=40, n_components=num_topics)

In [None]:
# Factorize the document-term matrix into num_topics topics.
nmf_model.fit(dtm)

# Vocabulary size. Read from the fitted vocabulary_ mapping, which works on
# every scikit-learn release; get_feature_names() was removed in 1.2.
len(tfidf.vocabulary_)

### Print the Top 20 words for each of the topics

In [None]:
# Vocabulary terms in column order of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    words = np.asarray(tfidf.get_feature_names_out())
else:
    words = np.asarray(tfidf.get_feature_names())

n_top_words = 20

# For each topic, pick the n_top_words terms with the largest component
# weight (argsort is ascending, hence the reversal).
top_words_per_topic = [
    words[nmf_model.components_[i].argsort()[::-1][:n_top_words]]
    for i in range(num_topics)
]

# One row per topic, one column per rank; built directly from the selected
# words instead of overwriting a placeholder frame of "0.0" strings.
topic_words_df = pd.DataFrame(
    top_words_per_topic,
    index=[f'Topic {i + 1}' for i in range(num_topics)],
    columns=[f'Word {i + 1}' for i in range(n_top_words)],
)

topic_words_df

### Create the best topic for each complaint in terms of integer value

In [None]:
# Topic weights for every complaint, as returned by the fitted NMF model.
topic_results = nmf_model.transform(dtm)
# Position of the largest weight along axis 1, i.e. the dominant topic.
topic_results.argmax(axis=1)

### Create a new column Topic and assign the best topic to each of the complaint

In [None]:
# Store the dominant topic index (0 .. num_topics - 1) for each complaint.
df_clean['Topic'] = topic_results.argmax(axis=1)

### Print the first 5 complaints for each topic

In [None]:
# First five complaints of each topic, then ordered by topic for reading.
First5_comps=df_clean.groupby('Topic').head(5)
First5_comps.sort_values('Topic')

### After evaluating the mapping, if the topics assigned are correct then assign these names to the relevant topic:
- Bank Account services
- Credit card or prepaid card
- Theft/Dispute Reporting
- Mortgage/Loan
- Others

In [None]:
# Map each NMF topic index to a human-readable name.
# NOTE(review): the index -> name pairing presumably comes from inspecting the
# topic words by hand; re-check it whenever the preprocessing or model changes.
Topic_names = {0:'Bank account services', 1:'Others', 2:'Mortgage/Loan', 3:'Credit card or prepaid card', 4:'Theft/Dispute Reporting'}

# Replace the topic indices with the topic names
df_clean['Topic'] = df_clean['Topic'].map(Topic_names)

In [None]:
# Preview the complaints together with their named topics.
df_clean.head(10)

## Applying Supervised Machine Learning Algorithms

In [None]:
# Inverse mapping (topic name -> integer id) so the classifiers get numeric
# labels. This rebinds Topic_names, which until now held index -> name.

Topic_names = {'Bank account services':0, 'Others':1, 'Mortgage/Loan':2, 'Credit card or prepaid card':3, 'Theft/Dispute Reporting':4}
# Replace the topic names with their integer ids. Running this cell a second
# time maps the ids through the name-keyed dict and turns them into NaN.
df_clean['Topic'] = df_clean['Topic'].map(Topic_names)

In [None]:
# Supervised data set: raw complaint text as input, topic id as label.
training_data = df_clean[['complaints_what_happened','Topic']]

### Applying the supervised models on the training data created. In this process, we are going to do the following:
- Create the vector counts using Count Vectoriser
- Transform the word vector to tf-idf
- Create the train & test data using the train_test_split on the tf-idf & topics

In [None]:
# Bag-of-words counts over every labelled complaint (raw text column).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data.complaints_what_happened)

# Re-weight the raw counts to TF-IDF.
# NOTE(review): both transformers are fitted on the full data set, before the
# train/test split, so the vocabulary and IDF statistics also see the
# held-out rows. Consider fitting them on the training split only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report

In [None]:
# Split data into train and test: 20% of the rows are held out, with a fixed
# random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, training_data.Topic, test_size=0.2, random_state=42)

### 1) Logistic Regression

In [None]:
# Fit logistic regression with default hyper-parameters and score it on the
# held-out split.
lr = LogisticRegression().fit(X_train, y_train)
predicted_lr = lr.predict(X_test)

print("Classification Report:-")
print(classification_report(y_test, predicted_lr))

# Scale to percent *before* rounding: round(x, 4) * 100 reintroduces binary
# floating-point noise (e.g. 0.57 * 100 == 56.99999999999999).
accuracy_lr=round(accuracy_score(y_test, predicted_lr)*100,2)
print("Accuracy of Logistic Regression:-",accuracy_lr,"%")

### 2) Decision Tree

In [None]:
# Fixed random_state so the reported numbers are repeatable across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

predicted_dt = dt.predict(X_test)

print("Classification Report:-")
print(classification_report(y_test, predicted_dt))

# Scale to percent *before* rounding: round(x, 4) * 100 reintroduces binary
# floating-point noise (e.g. 0.57 * 100 == 56.99999999999999).
accuracy_dt=round(accuracy_score(y_test, predicted_dt)*100,2)
print("Accuracy of Decision Trees-",accuracy_dt,"%")

### 3) Random Forest

In [None]:
# Fixed random_state so the reported numbers are repeatable across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

predicted_rf = rfc.predict(X_test)

print("Classification Report:-")
print(classification_report(y_test, predicted_rf))

# Scale to percent *before* rounding: round(x, 4) * 100 reintroduces binary
# floating-point noise (e.g. 0.57 * 100 == 56.99999999999999).
accuracy_rf=round(accuracy_score(y_test, predicted_rf)*100,2)
print("Accuracy of Random Forest-",accuracy_rf,"%")

### 4) Gaussian Naive Bayes

In [None]:
# GaussianNB needs dense input, hence the toarray() calls.
# NOTE(review): densifying the full TF-IDF matrix can require a lot of
# memory on a large vocabulary; a sparse-friendly Naive Bayes variant
# may be a better fit - confirm before changing the model.
nb = GaussianNB().fit(X_train.toarray(), y_train)
predicted_nb = nb.predict(X_test.toarray())

print("Classification Report:-")
print(classification_report(y_test, predicted_nb))

# Scale to percent *before* rounding: round(x, 4) * 100 reintroduces binary
# floating-point noise (e.g. 0.57 * 100 == 56.99999999999999).
accuracy_nb=round(accuracy_score(y_test, predicted_nb)*100,2)
print("Accuracy of Gaussian Naive Bayes-",accuracy_nb,"%")

### Model Summary

In [None]:
# Collect the four accuracy figures (already expressed in percent) into one
# comparison table.
model_summary_df=pd.DataFrame({'Model':['Logistic Regression','Decision Trees','Random Forest','Gaussian Naive Bayes'],
                               'Accuracy (in %)':[accuracy_lr,accuracy_dt,accuracy_rf,accuracy_nb]})

model_summary_df

##### Thus, by comparing the applied machine learning algorithms, we can conclude that Logistic Regression performs better than the other models

#### Using the Logistic Regression for Inference:-

In [None]:
def predict_topic(text):
    """Predict the topic name of a new complaint.

    The text is vectorized with the already-fitted module-level
    `count_vect` and `tfidf_transformer` and classified with the fitted
    logistic-regression model `lr`.

    Parameters
    ----------
    text : str or iterable of str
        A single complaint, or an iterable of complaints. When several
        documents are passed, only the topic of the first one is returned.

    Returns
    -------
    str
        The name of the predicted topic.
    """
    Topic_names = {0:'Bank account services', 1:'Others', 2:'Mortgage/Loan', 3:'Credit card or prepaid card', 4:'Theft/Dispute Reporting'}
    # CountVectorizer.transform expects an iterable of documents and rejects
    # a bare string, so wrap a single complaint in a list.
    if isinstance(text, str):
        text = [text]
    X_new_counts = count_vect.transform(text)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = lr.predict(X_new_tfidf)
    return Topic_names[predicted[0]]

In [None]:
# Sample complaint texts (one per row) used to try out the trained classifier.
df_complaints = pd.DataFrame({'complaints': ["To whom it may concern, Chase bank charged wrongly overdraft fees, I have alert of low balance or unsuficent fee balance and always deposit immediatly to cover transactions if needed but Chase always changed the order and charged me overdraft fee anyway. when you call they said their per their guidelines they don't refund more then 2 overdrawft doesn't matter bank fault or not.Taken {$34.00} from people is money just because you can is not Ok.See attached documents. When Chase refund, they always find the way to take back what they refunded in first place.",
                                            "Chase is marketing credit cards to those of us with good credit like it's going out of style. Be careful - the marketing is not clear. IF you already have a SWA Chase personal card, do not apply for the new one online. This has sent me through a XXXX triangle, wasting my time and therefore money.It appears in the middle of XX/XX/2018, Chase expanded their undisclosed 5/24 rule to include ALL co-branded cards, not just SWA, yet the marketing machine continues to ignore this policy and the Agents on the phone are not well trained. I am contacting the credit bureau b/c two of the cards opened are not mine - that is the silver lining here. However, the issue with incessant marketing of their branded cards to customers, plus this confusing 5/24 rule and lack of Agent phone training, is false advertising. The information is still relatively opaque, as Chase never comments on the 5/24 rule, but multiple reader and community data points suggest denials due to 5/24 for cards previously exempt. CFPB staff, please help educate consumers and hold the big banks accountable for deceptive trade practices. I do not believe this is intentional on Chase 's part, but the second to last Agent did encourage me to file a complaint here, so be it.",
                                            "What is the procedure to know my CIBIL score?",
                                            "I can not get from chase who services my mortgage, who owns it and who has original loan docs", 
                                  "The bill amount of my credit card was debited twice. Please look into the matter and resolve at the earliest.",
                                  "I want to open a salary account at your downtown branch. Please provide me the procedure.",
                                  "Yesterday, I received a fraudulent email regarding renewal of my services.",
                                            "Where are the bank branches in the city of Mumbai?",
                                            "unwanted service activated and money deducted automatically"]})
df_complaints

### Predictions on new complaints:-

In [None]:
# Classify each sample complaint; predict_topic receives a one-element list
# because the vectorizer works on an iterable of documents.
df_complaints['Predicted Topic'] = df_complaints['complaints'].apply(lambda x: predict_topic([x]))
df_complaints