In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem import WordNetLemmatizer                                                         # for lemmatization

import re

In [2]:
data = pd.read_excel('BankReviews.xlsx')

In [3]:
data

Unnamed: 0,Date,Stars,Reviews,BankName
0,2017-04-10,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,2017-02-10,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,2017-08-21,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,2017-12-17,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,2016-05-27,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage
5,2016-12-20,5,I had the pleasure of working with Wyndham Cap...,Wyndham Capital Mortgage
6,2017-12-17,5,My experience with Mattison was beyond greatly...,Wyndham Capital Mortgage
7,2016-08-16,5,Patrick answered all my questions by email imm...,Wyndham Capital Mortgage
8,2017-09-04,5,I loved working with this group of people! The...,Wyndham Capital Mortgage
9,2016-03-22,5,Great web interface for both the loan applicat...,Wyndham Capital Mortgage


## 1.Data processing

In [4]:
y=data['Stars']
x=data['Reviews']

In [5]:
# 70-30 data split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size = 0.3, random_state=42)

In [7]:
type(x_train)

pandas.core.series.Series

In [26]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(353,)
(152,)
(353,)
(152,)


In [316]:
'''
Data preparation :
    -> Tokenization = uni-grams
    -> Removing English stopwords,digits and special characters.
    -> Density=95% and Sparsity=1% (1%<Word frequency<95%)
    -> Lower-case
    -> Used lemmatization (done separately below)
'''

vect = CountVectorizer(max_df=.95,min_df=0.01,stop_words='english',token_pattern='(?u)\\b[a-zA-Z]{2,}\\w\\w+\\b')
#vect = CountVectorizer(tokenizer=split_into_lemmas(), max_df=.95,min_df=0.01,stop_words='english')

In [312]:
'''      <chars meaning><Identifiers>
#df['TEXT'] = df['TEXT'].str.replace('\d+', '') # for digits
#df['TEXT'] = df['TEXT'].str.replace(r'(\b\w{1,2}\b)', '') # for words
#df['TEXT'] = df['TEXT'].str.replace('[^\w\s]', '') # for punctuation 
'''

'''      <Traing on diff vocab>
x_test_dtm = vect.fit_transform(x_test)
x_test_dtm

o/p:        <152x680 sparse matrix of type '<class 'numpy.int64'>'
            with 3394 stored elements in Compressed Sparse Row format>
'''

<152x680 sparse matrix of type '<class 'numpy.int64'>'
	with 3394 stored elements in Compressed Sparse Row format>

In [317]:
# learn training data vocabulary
vect.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b[a-zA-Z]{2,}\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [318]:
#then create document-term matrix
x_train_dtm = vect.transform(x_train)

In [319]:
x_train_dtm

<353x591 sparse matrix of type '<class 'numpy.int64'>'
	with 7228 stored elements in Compressed Sparse Row format>

In [71]:
# Transform to an dense matrix
x_train_dtm.toarray()

array([[0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# Making test data dtm
x_test_dtm = vect.transform(x_test)
x_test_dtm

In [60]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [218]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mac\AppData\Roaming\nltk_data...


True

In [72]:
x_train_tokens = vect.get_feature_names()

In [73]:
print(x_train_tokens[:50])

['aaron', 'able', 'absolutely', 'accept', 'accepted', 'account', 'accurate', 'actual', 'adam', 'adan', 'additional', 'address', 'agent', 'alex', 'amazing', 'american', 'annoyed', 'answer', 'answered', 'answering', 'answers', 'antebellum', 'anthony', 'application', 'apply', 'appraisal', 'appraise', 'appraiser', 'appreciate', 'appreciated', 'approval', 'approved', 'arose', 'asked', 'asking', 'asset', 'assurance', 'attention', 'attentive', 'attitude', 'available', 'aware', 'away', 'awesome', 'bank', 'banker', 'banks', 'barrett', 'based', 'basis']


In [61]:
lemmatizer = WordNetLemmatizer()
# eg
lemmatizer.lemmatize("played",'v')

'play'

In [79]:
# applying lemmatization
x_train_tokens_root=[]
for w in x_train_tokens:
    x_train_tokens_root.append(lemmatizer.lemmatize(w,'v'))
    #print(rootWord)

In [80]:
x_train_tokens_root

['aaron',
 'able',
 'absolutely',
 'accept',
 'accept',
 'account',
 'accurate',
 'actual',
 'adam',
 'adan',
 'additional',
 'address',
 'agent',
 'alex',
 'amaze',
 'american',
 'annoy',
 'answer',
 'answer',
 'answer',
 'answer',
 'antebellum',
 'anthony',
 'application',
 'apply',
 'appraisal',
 'appraise',
 'appraiser',
 'appreciate',
 'appreciate',
 'approval',
 'approve',
 'arise',
 'ask',
 'ask',
 'asset',
 'assurance',
 'attention',
 'attentive',
 'attitude',
 'available',
 'aware',
 'away',
 'awesome',
 'bank',
 'banker',
 'bank',
 'barrett',
 'base',
 'basis',
 'beat',
 'begin',
 'begin',
 'begin',
 'believe',
 'beneficial',
 'best',
 'better',
 'brent',
 'broker',
 'business',
 'buyer',
 'buyers',
 'buy',
 'call',
 'call',
 'call',
 'calm',
 'come',
 'capital',
 'care',
 'care',
 'case',
 'cause',
 'certainly',
 'challenge',
 'change',
 'change',
 'change',
 'check',
 'choose',
 'chris',
 'circumstances',
 'clarification',
 'class',
 'clear',
 'client',
 'clients',
 'close'

In [211]:
def word_count(words):
    counts = dict()
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1

    return counts

## 2.Key positive and negative words

In [188]:
vect = CountVectorizer(max_df=.95,min_df=0.01,stop_words='english',token_pattern='(?u)\\b[a-zA-Z]{2,}\\w\\w+\\b')

In [189]:
x_data_dtm = vect.fit_transform(data['Reviews'])

In [190]:
x_data_dtm

<505x543 sparse matrix of type '<class 'numpy.int64'>'
	with 10011 stored elements in Compressed Sparse Row format>

In [259]:
x_data_tokens = vect.get_feature_names()

In [281]:
#first presence of the word and where
#vocab_data = vect.vocabulary_
#type(vocab_data)

In [216]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [288]:
pos_word_list=[]
neu_word_list=[]
neg_word_list=[]

for word in x_data_tokens:
    if (sid.polarity_scores(word)['compound']) >= 0.1:
        pos_word_list.append(word)
    elif (sid.polarity_scores(word)['compound']) <= -0.1:
        neg_word_list.append(word)
    else:
        neu_word_list.append(word)  

In [289]:
pos_count = []
for word in pos_word_list:
    count=0
    for text in data['Reviews']:
        if word in text:
            count=count+1
    pos_count.append(count)       

In [290]:
print(len(pos_count))
print(len(pos_word_list))

84
84


In [297]:
a=pd.Series(pos_count,name='count')
b=pd.Series(pos_word_list,name='Positive Words')
c = pd.concat([b,a],axis=1)

In [298]:
# Positive words
c.sort_values(by='count',ascending=False).head(10)

Unnamed: 0,Positive Words,count
64,recommend,168
42,help,120
73,sure,111
39,great,92
11,best,75
26,easy,59
35,friend,58
66,responsive,56
43,helpful,54
51,like,47


In [353]:
neg_count = []
for word in neg_word_list:
    count=0
    for text in data['Reviews']:
        if word in text:
            count=count+1
    neg_count.append(count) 

In [354]:
print(len(neg_count))
print(len(neg_word_list))

18
18


In [355]:
d=pd.Series(neg_count,name='count')
e=pd.Series(neg_word_list,name='Negative Word')
f = pd.concat([e,d],axis=1)

In [356]:
# Negative words
f.sort_values(by='count',ascending=False).head(10)

Unnamed: 0,Negative Word,count
6,hard,42
12,problem,31
14,stressful,24
8,lower,23
13,problems,16
10,mistake,16
5,failed,10
2,difficult,9
11,poor,8
16,unprofessional,7


In [338]:
x_data_count = np.sum(x_data_dtm.toarray(),axis=0)
len(x_data_count)

543

In [342]:
x_data_count_df = pd.DataFrame({'token':x_data_tokens, 'count':x_data_count})
x_data_count_df

Unnamed: 0,token,count
0,aaron,23
1,able,47
2,absolutely,13
3,accept,6
4,account,15
5,accurate,10
6,actual,8
7,adam,39
8,adan,14
9,additional,16


## 3. Classification of reviews

In [94]:
#import sys
#!{sys.executable} -m pip install textblob

Collecting textblob
  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [95]:
from textblob import TextBlob

In [104]:
def get_sentiment(text): 
    analysis = TextBlob(text) 
    if analysis.sentiment.polarity > 0: 
        return 'positive'
    elif analysis.sentiment.polarity == 0: 
        return 'neutral'
    else: 
        return 'negative'

In [106]:
reviews_sentiment = []
for reviews in data['Reviews']:
    reviews_sentiment.append(get_sentiment(reviews))

In [107]:
reviews_sentiment

['positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',

In [112]:
type(reviews_sentiment)

list

In [114]:
review_text = pd.Series(data['Reviews'],name="Review")
review_sentiment = pd.Series(reviews_sentiment,name="Sentiment")
Table = pd.concat([review_text,review_sentiment],axis=1)

In [115]:
Table

Unnamed: 0,Review,Sentiment
0,"Great job, Wyndham Capital! Each person was pr...",positive
1,Matthew Richardson is professional and helpful...,positive
2,We had a past experience with Wyndham Mortgage...,negative
3,We have been dealing with Brad Thomka from the...,positive
4,I can't express how grateful I am for the supp...,positive
5,I had the pleasure of working with Wyndham Cap...,positive
6,My experience with Mattison was beyond greatly...,positive
7,Patrick answered all my questions by email imm...,positive
8,I loved working with this group of people! The...,positive
9,Great web interface for both the loan applicat...,positive


In [118]:
Table['Sentiment'].value_counts()

positive    466
negative     35
neutral       4
Name: Sentiment, dtype: int64

## 4. Identify keys themes of issues

In [200]:
# Creating Tf-idf matix
vect_tfidf = TfidfVectorizer(max_df=.95,min_df=0.01,stop_words='english',token_pattern='(?u)\\b[a-zA-Z]\\w\\w+\\b')
data_tfidf = vect_tfidf.fit_transform(data['Reviews'])
data_tfidf
print(data_tfidf.toarray())

[[0.         0.         0.         ... 0.39978659 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.6888569  0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.1158803  0.        ]]


In [201]:
from sklearn.decomposition import LatentDirichletAllocation

In [202]:
lda_model = LatentDirichletAllocation(n_components=10,learning_method='online',max_iter=20)

In [203]:
X_topics = lda_model.fit_transform(data_tfidf)

In [204]:
topic_words = lda_model.components_

In [205]:
vocab = vect_tfidf.get_feature_names()

In [206]:
n_top_words = 8
topic_summaries = []
for i,topic_dist in enumerate(topic_words):
    topic_word=np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_word))

In [207]:
topic_summaries

['response bank called credit just nasb letter real',
 'kory completed check closing little decision took bob',
 'process great loan home recommend work time team',
 'unprofessional previous various number list contacted encountered responses',
 'kory complicated tough properties calm responsiveness dealing wouldn',
 'jon barrett beginning timely knowledgeable oriented able professional',
 'best awesome homes nasb purchased got far experience',
 'responsive poor informative lock good expired surprises attentive',
 'bad product don tree companies offer lending months',
 'surprises jeremy joey mortgage local trusted nasb process']

### Topics(important):
    -> Great process,great online facilites,timely response
    -> Unprofessional behaviour of employees
    -> Good loan services and closing rate
    -> Super fast process
    -> Higher rates or bank payments
    -> About Kory Anthony
    -> Good professionalism
    -> Trusted bank and positive attitude of workers
    -> Timely services
    -> Hard and patient employes


## 5.Predict ratings

In [301]:
data['Stars'].value_counts()

5    410
1     95
Name: Stars, dtype: int64

In [303]:
def get_rating(text): 
    analysis = TextBlob(text) 
    sent=analysis.sentiment.polarity
    if sent > 0: 
        if sent > 0.2:
            return 5
        else:        
            return 4
    elif sent < 0: 
        if sent < -0.2:
            return 1
        else:
            return 2
    else: 
        return 3

In [304]:
pred_rating = []
for rate in x_train:
    pred_rating.append(get_rating(rate))

In [306]:
pred_rating_test = []
for rate in x_test:
    pred_rating_test.append(get_rating(rate))

In [307]:
# test sample
print(metrics.accuracy_score(y_test, pred_rating_test))

0.5855263157894737


In [308]:
# train sample
print(metrics.accuracy_score(y_train, pred_rating))

0.6288951841359773


### Applying Naive Bayes:

In [309]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [310]:
nb.fit(x_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [321]:
y_pred_class = nb.predict(x_test_dtm)

In [322]:
y_pred_class

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 1, 5, 5, 1, 5, 5,
       5, 1, 1, 5, 5, 5, 5, 1, 5, 5, 5, 1, 1, 1, 5, 5, 5, 5, 1, 5, 5, 5,
       1, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 5,
       5, 5, 1, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 1, 5, 5, 5, 1, 5, 5,
       1, 5, 5, 5, 5, 1, 5, 5, 5, 1, 1, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5,
       5, 5, 5, 5, 1, 5, 5, 5, 5, 1, 5, 5, 1, 5, 1, 5, 5, 5, 1, 5, 5, 5,
       5, 5, 1, 5, 1, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 1, 5, 5],
      dtype=int64)

In [324]:
print(metrics.confusion_matrix(y_test, y_pred_class))

[[ 30   4]
 [  7 111]]


In [329]:
y_pred_prob = nb.predict_proba(x_test_dtm)[:, 1]

In [331]:
len(y_pred_prob)

152

In [332]:
# Accuracy
print(metrics.accuracy_score(y_test, y_pred_class))
# calculate AUC
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9276315789473685
0.969840478564307


### Applying Logistic Regression:

In [333]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)

In [334]:
logreg.fit(x_train_dtm, y_train)



LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [335]:
y_pred_class = logreg.predict(x_test_dtm)
y_pred_prob = logreg.predict_proba(x_test_dtm)[:, 1]

In [336]:
# calculating accuracy and AUC
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.8947368421052632
0.8925722831505485
