In [331]:
# import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [332]:
# read the reviews CSV into a DataFrame
csv_path = 'data/data_new.csv'
data = pd.read_csv(csv_path)

# report the (rows, columns) size of the frame
print(data.shape)

# preview the first 5 rows
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/data_new.csv'

In [None]:
# summary of the loaded frame: columns, dtypes and non-null counts
data.info()

In [None]:
# total number of bank and address in the dataset
# (first look, taken before the 'review' placeholder in 'bank' is relabelled,
#  so that placeholder is still counted as a bank here)

numAddress = data['address'].nunique()
numBank = data['bank'].nunique()

# NOTE(review): the country is aligned with the later summary cell, which
# reports United Kingdom for the same data -- confirm against the data source
print(f'The reviews are collected from {numBank} different banks that located around {numAddress} of places in United Kingdom.')
print('These banks are: ', data['bank'].unique())

In [None]:
# some of the data without 'bank' name and is named as 'review' in the dataset
# convert these data to 'Unknown'
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: chained in-place mutation is deprecated in pandas and does
# not update `data` once Copy-on-Write is enabled.

data['bank'] = data['bank'].replace('review', 'Unknown')

In [None]:
# preview the first rows again after the bank-name replacement
data.head()

In [None]:
# list the distinct values left in the 'bank' column
data['bank'].unique()

In [None]:
# drop the columns that are not used in the analysis
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=['author', 'date', 'bank_image'], errors='ignore')

In [None]:
# recount the distinct banks and places on the cleaned frame
numBank = data['bank'].nunique()
numAddress = data['address'].nunique()

print(f'The reviews are collected from {numBank} different banks that located around {numAddress} of places in United Kingdom.')
print('These banks are: ', data['bank'].unique())

In [None]:
# distribution of the column 'address' (10 most frequent places)
address = data['address'].value_counts().sort_values(ascending=False)[:10]

# palette and pie offsets; both names are reused by the plotting cells below
colors = ['red','crimson','orange','darksalmon','yellow','lemonchiffon','limegreen','lightgreen','navy','mediumslateblue']
explodes = ([0.05]*5) + ([0]*5)

# visualization (top 10 places where the reviews are collected from)
figure, axes = plt.subplots(1,2, figsize=(15,5), gridspec_kw={'width_ratios':[1.5,1]})
axes[0].barh(y=address.index, width=address.values, color=colors)
axes[0].set_xlabel('Frequency')
axes[0].grid(alpha=0.4)

# annotate every bar with its count, slightly to the right of the bar end
for index, values in enumerate(address):
    axes[0].text(values+3, index, str(values), va='center')

# The pie, title and show() run once, after the loop (they used to sit inside
# it, redrawing the pie and showing the figure on every iteration).
# explode must have exactly one entry per wedge, so trim it when there are
# fewer than 10 distinct addresses.
axes[1].pie(address.values, labels=address.index, autopct='%.2f%%',
            explode=explodes[:len(address)], colors=colors[:len(address)])
figure.suptitle('Top 10 Places where the Reviews are Collected From', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# distribution of the data by 'bank'
bank = data['bank'].value_counts().sort_values(ascending=False)

# one explode offset per wedge: pull out the five largest slices only.
# Sized from the data because pie() rejects an explode list whose length
# differs from the number of wedges.
bank_explodes = [0.05 if position < 5 else 0 for position in range(len(bank))]

# visualization in bar chart and pie chart
figure, axes = plt.subplots(1,2, figsize=(15,5), gridspec_kw={'width_ratios':[1.5,1]})
axes[0].barh(y=bank.index, width=bank.values, color=colors)
axes[0].set_xlabel('Frequency')
axes[0].grid(alpha=0.4)

# annotate every bar with its count
for index, values in enumerate(bank):
    axes[0].text(values+3, index, str(values), va='center')

# drawn once, after the annotation loop
axes[1].pie(bank.values, labels=bank.index, autopct='%.2f%%', explode=bank_explodes, colors=colors)
figure.suptitle('Number of Reviews Collected from Each Bank', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# distribution of rating (by rating score)
# convert the rating to str so the scores are plotted as categories
data_copy = data.copy()
data_copy['rating'] = data_copy['rating'].astype(str)

rating = data_copy['rating'].value_counts().sort_index(ascending=False)

# one explode offset per wedge (first five slices pulled out); sized from the
# data because pie() requires len(explode) == number of wedges
rating_explodes = [0.05 if position < 5 else 0 for position in range(len(rating))]

# visualization in bar chart and pie chart
figure, axes = plt.subplots(1,2, figsize=(15,5), gridspec_kw={'width_ratios':[1.5,1]})
axes[0].barh(y=rating.index, width=rating.values, color=colors)
axes[0].set_xlabel('Frequency')
axes[0].grid(alpha=0.4)

# annotate every bar with its count
for index, values in enumerate(rating):
    axes[0].text(values+3, index, str(values), va='center')

# drawn once, after the annotation loop
axes[1].pie(rating.values, labels=rating.index, autopct='%.2f%%', explode=rating_explodes, colors=colors)
figure.suptitle('Overall Distribution of Rating (by rating score) in 1000 Reviews', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# distribution of rating (by rating title)
rating_order = ['Unacceptable','Really Bad','Bad','Expected more','Just OK','Satisfactory','Pretty good','Great!','Excellent!','Blown Away!']
# fill_value=0: a title from rating_order that never occurs gets a count of 0
# instead of NaN, which would otherwise leak into the bar labels and the pie
rating_title = data['rating_title_by_user'].value_counts(sort=False).reindex(rating_order, fill_value=0)
# reversed so the first entry of rating_order ends up at the top of the barh chart
reversed_rating_title = rating_title[::-1]

# one explode offset per wedge (first five slices pulled out)
title_explodes = [0.05 if position < 5 else 0 for position in range(len(reversed_rating_title))]

# visualization in bar chart and pie chart
figure, axes = plt.subplots(1,2, figsize=(15,5), gridspec_kw={'width_ratios':[1.5,1]})
axes[0].barh(y=reversed_rating_title.index, width=reversed_rating_title.values, color=colors)
axes[0].set_xlabel('Frequency')
axes[0].grid(alpha=0.4)

# annotate every bar with its count
for index, values in enumerate(reversed_rating_title):
    axes[0].text(values+3, index, str(values), va='center')

# drawn once, after the annotation loop
axes[1].pie(reversed_rating_title.values, labels=reversed_rating_title.index, autopct='%.2f%%', explode=title_explodes, colors=colors)
figure.suptitle('Overall Distribution of Rating (by rating title) in 1000 Reviews', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud

# join every non-missing review title into one text;
# str.join raises TypeError on NaN, so missing titles are dropped first
combined_title = ' '.join(data['review_title_by_user'].dropna().astype(str))


# create a word cloud using the combined text
# (collocations=False: count single words only, no two-word phrases)
wordcloud_title = WordCloud(collocations=False, width = 800, height = 800,
                            background_color ='black',
                            min_font_size = 10).generate(combined_title)

# plot the WordCloud image
plt.figure(figsize=(6,6))
plt.imshow(wordcloud_title)
plt.axis('off')
plt.title('User Review Title', fontsize=20)
plt.tight_layout(pad=2)
plt.show()

In [None]:
# subset the data that rating score is 2.5 and lower
# (no full data.copy() beforehand: it was overwritten by this subset anyway)
data_copy = data[data['rating'] <= 2.5]

# join the non-missing titles; str.join raises TypeError on NaN
combined_title_lowrate = ' '.join(data_copy['review_title_by_user'].dropna().astype(str))


# create a word cloud using the combined text
wordcloud_lowrate = WordCloud(width = 800, height = 800,
                              background_color ='black',
                              min_font_size = 10).generate(combined_title_lowrate)

# plot the WordCloud image
plt.figure(figsize=(6,6))
plt.imshow(wordcloud_lowrate)
plt.axis('off')
# title matches the filter above, which includes ratings equal to 2.5
plt.title('User Review Title (Rating Score 2.5 or Lower)', fontsize=15)
plt.tight_layout(pad=2)
plt.show()

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# fetch the VADER lexicon before the analyzer is created
nltk.download('vader_lexicon')

# analyzer instance used by the scoring cells
sia = SentimentIntensityAnalyzer()

In [None]:
# calculate the polarity scores of the review
# missing reviews are scored as empty text so the analyzer always gets a str
# (it fails on NaN, which pandas stores as a float)
data['polarity'] = data['review'].fillna('').astype(str).apply(sia.polarity_scores)
# keep the 'compound' entry of each score dict as its own column
data['compound'] = data['polarity'].apply(lambda scores: scores.get('compound'))

In [None]:
# define a function to perform the sentiment categorization
def categorize(score):
    """Map a compound sentiment score to a category label.

    Returns 'To be Improved' for scores below 0, 'Blown Away' for scores
    above 0.8, and 'Neutral' for everything else (0 and 0.8 included).
    """
    if score < 0:
        return 'To be Improved'
    if score > 0.8:
        return 'Blown Away'
    return 'Neutral'

In [None]:
# label every review by passing its compound score through categorize
compound_scores = data['compound']
data['Sentiment_Category'] = compound_scores.apply(categorize)

# preview the frame with the new column
data.head()

In [None]:
# non-null count of every column within each sentiment category
df = data.groupby(by='Sentiment_Category').count()

In [None]:
# show the per-category counts ordered by the 'compound' count
df.head().sort_values(by=['compound'])

In [None]:
# keep only the reviews categorized as 'To be Improved'
# (no full data.copy() beforehand: it was overwritten by this subset anyway)
data_copy = data[data['Sentiment_Category']=='To be Improved']

# join the non-missing review texts; str.join raises TypeError on NaN
combined_tobeImproved = ' '.join(data_copy['review'].dropna().astype(str))


# create a word cloud using the combined text
wordcloud_tobeImproved = WordCloud(width = 800, height = 800,
                                   background_color ='black',
                                   min_font_size = 10).generate(combined_tobeImproved)

# plot the WordCloud image
plt.figure(figsize=(8,8))
plt.imshow(wordcloud_tobeImproved)
plt.axis('off')
plt.title('User Review Title (Category: To be Improved)', fontsize=15)
plt.tight_layout(pad=2)
plt.show()

In [None]:
import re

# words to strip from the combined review text before plotting
common_words = ['bank', 'account','salary','saving','savings','pounds']

# whole-word, case-insensitive alternation of the escaped words
alternation = '|'.join(re.escape(word) for word in common_words)
pattern = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)

# remove every match from the 'To be Improved' text
filtered_string = pattern.sub('', combined_tobeImproved)

In [None]:
# build the word cloud from the text with the common words removed
cloud_builder = WordCloud(width=800, height=800, background_color='black', min_font_size=10)
wordcloud_tobeImproved = cloud_builder.generate(filtered_string)

# render the cloud on an explicit figure/axes pair
cloud_fig, cloud_ax = plt.subplots(figsize=(8,8))
cloud_ax.imshow(wordcloud_tobeImproved)
cloud_ax.axis('off')
cloud_ax.set_title('User Review Title (Category: Common Words)', fontsize=15)
cloud_fig.tight_layout(pad=2)
plt.show()