# A Machine Learning journey from customer reviews to business insights
# *Part 4: Word clouds*

*Author: Federica Lionetto*  
*Email: federica.lionetto@gmail.com*  
*Date: 17 November 2020*  
*License: Creative Commons BY-NC-SA*

*Based on the dataset available at:*
- https://www.kaggle.com/efehandanisman/skytrax-airline-reviews

### Further reading

- "Generating WordClouds in Python", https://www.datacamp.com/community/tutorials/wordcloud-python

In [None]:
# Needed for Colab: mount Google Drive under '/content/drive'.
from google.colab import drive
# The next cell changes into a folder below this mount point.
drive.mount('/content/drive')

In [None]:
# Needed for Colab.
# `os` is imported in this cell because it runs before the main import cell
# further down; without it the calls below raise a NameError.
import os

# Move into the mounted Drive folder, then into the workshop repository.
os.chdir('/content/drive/MyDrive/')
# Uncomment on the first run to fetch the repository:
# !git clone https://github.com/FedericaLionetto/UZHMLWorkshop2020-NLP
os.chdir('UZHMLWorkshop2020-NLP/')

## 0 - Configuration

In [None]:
# Paths of the input files, relative to the working directory.
# Input data.
df_filename = '../Results/NLPFinalDataLight.csv'
# Type of each field in the input data.
df_types_filename = '../Results/NLPFinalDataLightTypes.csv'
# Predictions.
df_preds_filename = '../Results/Preds-WithText.csv'

## 1 - Import modules and helper functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette('Set2')

import nltk

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import os
import importlib

In [None]:
# Debugging capabilities.
import pdb

In [None]:
import sys  
# Put the local helper folder first on the module search path so that the
# helper modules imported in the next cell can be found.
sys.path.insert(0, './helper_functions')

In [None]:
# Related to recommendation.
import assign_label_recommended

# Related to word clouds.
import get_wordcloud
import get_df_word_importance

# Related to visualization.
import plot_word_cloud_diff

## 2 - Load the input data

In [None]:
# Build the mapping "field name -> dtype" for the input data from the types file.
df_dtype = pd.read_csv(df_types_filename)
dict_dtype = df_dtype.set_index('index')['dtypes'].to_dict()
# The 'recommended' field is always read as a boolean.
dict_dtype['recommended'] = 'bool'

In [None]:
# Input data, read with the declared dtypes; default NA markers are disabled
# and only '_' is listed as a missing value.
df = pd.read_csv(
    df_filename,
    dtype=dict_dtype,
    keep_default_na=False,
    na_values=['_'],
)
# Discard the 'Unnamed: 0' column.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Predictions, without the 'Unnamed: 0' column.
df_preds = pd.read_csv(df_preds_filename)
df_preds = df_preds.drop(columns=['Unnamed: 0'])

In [None]:
# Put the input data and the predictions side by side (column-wise concatenation).
df_combined = pd.concat([df, df_preds], axis='columns')

In [None]:
# Preview the first rows of the combined data.
df_combined.head()

In [None]:
# (number of rows, number of columns) of the combined data.
df_combined.shape

Get the names of the columns in the dataset.

In [None]:
# Collect the column labels in a plain list and print them below a header line.
cols = list(df_combined.columns)
print('Columns in the dataset:', cols, sep='\n')

Get the total number of customer reviews in the dataset.

In [None]:
# One row per customer review, so the row count is the number of reviews.
n_reviews = len(df_combined)
print('Number of customer reviews in the dataset: {:d}'.format(n_reviews))

### 2.1 - Add the label to the dataset

In [None]:
# Apply the helper to every row (axis=1) and store its result in the 'label' column.
df_combined['label'] = df_combined.apply(assign_label_recommended.assign_label_recommended, axis=1)

In [None]:
# Preview the first rows again, now including the 'label' column.
df_combined.head()

## 3 - Word clouds

### 3.1 - Define subset for word cloud

First scenario: all customer reviews that are predicted to be positive vs. all customer reviews that are predicted to be negative.

In [None]:
# Review subscores.
review_subscores_feats = [
    'seat_comfort',
    'cabin_service',
    'food_bev',
    'ground_service',
    'entertainment',
    'value_for_money',
]
# Columns kept in the subsets: the subscores followed by text, prediction,
# label and traveller type.
sel_feats = [
    *review_subscores_feats,
    'review_text_clean',
    'y_pred',
    'y_prob',
    'label',
    'traveller_type',
]

In [None]:
# Keep only confident predictions: y_prob >= 0.9 counts as predicted positive,
# y_prob <= 0.1 as predicted negative. Rows and columns are selected in one step.
df_pred_rec = df_combined.loc[df_combined['y_prob'] >= 0.9, sel_feats]
df_pred_not_rec = df_combined.loc[df_combined['y_prob'] <= 0.1, sel_feats]

In [None]:
# Size of each subset (one row per review).
print('Number of predicted positive customer reviews: {:d}'.format(len(df_pred_rec)))
print('Number of predicted negative customer reviews: {:d}'.format(len(df_pred_not_rec)))

In [None]:
# Concatenate all cleaned review texts of each subset into one space-separated string.
all_text_pred_rec = " ".join(df_pred_rec['review_text_clean'])
all_text_pred_not_rec = " ".join(df_pred_not_rec['review_text_clean'])

Second scenario: same as above, but for two different types of traveller.

In [None]:
# Number of reviews for each traveller type.
df_combined['traveller_type'].value_counts()

In [None]:
# The two traveller types compared in the second scenario.
group_1, group_2 = 'Business', 'Family Leisure'

In [None]:
# Row masks: confident positive/negative predictions and the two traveller types.
mask_pred_rec = df_combined['y_prob'] >= 0.9
mask_pred_not_rec = df_combined['y_prob'] <= 0.1
mask_group_1 = df_combined['traveller_type'] == group_1
mask_group_2 = df_combined['traveller_type'] == group_2

# One subset per (prediction, traveller type) combination.
df_pred_rec_group_1 = df_combined.loc[mask_pred_rec & mask_group_1, sel_feats]
df_pred_rec_group_2 = df_combined.loc[mask_pred_rec & mask_group_2, sel_feats]
df_pred_not_rec_group_1 = df_combined.loc[mask_pred_not_rec & mask_group_1, sel_feats]
df_pred_not_rec_group_2 = df_combined.loc[mask_pred_not_rec & mask_group_2, sel_feats]

In [None]:
# Size of each (prediction, traveller type) subset.
print('Number of predicted positive customer reviews for group 1: {:d}'.format(len(df_pred_rec_group_1)))
print('Number of predicted negative customer reviews for group 1: {:d}'.format(len(df_pred_not_rec_group_1)))
print('Number of predicted positive customer reviews for group 2: {:d}'.format(len(df_pred_rec_group_2)))
print('Number of predicted negative customer reviews for group 2: {:d}'.format(len(df_pred_not_rec_group_2)))

In [None]:
# Concatenate the cleaned review texts of each subset into one space-separated string.
all_text_pred_rec_group_1 = " ".join(df_pred_rec_group_1['review_text_clean'])
all_text_pred_not_rec_group_1 = " ".join(df_pred_not_rec_group_1['review_text_clean'])
all_text_pred_rec_group_2 = " ".join(df_pred_rec_group_2['review_text_clean'])
all_text_pred_not_rec_group_2 = " ".join(df_pred_not_rec_group_2['review_text_clean'])

### 3.2 - Visualize word cloud

In [None]:
# Download the NLTK 'punkt' resource.
# NOTE(review): presumably required by the helper functions used below - confirm.
nltk.download('punkt')

In [None]:
# Number of top words used in the visualizations below.
top_words = 50

In [None]:
# Read the extended NLTK stopword list; the file is expected to hold the words
# separated by ', '. The encoding is explicit so that the result does not
# depend on the platform default.
with open('../Results/NLTKStopWordsExtended.csv', 'r', encoding='utf-8') as f:
    nltk_stopwords_extended = f.read()
# Strip every entry and drop empty ones: a trailing newline in the file would
# otherwise stay attached to the last stopword and prevent it from matching.
nltk_stopwords_extended = [word.strip() for word in nltk_stopwords_extended.split(', ') if word.strip()]
# print(nltk_stopwords_extended)

In [None]:
# Report how many stopwords were loaded from the extended list.
n_stopwords_extended = len(nltk_stopwords_extended)
print('Number of stopwords after extension: {:d}'.format(n_stopwords_extended))

In [None]:
# These stopwords are removed from the word clouds only, in order to visualize
# the factors that drive a positive/negative travel experience.
additional_stopwords = [
    'one', 'get', 'also', 'however', 'even', 'make', 'good', 'great', 'well', 'bad',
    'would', 'take', 'use', 'ife', 'really', 'overall', 'could', 'much', 'excellent', 'though',
    'like', 'still', 'although', 'best', 'quite', 'trip', 'seem', 'two', 'go', 'ba',
    'swiss', 'never', 'way', 'back', 'zurich', 'paris', 'hour', 'flight', 'boeing', 'airbus',
    'lot', 'old', 'bit', 'fly', 'small', 'year',
]

In [None]:
# Full stopword list for the word clouds: extended NLTK list first, then the additional words.
all_stopwords = [*nltk_stopwords_extended, *additional_stopwords]

In [None]:
# First scenario: one word cloud for the predicted positive reviews and one for
# the predicted negative reviews.
# `max_words` uses `top_words` (defined above) rather than a literal, so that
# the word clouds and the difference plots always use the same number of words.
wordcloud_pred_rec = get_wordcloud.get_wordcloud(
    all_text_pred_rec,
    max_words=top_words,
    stop_words=all_stopwords,
    filename='../Results/04/WordCloud-PredRec.png',
)
wordcloud_pred_not_rec = get_wordcloud.get_wordcloud(
    all_text_pred_not_rec,
    max_words=top_words,
    stop_words=all_stopwords,
    filename='../Results/04/WordCloud-PredNotRec.png',
)
# Word dictionaries of the two word clouds, used for the comparisons below.
dict_pred_rec = wordcloud_pred_rec.words_
dict_pred_not_rec = wordcloud_pred_not_rec.words_

In [None]:
# Second scenario: the same word clouds, separately for the two traveller types.
# `max_words` uses `top_words` (defined above) rather than a literal, so that
# the word clouds and the difference plots always use the same number of words.

# Group 1.
wordcloud_pred_rec_group_1 = get_wordcloud.get_wordcloud(
    all_text_pred_rec_group_1,
    max_words=top_words,
    stop_words=all_stopwords,
    filename='../Results/04/WordCloud-PredRec-Group1.png',
)
wordcloud_pred_not_rec_group_1 = get_wordcloud.get_wordcloud(
    all_text_pred_not_rec_group_1,
    max_words=top_words,
    stop_words=all_stopwords,
    filename='../Results/04/WordCloud-PredNotRec-Group1.png',
)
dict_pred_rec_group_1 = wordcloud_pred_rec_group_1.words_
dict_pred_not_rec_group_1 = wordcloud_pred_not_rec_group_1.words_

# Group 2.
wordcloud_pred_rec_group_2 = get_wordcloud.get_wordcloud(
    all_text_pred_rec_group_2,
    max_words=top_words,
    stop_words=all_stopwords,
    filename='../Results/04/WordCloud-PredRec-Group2.png',
)
wordcloud_pred_not_rec_group_2 = get_wordcloud.get_wordcloud(
    all_text_pred_not_rec_group_2,
    max_words=top_words,
    stop_words=all_stopwords,
    filename='../Results/04/WordCloud-PredNotRec-Group2.png',
)
dict_pred_rec_group_2 = wordcloud_pred_rec_group_2.words_
dict_pred_not_rec_group_2 = wordcloud_pred_not_rec_group_2.words_

In [None]:
# Inspect the word dictionary of the predicted positive reviews.
dict_pred_rec

In [None]:
# Number of words in the dictionary.
len(dict_pred_rec)

In [None]:
# Comparison between predicted positive and predicted negative (all reviews).
df_wordcloud = get_df_word_importance.get_df_word_importance(
    dict_1=dict_pred_rec,
    dict_2=dict_pred_not_rec,
    label_1='Pred Pos',
    label_2='Pred Neg',
)

In [None]:
# Comparison of the predicted positive reviews between the two traveller types
# (group 1 vs. group 2).
df_wordcloud_groups_comp_rec = get_df_word_importance.get_df_word_importance(
    dict_1=dict_pred_rec_group_1,
    dict_2=dict_pred_rec_group_2,
    label_1='Pred Pos from group 1',
    label_2='Pred Pos from group 2',
)

In [None]:
# Comparison of the predicted negative reviews between the two traveller types
# (group 1 vs. group 2).
df_wordcloud_groups_comp_not_rec = get_df_word_importance.get_df_word_importance(
    dict_1=dict_pred_not_rec_group_1,
    dict_2=dict_pred_not_rec_group_2,
    label_1='Pred Neg from group 1',
    label_2='Pred Neg from group 2',
)

In [None]:
# Comparison between predicted positive and predicted negative, for one
# traveller type (group 1).
df_wordcloud_one_group = get_df_word_importance.get_df_word_importance(
    dict_1=dict_pred_rec_group_1,
    dict_2=dict_pred_not_rec_group_1,
    label_1='Pred Pos from group 1',
    label_2='Pred Neg from group 1',
)

In [None]:
# Reload the plotting helper so that changes to its source file are picked up
# without restarting the kernel.
importlib.reload(plot_word_cloud_diff)

In [None]:
# Plot the comparison between predicted positive and predicted negative reviews.
plot_word_cloud_diff.plot_word_cloud_diff(
    df_sorted=df_wordcloud,
    label_1='Pred Pos',
    label_2='Pred Neg',
    n_top_words=top_words,
    filename='../Results/04/WordCloudDiff-' + 'PredPos' + '-' + 'PredNeg' + '.png',
)

In [None]:
# Plot the comparison of the predicted positive reviews between group 1 and group 2.
plot_word_cloud_diff.plot_word_cloud_diff(
    df_sorted=df_wordcloud_groups_comp_rec,
    label_1='Pred Pos from group 1',
    label_2='Pred Pos from group 2',
    n_top_words=top_words,
    filename='../Results/04/WordCloudDiff-' + 'PredPosGroup1' + '-' + 'PredPosGroup2' + '.png',
)

In [None]:
# Plot the comparison of the predicted negative reviews between group 1 and group 2.
plot_word_cloud_diff.plot_word_cloud_diff(
    df_sorted=df_wordcloud_groups_comp_not_rec,
    label_1='Pred Neg from group 1',
    label_2='Pred Neg from group 2',
    n_top_words=top_words,
    filename='../Results/04/WordCloudDiff-' + 'PredNegGroup1' + '-' + 'PredNegGroup2' + '.png',
)

We can cross-check these findings by looking at the review subscores.

In [None]:
# Summary statistics of the 'entertainment' subscore for the predicted positive reviews of group 1.
df_pred_rec_group_1['entertainment'].describe()

In [None]:
# Summary statistics of the 'entertainment' subscore for the predicted positive reviews of group 2.
df_pred_rec_group_2['entertainment'].describe()