In [78]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use the ggplot look for matplotlib figures.
# NOTE(review): the sns.set(...) calls further down re-apply seaborn's own theme,
# which likely overrides this style - confirm which look is intended.
plt.style.use("ggplot")

# Libraries for wordcloud making and image importing
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

# And libraries for data transformation
import datetime
from string import punctuation

#words counter
from collections import Counter
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy
from nltk.tokenize import word_tokenize 

#model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from itertools import compress
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
import shap


In [None]:
# Load the review data set from disk.
reviews_csv = '../out/google_homemini450.csv'
data = pd.read_csv(reviews_csv)

# Column summary first, then a preview of the first rows.
data.info()
data.head()

In [None]:
# Normalise the review text in one pass: coerce each value to str, lower-case it,
# then delete every character listed in string.punctuation.
strip_punctuation = str.maketrans('', '', punctuation)
data['verified_reviews'] = data['verified_reviews'].map(
    lambda text: str(text).lower().translate(strip_punctuation)
)

In [None]:
# Character count of each cleaned review, used by the EDA cells below.
data['review_length'] = data['verified_reviews'].map(len)

In [None]:
# Preview the first rows again to inspect the cleaned text and the new length column.
data.head(n=5)

In [None]:
# Summary statistics of the review length: mean, standard deviation and maximum.
review_lengths = data['review_length']
print('The mean for the length of review:', review_lengths.mean())
print('The standard deviation for the length of reviews:', review_lengths.std())
print('The maximum for the length of reviews:', review_lengths.max())

In [None]:
# Histogram of the review lengths (20 bins), titled via the Axes that hist returns.
length_axes = data['review_length'].hist(bins=20)
length_axes.set_title('Distribution of review length')

In [None]:
# Word cloud of the most common words across all reviews, shaped by the product image.
# Open the mask image in a context manager so the file handle is closed once the
# pixels have been copied into the array.
with Image.open('../in/google_homemini.png') as mask_image:
    A = np.array(mask_image)
np.random.seed(321)
sns.set(rc={'figure.figsize':(8,8)})
reviews = ' '.join(data['verified_reviews'].tolist())

# WordCloud places words with its own random.Random instance rather than numpy's
# global RNG, so the seed must be passed as random_state for a reproducible layout.
wordcloud = WordCloud(mask=A, background_color="white", random_state=321).generate(reviews)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.title('Reviews',size=20)
plt.show()

In [None]:
# Number of reviews per shop, ordered by descending review count.
sns.set(rc={'figure.figsize':(10,6)})
# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer treated as x.
sns.countplot(x='shop', data=data,
              order=data['shop'].value_counts().index)
plt.xticks(rotation=90)
plt.title('Counts of each shops')

In [None]:
import matplotlib.cm as cm

# Count the reviews per rating. The result has to be bound to `rating`,
# because the pie chart below reads its index (labels) and values (slice sizes).
rating = data.rating.value_counts()

# Pie chart of the rating distribution.
values = rating.values
labels = rating.index
# One offset per slice, so the chart still draws when the data does not contain
# exactly five distinct ratings.
explode = [0.1] * len(labels)
colors = cm.rainbow(np.linspace(0, 1, len(labels)))
plt.pie(values,
        colors=colors,
        labels=labels,
        explode=explode,
        autopct='%1.1f%%',
        counterclock=False,
        shadow=True)
plt.title('Ratings Pie Chart')
plt.show()

In [None]:
# Split the reviews by star rating: 5-star only, everything that is not 5-star, and 1-star only.
star_rating = data['rating']
data5 = data[star_rating.eq(5)]
data_not_5 = data[star_rating.ne(5)]
data1 = data[star_rating.eq(1)]

In [None]:
# Word cloud of the most common words in the 1-star reviews (data1).
# Open the mask image in a context manager so the file handle is closed once the
# pixels have been copied into the array.
with Image.open('../in/google_homemini.png') as mask_image:
    A = np.array(mask_image)
np.random.seed(321)
sns.set(rc={'figure.figsize':(8,8)})
reviews = ' '.join(data1['verified_reviews'].tolist())

# WordCloud places words with its own random.Random instance rather than numpy's
# global RNG, so the seed must be passed as random_state for a reproducible layout.
wordcloud = WordCloud(mask=A, background_color="white", random_state=321).generate(reviews)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.title('Reviews',size=20)
plt.show()

In [None]:
# Word cloud of the most common words in the 5-star reviews (data5).
# Open the mask image in a context manager so the file handle is closed once the
# pixels have been copied into the array.
with Image.open('../in/google_homemini.png') as mask_image:
    A = np.array(mask_image)
np.random.seed(321)
sns.set(rc={'figure.figsize':(8,8)})
reviews = ' '.join(data5['verified_reviews'].tolist())

# WordCloud places words with its own random.Random instance rather than numpy's
# global RNG, so the seed must be passed as random_state for a reproducible layout.
wordcloud = WordCloud(mask=A, background_color="white", random_state=321).generate(reviews)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.title('Reviews',size=20)
plt.show()

In [None]:
# Relationship between rating and shop: one box of ratings per shop.
# x and y are passed by keyword because recent seaborn releases accept only
# `data` positionally.
sns.boxplot(x='shop', y='rating', data=data)
plt.xticks(rotation = 90)

In [None]:
# Relationship between rating and review length.
# x and y are passed by keyword because recent seaborn releases accept only
# `data` positionally.
sns.boxplot(x='rating', y='review_length', data=data)


In [None]:
# Log-scaled review length: natural log of the length, plus one.
# NOTE(review): a zero-length review yields log(0) = -inf, and the +1 is applied
# after the log (this is not log(x + 1)) - confirm that is what is intended.
data['log_review_length'] = np.log(data['review_length']) + 1


In [None]:
# Log-scaled review length per rating.
# x and y are passed by keyword because recent seaborn releases accept only
# `data` positionally.
sns.boxplot(x='rating', y='log_review_length', data=data)


In [None]:
# Log-scaled review length per value of the 'variation' column.
# x and y are passed by keyword because recent seaborn releases accept only
# `data` positionally.
sns.boxplot(x='variation', y='log_review_length', data=data)
plt.xticks(rotation = 90)