In [1]:
import os
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
%%time
# Importing the csv 
reviews = pd.read_csv('reviews.csv')

# Displaying the first 5 lines of the csv 
reviews.head(10)

CPU times: user 1min 41s, sys: 6min 11s, total: 7min 53s
Wall time: 50min 39s


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,xQY8N_XvtGbearJ5X4QryQ,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2.0,5.0,0.0,0.0,"As someone who has worked with many museums, I...",2015-04-15 05:21:16
1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1.0,1.0,1.0,0.0,I am actually horrified this place is still in...,2013-12-07 03:16:52
2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5.0,1.0,0.0,0.0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11
3,i6g_oA9Yf9Y31qt0wibXpw,ofKDkJKXSKZXu5xJNGiiBQ,5JxlZaqCnk1MnbgRirs40Q,1.0,0.0,0.0,0.0,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",2011-05-27 05:30:52
4,6TdNDKywdbjoTkizeMce8A,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4.0,0.0,0.0,0.0,"Oh happy day, finally have a Canes near my cas...",2017-01-14 21:56:57
5,L2O_INwlrRuoX05KSjc4eg,5vD2kmE25YBrbayKhykNxQ,nlxHRv1zXGT0c0K51q3jDg,5.0,2.0,0.0,0.0,This is definitely my favorite fast food sub s...,2013-05-07 07:25:25
6,ZayJ1zWyWgY9S_TRLT_y9Q,aq_ZxGHiri48TUXJlpRkCQ,Pthe4qk5xh4n-ef-9bvMSg,5.0,1.0,0.0,0.0,"Really good place with simple decor, amazing f...",2015-11-05 23:11:05
7,lpFIJYpsvDxyph-kPzZ6aA,dsd-KNYKMpx6ma_sRWCSkQ,FNCJpSn0tL9iqoY3JC73qw,5.0,0.0,0.0,0.0,"Awesome office and staff, very professional an...",2017-07-18 18:31:54
8,JA-xnyHytKiOIHl_ztnK9Q,P6apihD4ASf1vpPxHODxAQ,e_BiI4ej1CW1F0EyVLr-FQ,5.0,0.0,0.0,0.0,Most delicious authentic Italian I've had in t...,2015-02-16 06:48:47
9,z4BCgTkfNtCu4XY5Lp97ww,jOERvhmK6_lo_XGUBPws_w,Ws8V970-mQt2X9CwCuT5zw,4.0,3.0,0.0,1.0,I have been here twice. Very nice and laid bac...,2009-10-13 04:16:41


In [4]:
# Boolean mask: True on rows written by this specific user
reviews['user_id'].eq('xvu8G900tezTzbbfqmTKvA')

0          False
1          False
2          False
3          False
4          False
           ...  
8021119    False
8021120    False
8021121    False
8021122    False
8021123    False
Name: user_id, Length: 8021124, dtype: bool

In [3]:
# Summarise the frame: index range, per-column dtypes and memory usage
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8021124 entries, 0 to 8021123
Data columns (total 9 columns):
review_id      object
user_id        object
business_id    object
stars          float64
useful         float64
funny          float64
cool           float64
text           object
date           object
dtypes: float64(4), object(5)
memory usage: 550.8+ MB


**Preprocessing the text**


Before analysing the reviews, we're going to preprocess the text column.


- First, we will clean the text by:
    - Removing punctuation
    - Removing digits
    - Lowercasing the words
    - Removing stopwords
    
- Second, we're going to count the number of words in each review and plot the distribution of those word counts, to get an idea of how much users typically write when contributing a review.
- Third, we're going to build a word cloud to see which words stand out within these reviews.
 

In [None]:
# Clean the review text: strip punctuation and digits, then lowercase.
# Requires the standard-library `string` module (imported in the top cell).

# Build one translation table that deletes every punctuation character and
# every digit, so the large text column is scanned once instead of twice.
# Missing values (NaN) pass through the .str accessor unchanged.
chars_to_strip = string.punctuation + string.digits
strip_table = str.maketrans('', '', chars_to_strip)

# Delete the unwanted characters and lowercase the result in a single chain
reviews['text'] = reviews['text'].str.translate(strip_table).str.lower()

print(f'The following punctuations: {string.punctuation} have been removed from the text column')
print(f'The following {string.digits} have been removed from the text column')
print('The text has been made to lowercase')

# Preview the cleaned text column
reviews.head()

In [None]:
# Make sure every entry in the text column is a string. Missing reviews are
# replaced with '' first; a plain astype(str) would turn them into the literal
# 'nan', which would then be counted as a one-word review.
reviews['text'] = reviews['text'].fillna('').astype(str)

# Number of words per review. Splitting on any run of whitespace (no argument)
# avoids counting the empty tokens that split(' ') yields for repeated spaces
# left behind by punctuation removal, and also splits on newlines and tabs.
reviews['Tips Word Count'] = reviews['text'].str.split().str.len()
reviews.head()

**Splitting up the date/time of review**

In [None]:
# Parse the date column into datetime values so the .dt accessor is available
reviews['date'] = pd.to_datetime(reviews['date'])

# Reuse one datetime accessor for every component derived from the review date
review_dt = reviews['date'].dt

# Calendar year in which the review was written
reviews['Reviews - Year'] = review_dt.year

# Full month name (e.g. 'January')
reviews['Reviews - Month'] = review_dt.month_name()

# Day of the month
reviews['Reviews - Day'] = review_dt.day

# Hour of the day
reviews['Reviews - Hour'] = review_dt.hour

# Full weekday name (e.g. 'Monday')
reviews['Reviews - DayofWeek'] = review_dt.day_name()

# Preview the frame; the five derived date columns should now be present
reviews.head()

In [None]:
# Number of reviews contributed per calendar year.
# Plots from the `reviews` frame built above (the notebook never defines `tips`),
# and passes the column by keyword, which seaborn expects for vector inputs.
plt.figure(figsize = (10,5))
sns.countplot(data = reviews, x = 'Reviews - Year', palette = 'Wistia_r')
plt.xlabel('Yearly contributions for reviews by Yelp Users', fontsize = 15)
plt.ylabel('Suggested Reviews by Yelp Users', fontsize = 15)
plt.title('Reviews contributions over the years by Yelp Users', fontsize = 20)
plt.tight_layout()
plt.show()

In [None]:
# Number of reviews contributed per month of the year.
# Plots from the `reviews` frame built above (the notebook never defines `tips`).

# Fix the bar order to the calendar; otherwise bars follow first-seen order
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

plt.figure(figsize = (10,5))
sns.countplot(data = reviews, x = 'Reviews - Month', order = month_order, palette = 'cividis')
plt.xlabel('Monthly contributions for reviews by Yelp Users', fontsize = 15)
plt.ylabel('Suggested Reviews by Yelp Users', fontsize = 15)
plt.title('Reviews contributions over the months by Yelp Users', fontsize = 20)
plt.tight_layout()
plt.show()

In [None]:
# Number of reviews contributed per day of the month.
# Uses the 'Reviews - Day' column of `reviews` so the data matches the
# daily axis label and title (the notebook never defines `tips`).
plt.figure(figsize = (10,5))
sns.countplot(data = reviews, x = 'Reviews - Day', palette = 'Blues')
plt.xlabel('Daily contributions for reviews by Yelp Users', fontsize = 15)
plt.ylabel('Suggested Reviews by Yelp Users', fontsize = 15)
plt.title('Reviews contributions on a daily basis by Yelp Users', fontsize = 20)
plt.tight_layout()
plt.show()

In [None]:
# Number of reviews contributed per hour of the day.
# The x-axis label now says "Hourly" to match the plotted column and the title.
plt.figure(figsize = (10,5))
sns.countplot(data = reviews, x = 'Reviews - Hour', palette = 'BuPu')
plt.xlabel('Hourly contributions for reviews by Yelp Users', fontsize = 15)
plt.ylabel('Suggested Reviews by Yelp Users', fontsize = 15)
plt.title('Reviews contributions over the hours by Yelp Users', fontsize = 20)
plt.tight_layout()
plt.show()

In [None]:
# Number of reviews contributed per day of the week.
# The x-axis label now names the weekday breakdown to match the plotted column.

# Fix the bar order to Monday..Sunday; otherwise bars follow first-seen order
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']

plt.figure(figsize = (10,5))
sns.countplot(data = reviews, x = 'Reviews - DayofWeek', order = weekday_order, palette = 'Oranges')
plt.xlabel('Day-of-week contributions for reviews by Yelp Users', fontsize = 15)
plt.ylabel('Suggested Reviews by Yelp Users', fontsize = 15)
plt.title('Reviews contributions over the days of week by Yelp Users', fontsize = 20)
plt.tight_layout()
plt.show()