In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### Load data

In [2]:
# Location of the raw reviews CSV, relative to the notebook's working directory.
path_to_data = "../data/raw/mcdonalds_reviews.csv"
# latin-1 maps every byte to a character, so decoding cannot fail.
# NOTE(review): the 'ï¿½' runs visible in the review preview suggest some text
# is mis-decoded under latin-1 - confirm the file's real encoding.
data = pd.read_csv(
    filepath_or_buffer=path_to_data,
    encoding="latin-1",
)

### Data overview

In [3]:
# Preview the first five rows of the raw frame to check column names and value formats.
data.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [4]:
# Show dtypes and non-null counts per column of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   reviewer_id    33396 non-null  int64  
 1   store_name     33396 non-null  object 
 2   category       33396 non-null  object 
 3   store_address  33396 non-null  object 
 4   latitude       32736 non-null  float64
 5   longitude      32736 non-null  float64
 6   rating_count   33396 non-null  object 
 7   review_time    33396 non-null  object 
 8   review         33396 non-null  object 
 9   rating         33396 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 2.5+ MB


In [5]:
# Summary statistics for every column; include="all" adds the non-numeric
# (object) columns, reported as count / unique / top / freq.
data.describe(include="all")

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
count,33396.0,33396,33396,33396,32736.0,32736.0,33396.0,33396,33396,33396
unique,,2,1,40,,,51.0,39,22285,5
top,,McDonald's,Fast food restaurant,"9814 International Dr, Orlando, FL 32819, Unit...",,,2193.0,4 years ago,Excellent,5 stars
freq,,33325,33396,1890,,,1140.0,6740,2148,10274
mean,16698.5,,,,34.442546,-90.647033,,,,
std,9640.739131,,,,5.344116,16.594844,,,,
min,1.0,,,,25.790295,-121.995421,,,,
25%,8349.75,,,,28.65535,-97.792874,,,,
50%,16698.5,,,,33.931261,-81.471414,,,,
75%,25047.25,,,,40.727401,-75.399919,,,,


### Data pre-processing

In [6]:
# Keep only the review text and its star rating.
# .copy() makes `df` an independent frame: the cells below assign into it, and
# writing into a plain slice of `data` raises SettingWithCopyWarning and leaves
# it ambiguous whether the raw `data` frame is modified as well.
df = data[['review', 'rating']].copy()

In [7]:
# Check the two-column working frame before any cleaning.
df.head()

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1 star
1,It'd McDonalds. It is what it is as far as the...,4 stars
2,Made a mobile order got to the speaker and che...,1 star
3,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,"I repeat my order 3 times in the drive thru, a...",1 star


In [8]:
# Extract numeric values from 'rating' column
# Ratings look like "1 star" / "4 stars": the first whitespace-separated token
# is the number. Missing ratings become 0 so the cast to int cannot fail on NaN.
rating_number = df['rating'].str.split().str[0]
# Rebind the column with df['rating'] = ... rather than df.loc[:, 'rating'] = ...:
# since pandas 2.0 the .loc form writes into the existing object-dtype column,
# so the result of astype(int) would be stored as Python objects and the column
# would not actually become an integer column.
df['rating'] = rating_number.fillna(0).astype(int)

In [10]:
# Preview the frame after the rating conversion.
# NOTE(review): this cell carries execution count 10 but sits before the
# review-cleaning cell (count 9), and its saved output already shows cleaned
# review text. Re-run the notebook top to bottom so outputs match cell order.
df.head(5)

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1
1,It'd McDonalds. It is what it is as far as the...,4
2,Made a mobile order got to the speaker and che...,1
3,My mc. Crispy chicken sandwich was customer s...,5
4,"I repeat my order 3 times in the drive thru, a...",1


In [9]:
# Clean 'review' column from corrupted data
# Encoding to ASCII with errors ignored drops every non-ASCII character (the
# mis-decoded 'ï¿½' runs, but also any legitimate accents or emoji); decoding
# turns the remaining bytes back into str.
ascii_bytes = df.loc[:, 'review'].str.encode("ascii", "ignore")
df.loc[:, 'review'] = ascii_bytes.str.decode("utf-8")

In [11]:
# Drop rows whose review carries no text.
# An empty pattern with regex=True is not treated as a regex by pandas, so the
# previous replace('', ...) only matched exactly-empty strings. Reviews reduced
# to spaces/newlines by the non-ASCII stripping would be kept. The anchored
# pattern marks empty *and* whitespace-only strings as missing.
df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.dropna()

In [12]:
# Preview the frame after removing rows with empty reviews.
df.head(5)

Unnamed: 0,review,rating
0,Why does it look like someone spit on my food?...,1
1,It'd McDonalds. It is what it is as far as the...,4
2,Made a mobile order got to the speaker and che...,1
3,My mc. Crispy chicken sandwich was customer s...,5
4,"I repeat my order 3 times in the drive thru, a...",1


In [13]:
# Confirm row count, non-null counts and dtypes of the cleaned frame before saving.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33385 entries, 0 to 33395
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  33385 non-null  object
 1   rating  33385 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 782.5+ KB


In [14]:
# Save processed data
# index=False keeps the (now non-contiguous) row index out of the CSV, so the
# file contains only the 'review' and 'rating' columns.
df.to_csv(
    path_or_buf='../data/interim/interim_data_v0.1.csv',
    index=False,
)