In [1]:
import pandas as pd
import os
import re

In [2]:
# Build the paths of the two input CSV files, both of which live in the
# 'data' folder under the current working directory.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, 'data')
reviews_csv = os.path.join(data_directory, 'gm_reviews.csv')
restaurants_csv = os.path.join(data_directory, 'restaurants.csv')


In [3]:
# Load the reviews CSV, report its (rows, columns) shape and preview the first rows
reviews_df = pd.read_csv(filepath_or_buffer=reviews_csv, encoding='utf-8')
print(reviews_df.shape)
reviews_df.head(n=5)

(19560, 5)


Unnamed: 0,restaurant_ids,id_review,caption,relative_date,username
0,1,ChdDSUhNMG9nS0VJQ0FnSUN4ek83dTd3RRAB,,17 hours ago,S V
1,1,ChZDSUhNMG9nS0VJQ0FnSUN4MU95cFZREAE,,a day ago,Adaikkappan Nagappan
2,1,ChZDSUhNMG9nS0VJQ0FnSUN4NUlpb0l3EAE,,a day ago,project_be_simple
3,1,ChdDSUhNMG9nS0VJQ0FnSUN4NExQZXdBRRAB,,2 days ago,Anmol Singh
4,1,ChdDSUhNMG9nS0VJQ0FnSURSOTRpbmxnRRAB,Great experience,6 days ago,Sandeep Kaur


In [4]:
# Load the restaurants CSV, report its (rows, columns) shape and preview the first rows
restaurants_df = pd.read_csv(filepath_or_buffer=restaurants_csv, encoding='utf-8')
print(restaurants_df.shape)
restaurants_df.head(n=5)

(652, 9)


Unnamed: 0,restaurant_id,name,place_id,price_level,lat,long,types,address,cuisine
0,1,Domino's Pizza Flinders St,ChIJzUeRGVJd1moR8OAx5QG2svk,1,-37.820869,144.956346,"['meal_delivery', 'meal_takeaway', 'restaurant...","Tenancy 2C/555 Flinders Street, Melbourne",Fast Food
1,2,City Kebabs,ChIJJaylGVJd1moRjyu89QjaLrg,1,-37.820527,144.956193,"['meal_takeaway', 'restaurant', 'food', 'point...","3/546 Flinders Street, Melbourne",Fast Food
2,3,Royale PizzaMelbourne,ChIJT5at6FFd1moRl7tSBihHz1I,0,-37.819691,144.954892,"['meal_delivery', 'meal_takeaway', 'restaurant...","44/56 Spencer Street, Melbourne",Pizza
3,4,Cherry and Twigs,ChIJJbxbHVJd1moRlmxHO37SS70,2,-37.820037,144.955732,"['cafe', 'restaurant', 'food', 'point_of_inter...","Shop 1B/555 Flinders Lane, Melbourne",Café
4,5,Restaurant 1903,ChIJlVbz5VFd1moRBbVdFgoTEVI,1,-37.819733,144.954767,"['restaurant', 'food', 'point_of_interest', 'e...","44 Spencer Street, Melbourne",Modern Australian


In [23]:
# Remove reviews that have no text. Only the 'caption' column is checked, so a
# review is not discarded merely because some other field (for example the
# username) happens to be missing.
# NOTE(review): pandas' default NA parsing may turn values such as "NA" or
# "null" into NaN when reading the CSV - confirm whether that affects usernames.
# The name is rebound rather than mutated in place so the cell is safe to re-run.
reviews_df = reviews_df.dropna(subset=['caption']).reset_index(drop=True)
reviews_df.shape

(14321, 5)

In [24]:
# Emoji characters and non-English text are stripped from the reviews below;
# work on an independent copy so that reviews_df itself is left untouched.
cleaned_reviews_df = reviews_df.copy(deep=True)

def remove_emoji_or_encoding_byproducts(text):
    """Return ``text`` with every emoji and other non-ASCII character removed.

    The pattern matches characters from U+10000 upwards one at a time, and
    runs of any other character outside the ASCII range (0x00-0x7F). Each
    match is replaced with the empty string, so nothing is inserted in its
    place (surrounding spaces are kept as they were).

    Args:
        text: The string to clean.

    Returns:
        A new string holding only the ASCII characters of ``text``, in their
        original order. It is empty when ``text`` had no ASCII characters.
    """
    non_ascii_regex = r'[\U00010000-\U0010ffff]|[^\x00-\x7F]+'
    return re.sub(non_ascii_regex, '', text, flags=re.UNICODE)

# Apply the clean-up function to every value of the review ('caption') column
caption_column = cleaned_reviews_df['caption']
cleaned_reviews_df['caption'] = caption_column.apply(remove_emoji_or_encoding_byproducts)



In [25]:
# Ensure again that there are no blank reviews.
# A caption made up only of emoji / non-ASCII characters is an empty (or
# whitespace-only) string after the clean-up, not NaN, so dropna() alone
# would keep it. Keep only rows whose caption is present and still has
# visible text once surrounding whitespace is ignored.
caption_text = cleaned_reviews_df['caption']
has_text = caption_text.notna() & caption_text.str.strip().ne('')
cleaned_reviews_df = cleaned_reviews_df.loc[has_text].reset_index(drop=True)
cleaned_reviews_df.shape

(14321, 5)

In [27]:
# Bring each restaurant's name and cuisine onto its reviews: join the two
# frames on the restaurant id, then discard the other restaurant columns.
unused_restaurant_columns = ['price_level', 'place_id', 'restaurant_id', 'lat', 'long', 'types', 'address']
merged_df = pd.merge(
    left=cleaned_reviews_df,
    right=restaurants_df,
    left_on='restaurant_ids',
    right_on='restaurant_id',
).drop(columns=unused_restaurant_columns)
merged_df.head(n=5)

Unnamed: 0,restaurant_ids,id_review,caption,relative_date,username,name,cuisine
0,1,ChdDSUhNMG9nS0VJQ0FnSURSOTRpbmxnRRAB,Great experience,6 days ago,Sandeep Kaur,Domino's Pizza Flinders St,Fast Food
1,1,ChdDSUhNMG9nS0VJQ0FnSURScktPaDV3RRAB,Must be the worst dominos in the world. Pizz...,3 weeks ago,Robert McFarland,Domino's Pizza Flinders St,Fast Food
2,1,ChZDSUhNMG9nS0VJQ0FnSURSNUotOUt3EAE,They are so nice and the Food is Perfect !!,3 weeks ago,Patricia Tchialeu,Domino's Pizza Flinders St,Fast Food
3,1,ChdDSUhNMG9nS0VJQ0FnSURSNklYTG93RRAB,Staff was friendly and helpful. The deals make...,3 weeks ago,Brent Folan,Domino's Pizza Flinders St,Fast Food
4,1,ChZDSUhNMG9nS0VJQ0FnSUNPNXJhd0VREAE,Been coming here for awhile usually happy,a month ago,Johnathan Vanderwerf,Domino's Pizza Flinders St,Fast Food


In [28]:
# Make sure the 'output' folder exists. exist_ok=True makes makedirs succeed
# when the folder is already there, so no separate (race-prone) existence
# check is needed beforehand.
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

# Save the merged DataFrame to the 'output' folder as a CSV file,
# leaving out the row index.
output_file = 'cleaned_reviews.csv'
output_path = os.path.join(output_folder, output_file)
merged_df.to_csv(output_path, index=False)