In [2]:
import pandas as pd
import ast  # To safely evaluate strings containing Python expressions from a string-based input

# Load the full scraped dataset and remember how many rows it started with,
# so the number of dropped duplicates can be reported afterwards.
df = pd.read_csv('scraped_data_food_full.csv')
original_rows = df.shape[0]
print("Number of rows in the original CSV file: %d" % original_rows)

# Rows sharing the same 'href' are treated as duplicates: only one row per
# 'href' is kept, and the index is renumbered from 0 so it has no gaps.
df = df.drop_duplicates(subset='href').reset_index(drop=True)

print("Number of rows after removing duplicates: %d" % df.shape[0])
print("Number of rows removed: %d" % (original_rows - df.shape[0]))

Number of rows in the original CSV file: 15063
Number of rows after removing duplicates: 15063
Number of rows removed: 0


In [3]:
# Save the de-duplicated frame under a new file name; index=False keeps the
# DataFrame index out of the CSV so the file holds only the data columns.
df.to_csv(path_or_buf='scraped_data_food_full_processed.csv', index=False)

In [4]:
# Cleaning of dataset by categories
#
# Overview of this cell:
#   1. Collect every distinct value of the 'Category' column.
#   2. Build to_remove_keywords: lower-case substrings that, when found in a
#      category name, take that category OUT of not_dine_in_list.
#   3. Every category matching none of the keywords ends up in
#      not_dine_in_list, and all rows with such a category are dropped.
#   4. The result overwrites the CSV file that was read at the top.

# Read the de-duplicated CSV file
df = pd.read_csv('scraped_data_food_full_processed.csv')

# All category values, one entry per row (duplicates included), sorted
original_category_list = sorted(df['Category'].tolist())
print("Length of Original Category List: " + str(len(original_category_list)))

# The same categories with every value listed only once, sorted
filtered_category_list = sorted(set(original_category_list))
print("Length of Filtered Category List: " + str(len(filtered_category_list)))

# Categories containing any of these strings are removed from the
# not_dine_in_list. Every keyword MUST be lower-case, because it is matched
# against the lower-cased category name (an upper-case keyword such as "BBQ"
# could never match anything).
# NOTE(review): matching is by plain substring, so short keywords such as
# "bar", "pub" or "deli" also match unrelated categories that merely contain
# those letters (e.g. "bar" is found in "barber shop") - verify the result.
to_remove_keywords = [
    "restaurant", "cafe", "bar", "takeaway", "food court", "bakery", "pub",
    "beer", "patisserie", "creperie", "diner", "bistro", "live music venue",
    "hawker", "grill", "kiosk", "stand", "bbq", "brewery", "delicatessen",
    "deli",
]

# Collect (lower-cased) every category containing "shop", "store" or "house".
# A category is placed in the first group that matches, in that order.
category_with_word_shop = []
category_with_word_store = []
category_with_word_house = []

for item in filtered_category_list:
    lowered_item = item.lower()
    if "shop" in lowered_item:
        category_with_word_shop.append(lowered_item)
    elif "store" in lowered_item:
        category_with_word_store.append(lowered_item)
    elif "house" in lowered_item:
        category_with_word_house.append(lowered_item)

print(category_with_word_shop)
print(category_with_word_house)

# Exclude the bare "shop" category (as a keyword it would match every
# "... shop" category) as well as "townhouse complex" and "warehouse".
# Filtering, unlike list.remove(), drops every occurrence and does not raise
# when one of these values is missing from the data.
category_with_word_shop = [
    category for category in category_with_word_shop if category != "shop"
]
category_with_word_house = [
    category for category in category_with_word_house
    if category not in ("townhouse complex", "warehouse")
]

# Join category_with_word_house list to the to_remove_keywords list
to_remove_keywords.extend(category_with_word_house)
print(to_remove_keywords)

# Drop from category_with_word_shop every entry that contains one of the
# excluded shop phrases, then add what is left to the keywords
excluded_category_with_word_shop = ["shopping mall", "gift shop", "butcher shop", "chicken shop", "rice shop"]

category_with_word_shop = [
    category for category in category_with_word_shop
    if not any(exclude_word in category for exclude_word in excluded_category_with_word_shop)
]
to_remove_keywords.extend(category_with_word_shop)

# Drop from category_with_word_store every entry that contains one of the
# excluded store phrases, then add what is left to the keywords
excluded_category_with_word_store = ["convenience store", "fruit and vegetable store", "furniture store", "gourmet grocery store", "grocery store", "meat products store"]

category_with_word_store = [
    category for category in category_with_word_store
    if not any(exclude_word in category for exclude_word in excluded_category_with_word_store)
]
to_remove_keywords.extend(category_with_word_store)

# not_dine_in_list = every category that contains none of the keywords
not_dine_in_list = [
    item for item in filtered_category_list
    if not any(keyword in item.lower() for keyword in to_remove_keywords)
]

# Length of the not_dine_in_list
print("Length of Not Dine-In List: " + str(len(not_dine_in_list)))

# Remove all rows whose category is in the not_dine_in_list and renumber the
# index from 0
df = df[~df['Category'].isin(not_dine_in_list)].reset_index(drop=True)

# Update the CSV file (this overwrites the file read at the top of the cell)
df.to_csv('scraped_data_food_full_processed.csv', index=False)






Length of Original Category List: 15063
Length of Filtered Category List: 344
['açaí shop', 'bagel shop', 'butcher shop', 'butcher shop deli', 'cake shop', 'chicken shop', 'coffee shop', 'dessert shop', 'ice cream shop', 'juice shop', 'kebab shop', 'noodle shop', 'pasta shop', 'pastry shop', 'rice shop', 'salad shop', 'sandwich shop', 'shop', 'shopping mall', 'soba noodle shop', 'soft drinks shop', 'soup shop', 'steamed bun shop', 'tea and coffee shop']
['chinese tea house', 'crab house', 'french steakhouse restaurant', 'japanese steakhouse', 'steak house', 'tea house', 'townhouse complex', 'warehouse']
['restaurant', 'cafe', 'bar', 'takeaway', 'food court', 'bakery', 'pub', 'beer', 'patisserie', 'creperie', 'diner', 'bistro', 'live music venue', 'hawker', 'grill', 'kiosk', 'stand', 'BBQ', 'brewery', 'delicatessen', 'deli', 'chinese tea house', 'crab house', 'french steakhouse restaurant', 'japanese steakhouse', 'steak house', 'tea house']
Length of Not Dine-In List: 103


In [5]:
# Temporary Removal of Metadata Filtering

# import pandas as pd
# import ast
# import re

# df = pd.read_csv('scraped_data_food_full_processed.csv')


# # Assuming df is your DataFrame and it already exists

# # Safely convert the string representation of list of strings into a list of strings
# df['Metadata'] = df['Metadata'].apply(ast.literal_eval)
# # print(df["Metadata"])

# # Create the new columns that will hold the values extracted from the metadata
# df['Additional Details of Location'] = None
# df['Plus Code'] = None
# df['Contact Number'] = None
# df['Website'] = None
    
# for index, metadata_list in enumerate(df['Metadata']):
#     # print(index)
#     # print(metadata_list)

#     for item in metadata_list:
#         phone_number_pattern = re.compile(r'^\d{4}\s\d{4}$')
#         website_pattern= re.compile(r'^(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}$')
#         # print(item)
#         if phone_number_pattern.match(item):
#             print(item)
#             key = "Contact Number"
#             df.at[index, key] = item
#             metadata_list.remove(item)
#         # elif website_pattern.match(item):
#         #     print(item)
#         #     key="Website"
#         #     df.at[index, key] = item
#         #     metadata_list.remove(item)           
# df
# df.to_csv('scraped_data_food_full_processed1.csv', index=False)

