In [18]:
import pandas as pd
import os

# List every file in the raw-data folder.
# os.listdir() returns entries in arbitrary (filesystem-dependent) order, so
# the names are sorted to keep this cell's output stable across machines and
# re-runs.
raw_path = "../data/raw"
print(sorted(os.listdir(raw_path)))

# =========================
# Helper function to load CSV safely
# =========================
def load_csv_safe(path):
    """Read a CSV file, trying several text encodings until one succeeds.

    Encodings are attempted from strictest to most permissive: UTF-8, then
    cp1252, then latin1. Rows that the parser considers malformed are
    skipped (``on_bad_lines='skip'``).

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded with the first encoding that worked.

    Raises
    ------
    UnicodeDecodeError or pandas.errors.ParserError
        The error from the last attempted encoding, when every attempt fails.
        Other errors (e.g. a missing file) propagate immediately.
    """
    # Order matters. latin1 maps every possible byte to a character and so
    # never raises UnicodeDecodeError; it has to be the last resort, otherwise
    # cp1252 would never be tried and Windows "smart" punctuation (bytes
    # 0x80-0x9F) would be decoded as control characters. 'iso-8859-1' is an
    # alias of latin1, so listing it separately adds nothing.
    encodings = ('utf-8', 'cp1252', 'latin1')
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding, on_bad_lines='skip')
        except (UnicodeDecodeError, pd.errors.ParserError) as exc:
            last_error = exc
    # Every encoding failed: surface the real error instead of repeating an
    # attempt that is already known to fail.
    raise last_error

# =========================
# 1. Load datasets
# =========================
# Read each raw source file into its own DataFrame via the tolerant loader.
df_catering = load_csv_safe("../data/raw/Catering_Dataset.csv")
df_noces = load_csv_safe("../data/raw/noces_info_uoc.csv")
df_vic = load_csv_safe("../data/raw/venues-for-event-bookings.csv")
df_geo = load_csv_safe("../data/raw/geoplaces2.csv")

# Report the (rows, columns) shape of every frame that was just loaded.
print("\nLoaded:")
for label, frame in (
    ("Catering", df_catering),
    ("Noces", df_noces),
    ("Vic", df_vic),
    ("GeoPlaces", df_geo),
):
    print(f"{label}:", frame.shape)


['Catering_Dataset.csv', 'chefmozaccepts.csv', 'chefmozcuisine.csv', 'chefmozhours4.csv', 'chefmozparking.csv', 'geoplaces2.csv', 'noces_info_uoc.csv', 'usercuisine.csv', 'userpayment.csv', 'userprofile.csv', 'venues-for-event-bookings.csv']

Loaded:
Catering: (133, 14)
Noces: (215, 7)
Vic: (206, 22)
GeoPlaces: (130, 21)


In [19]:
# =========================
# Preview column names before cleaning
# =========================
# Show the raw column names of each source so the renames in the cleaning
# step can be checked against them.
for label, frame in (
    ("Catering", df_catering),
    ("Noces", df_noces),
    ("Vic", df_vic),
    ("Geo", df_geo),
):
    print(f"{label} columns:", frame.columns.tolist())


Catering columns: ['Id', 'Name', 'Description', 'Tel', 'Add', 'Zipcode', 'Opentime', 'Map', 'Px', 'Py', 'Class', 'Website', 'Parkinginfo', 'Changetime']
Noces columns: ['Spot', 'Rating', 'Location', 'Promotion', 'Price', 'Num_people', 'URL']
Vic columns: ['geo_point_2d', 'geo_shape', 'prop_id', 'no_smoking', 'level_1_na', 'addresspt1', 'event', 'full_name', 'addressp_1', 'training', 'dog_prohib', 'dog_off_le', 'venue_recn', 'addresspt', 'sport', 'promotion', 'bookable', 'level_3_na', 'wedding', 'roadseg_id', 'sustainabi', 'level_2_na']
Geo columns: ['placeID', 'latitude', 'longitude', 'the_geom_meter', 'name', 'address', 'city', 'state', 'country', 'fax', 'zip', 'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price', 'url', 'Rambience', 'franchise', 'area', 'other_services']


In [20]:
# =========================
# 2. CLEANING & STANDARDIZATION
# Target schema:
# name | category | location | rating | services
# =========================

# --- Catering Dataset ---
# Rename the source columns onto the target schema, tag every row as
# "catering", and use 0 as a placeholder rating (this source has none).
catering_clean = (
    df_catering
    .rename(columns={"Name": "name", "Description": "services", "Add": "location"})
    .assign(category="catering", rating=0)
)[["name", "category", "location", "rating", "services"]]

# --- Barcelona Wedding Venues (Noces) ---
# This source carries its own rating column; services and category are both
# filled with the constant "wedding venue".
noces_clean = (
    df_noces
    .rename(columns={"Spot": "name", "Location": "location", "Rating": "rating"})
    .assign(services="wedding venue", category="wedding venue")
)[["name", "category", "location", "rating", "services"]]

# --- Victoria Venues ---
# Name and location come from the source; services/category are the constant
# "event venue" and rating is set to the placeholder 0.
vic_clean = (
    df_vic
    .rename(columns={"full_name": "name", "addresspt1": "location"})
    .assign(services="event venue", category="event venue", rating=0)
)[["name", "category", "location", "rating", "services"]]

# --- GeoPlaces (restaurants) ---
# "name" already matches the target schema, so only address and price need
# renaming.
# NOTE(review): the source's "price" column is used as "rating" here. If it
# holds price tiers rather than scores, the merged rating column will mix
# meanings (and possibly types) -- confirm this mapping is intended.
geo_clean = (
    df_geo
    .rename(columns={"address": "location", "price": "rating"})
    .assign(services="restaurant and food services", category="restaurant")
)[["name", "category", "location", "rating", "services"]]

print("Cleaned datasets created")


Cleaned datasets created


In [21]:
# =========================
# 3. Merge all datasets
# =========================
# Stack the four standardized frames in source order; ignore_index=True
# discards each frame's own row labels and renumbers the result from 0.
cleaned_frames = [catering_clean, noces_clean, vic_clean, geo_clean]
all_vendors = pd.concat(cleaned_frames, ignore_index=True)

print("Merged shape:", all_vendors.shape)


Merged shape: (684, 5)


In [22]:
# =========================
# 4. Final cleaning
# =========================
# Final tidy-up, in order: drop exact duplicate rows, replace missing ratings
# with 0, then discard rows that lack a name or a services description.
all_vendors = (
    all_vendors
    .drop_duplicates()
    .assign(rating=lambda frame: frame["rating"].fillna(0))
    .dropna(subset=["name", "services"])
)

print("After cleaning:", all_vendors.shape)


After cleaning: (682, 5)


In [23]:
# =========================
# 5. Save final dataset
# =========================
# Write the merged vendor table to disk without the row index, then show the
# first rows as the cell's output.
output_path = "../data/vendors_final.csv"
all_vendors.to_csv(output_path, index=False)

print(f"Saved to {output_path}")
all_vendors.head()


Saved to ../data/vendors_final.csv


Unnamed: 0,name,category,location,rating,services
0,The Red Castle,catering,"6 2nd Alley, Sanmin Street, Tamsui District, ,...",0,Coffee and Fine Dining in a Colonial BuildingT...
1,Feng Hua Restaurant,catering,"No.209, Sec. 2, Shuangshi Rd., , 220, Taiwan,...",0,"With clean and quiet environment, modern and Z..."
2,Hong Kong Chen's gruel and noodle.,catering,"No.46, Zhulin Rd., , 234, Taiwan, Yonghe Dist...",0,"Standard Hong Kong style gruel, boil with smal..."
3,Zhening Jia Garden Restaurant,catering,"No.70, Sec. 1, Zhongshan Rd., , 252, Taiwan, ...",0,The owner has over 30 years of experience prep...
4,Fu Zhou Lin Pepper Bun,catering,"Minsheng St. (inside the Food Street), Ruifang...",0,"Legend has it that Fu Zhou China's ""Green onio..."
