In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
from typing import Dict, Any
from datetime import datetime
import ast
import re

In [3]:
df = pd.read_json('/content/drive/My Drive/final.json', lines=True)
df = df.dropna()

In [None]:
df.attributes[0]

{'GoodForKids': 'True',
 'RestaurantsGoodForGroups': 'True',
 'RestaurantsReservations': 'True',
 'BusinessAcceptsCreditCards': 'True',
 'RestaurantsAttire': "u'casual'",
 'Alcohol': "u'full_bar'",
 'RestaurantsTakeOut': 'True',
 'RestaurantsPriceRange2': '2',
 'HasTV': 'True',
 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'NoiseLevel': "u'average'",
 'Ambience': "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': True, 'casual': True}",
 'HappyHour': 'True',
 'RestaurantsTableService': 'True',
 'WiFi': "u'free'",
 'ByAppointmentOnly': 'False',
 'BikeParking': 'True',
 'WheelchairAccessible': 'True',
 'OutdoorSeating': 'True',
 'Caters': 'True',
 'BYOB': 'False',
 'RestaurantsDelivery': 'None',
 'DogsAllowed': 'False',
 'Corkage': 'True',
 'GoodForMeal': "{'dessert': None, 'latenight': False, 'lunch': True, 'dinner': True, 'brunch': None, '

In [6]:
def format_attribute(parking_dict: dict) -> str:
    """
    Formats attribute information into a natural language description.
    """
    available_options = []

    if parking_dict.get('lot', False):
        available_options.append('lot')
    if parking_dict.get('garage', False):
        available_options.append('garage')
    if parking_dict.get('street', False):
        available_options.append('street')
    if parking_dict.get('valet', False):
        available_options.append('valet')
    if parking_dict.get('validated', False):
        available_options.append('validated')

    if not available_options:
        return "does not have on-site parking"

    if len(available_options) == 1:
        return f"offers {available_options[0]} parking"
    elif len(available_options) == 2:
        return f"offers {available_options[0]} and {available_options[1]} parking"
    else:
        last_option = available_options.pop()
        return f"offers {', '.join(available_options)}, and {last_option} parking"

def parse_parking_dict(value):
    """
    Parses the parking dictionary from various input formats.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            cleaned = value.replace("u'", "'").replace("'", '"')
            return ast.literal_eval(cleaned)
        except:
            return None
    return None

def format_ambience(ambience_dict: dict) -> str:
    """
    Formats ambience information into a natural language description.
    """
    present_qualities = [key for key, value in ambience_dict.items() if convert_to_bool(value)]

    if not present_qualities:
        return "has no specific ambience qualities listed"

    if len(present_qualities) == 1:
        return f"has a {present_qualities[0]} ambience"
    elif len(present_qualities) == 2:
        return f"has a {present_qualities[0]} and {present_qualities[1]} ambience"
    else:
        last_quality = present_qualities.pop()
        return f"has a {', '.join(present_qualities)}, and {last_quality} ambience"

def parse_dict_attribute(value):
    """
    Parses dictionary attributes from various input formats.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            cleaned = value.replace("u'", "'").replace("'", '"')
            return ast.literal_eval(cleaned)
        except:
            return None
    return None

def format_good_for_meal(meal_dict: dict) -> str:
    """
    Formats meal information into a natural language description.
    """
    meal_order = ['breakfast', 'brunch', 'lunch', 'dinner', 'dessert', 'latenight']
    good_for = []
    unknown = []

    for meal in meal_order:
        value = meal_dict.get(meal)
        if value is True:
            good_for.append(meal)
        elif value is None:
            unknown.append(meal)

    description_parts = []

    if good_for:
        if len(good_for) == 1:
            description_parts.append(f"serves {good_for[0]}")
        elif len(good_for) == 2:
            description_parts.append(f"serves {good_for[0]} and {good_for[1]}")
        else:
            last_meal = good_for.pop()
            description_parts.append(f"serves {', '.join(good_for)}, and {last_meal}")

    if unknown:
        if len(unknown) == 1:
            description_parts.append(f"has not specified {unknown[0]} service")
        elif len(unknown) == 2:
            description_parts.append(f"has not specified {unknown[0]} or {unknown[1]} service")
        else:
            last_unknown = unknown.pop()
            description_parts.append(f"has not specified {', '.join(unknown)}, or {last_unknown} service")

    if not description_parts:
        return "does not serve any listed meal types"

    return " and ".join(description_parts)

def format_music_options(music_dict: dict) -> str:
    """
    Formats music information into a natural language description.
    Music options that are None or False are both treated as "does not offer"
    """
    music_types = {
        'dj': 'DJ music',
        'live': 'live music',
        'jukebox': 'jukebox',
        'video': 'music videos',
        'background_music': 'background music',
        'karaoke': 'karaoke',
        'no_music': 'quiet atmosphere'
    }

    available = [music_types[key] for key, value in music_dict.items()
                if value is True]

    if not available:
        return "does not offer any music entertainment"

    if len(available) == 1:
        return f"offers {available[0]}"
    elif len(available) == 2:
        return f"offers {available[0]} and {available[1]}"
    else:
        last_option = available.pop()
        return f"offers {', '.join(available)}, and {last_option}"

def format_best_nights(nights_dict: dict) -> str:
    """
    Formats best nights information into a natural language description.
    """
    day_order = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    best_nights = [day.capitalize() for day in day_order if nights_dict.get(day, False)]

    if not best_nights:
        return "does not list any specific best nights to visit"

    if len(best_nights) == 1:
        return f"is busiest and has best nights on {best_nights[0]}"
    elif len(best_nights) == 2:
        return f"is busiest on and has best nights on {best_nights[0]} and {best_nights[1]}"
    else:
        last_night = best_nights.pop()
        return f"is busiest on and has best nights on {', '.join(best_nights)}, and {last_night}"

def process_restaurant_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes restaurant data to create one row per attribute with natural language descriptions.

    Args:
        df (pandas.DataFrame): Input dataframe with restaurant data
    Returns:
        pandas.DataFrame: Processed dataframe with one row per attribute per restaurant
    """
    all_rows = []

    for _, row in df.iterrows():
        try:
            attributes = row['attributes']

            if not isinstance(attributes, dict):
                continue

            for attr_key, attr_value in attributes.items():
                try:
                    if isinstance(attr_value, dict):
                        for nested_key, nested_value in attr_value.items():
                            description = generate_attribute_description(
                                row['name'],
                                attr_key,
                                nested_key,
                                nested_value
                            )
                            all_rows.append({
                                'business_id': row['business_id'],
                                'name': row['name'],
                                'attribute': f"{attr_key}.{nested_key}",
                                'value': str(nested_value),
                                'review': description
                            })
                    else:
                        description = generate_attribute_description(
                            row['name'],
                            attr_key,
                            None,
                            attr_value
                        )
                        all_rows.append({
                            'business_id': row['business_id'],
                            'name': row['name'],
                            'attribute': attr_key,
                            'value': str(attr_value),
                            'review': description
                        })
                except Exception as e:
                    print(f"Error processing attribute {attr_key} for {row['name']}: {e}")
                    continue

        except Exception as e:
            print(f"Error processing row for {row['name']}: {e}")
            continue

    return pd.DataFrame(all_rows)

def convert_to_bool(value: Any) -> bool:
    """
    Converts various value formats to boolean.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.lower() == 'true'
    return bool(value)

def generate_attribute_description(restaurant_name: str, attribute: str, nested_key: str = None, value: Any = None) -> str:
    """
    Generates natural language descriptions for restaurant attributes.
    """

    if attribute == 'BusinessParking':
        if nested_key:
            return f"{restaurant_name} {'offers' if convert_to_bool(value) else 'does not offer'} {nested_key} parking."
        else:
            parking_dict = parse_parking_dict(value)
            if parking_dict:
                return f"{restaurant_name} {format_attribute(parking_dict)}"
    elif attribute == 'Ambience':
        if nested_key:
            if value is None:
                return f"The {nested_key} ambience of {restaurant_name} is not specified."
            return f"{restaurant_name} {'has' if convert_to_bool(value) else 'does not have'} a {nested_key} ambience."
        else:
            ambience_dict = parse_dict_attribute(value)
            if ambience_dict:
                return f"{restaurant_name} {format_ambience(ambience_dict)}"

    elif attribute == 'GoodForMeal':
        if nested_key:
            if value is None:
                return f"It is unknown if {restaurant_name} is good for {nested_key}."
            return f"{restaurant_name} {'is' if convert_to_bool(value) else 'is not'} good for {nested_key}."
        else:
            meal_dict = parse_dict_attribute(value)
            if meal_dict:
                return f"{restaurant_name} {format_good_for_meal(meal_dict)}"

    elif attribute == 'Music':
        if nested_key:
            if value is None or value is False:
                return f"{restaurant_name} does not offer {nested_key}."
            return f"{restaurant_name} offers {nested_key}."
        else:
            music_dict = parse_dict_attribute(value)
            if music_dict:
                return f"{restaurant_name} {format_music_options(music_dict)}"

    elif attribute == 'BestNights':
        if nested_key:
            return f"{nested_key.capitalize()} {'is' if convert_to_bool(value) else 'is not'} one of the best nights to visit {restaurant_name}."
        else:
            nights_dict = parse_dict_attribute(value)
            if nights_dict:
                return f"{restaurant_name} {format_best_nights(nights_dict)}"

    if attribute == 'GoodForKids':
        is_good = convert_to_bool(value)
        return f"{restaurant_name} {'is' if is_good else 'is not'} good for kids."
    elif attribute == 'RestaurantsAttire':
        value = str(value).replace("u'", "").replace("'", "")
        return f"{restaurant_name} has a {value} dress code."
    elif attribute == 'RestaurantsGoodForGroups':
        is_good = convert_to_bool(value)
        return f"{restaurant_name} {'is' if is_good else 'is not'} good for group dining."
    elif attribute == 'RestaurantsReservations':
        accepts = convert_to_bool(value)
        return f"{restaurant_name} {'accepts' if accepts else 'does not accept'} reservations."
    elif attribute == 'Alcohol':
        value = str(value).replace("u'", "").replace("'", "")
        if value.lower() in ['none', 'no']:
            return f"{restaurant_name} does not serve alcohol."
        return f"{restaurant_name} serves {value.replace('_', ' ')}."
    elif attribute == 'BusinessAcceptsCreditCards':
        accepts = convert_to_bool(value)
        return f"{restaurant_name} {'accepts' if accepts else 'does not accept'} credit cards."
    elif attribute == 'RestaurantsTakeOut':
        offers = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if offers else 'does not offer'} takeout."
    elif attribute == 'WiFi':
        value = str(value).replace("u'", "").replace("'", "")
        if value.lower() in ['no', 'false']:
            return f"{restaurant_name} does not offer WiFi."
        return f"{restaurant_name} offers {value} WiFi."
    elif attribute == 'HasTV':
        has_tv = convert_to_bool(value)
        return f"{restaurant_name} {'has' if has_tv else 'does not have'} TVs."
    elif attribute == 'HappyHour':
        has_happy_hour = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if has_happy_hour else 'does not offer'} happy hour."
    elif attribute == 'RestaurantsTableService':
        has_service = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if has_service else 'does not offer'} table service."
    elif attribute == 'ByAppointmentOnly':
        appointment_only = convert_to_bool(value)
        return f"{restaurant_name} {'is' if appointment_only else 'is not'} by appointment only."
    elif attribute == 'NoiseLevel':
        value = str(value).replace("u'", "").replace("'", "")
        return f"{restaurant_name} has an {value} noise level."
    elif attribute == 'BikeParking':
        has_parking = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if has_parking else 'does not offer'} bike parking."
    elif attribute == 'WheelchairAccessible':
        is_accessible = convert_to_bool(value)
        return f"{restaurant_name} {'is' if is_accessible else 'is not'} wheelchair accessible."
    elif attribute == 'OutdoorSeating':
        has_outdoor = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if has_outdoor else 'does not offer'} outdoor seating."
    elif attribute == 'Caters':
        does_cater = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if does_cater else 'does not offer'} catering services."
    elif attribute == 'BYOB':
        is_byob = convert_to_bool(value)
        return f"{restaurant_name} {'is' if is_byob else 'is not'} BYOB."
    elif attribute == 'RestaurantsDelivery':
        if value is None:
            return f"Delivery information for {restaurant_name} is not available."
        has_delivery = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if has_delivery else 'does not offer'} delivery."
    elif attribute == 'DogsAllowed':
        allows_dogs = convert_to_bool(value)
        return f"{restaurant_name} {'allows' if allows_dogs else 'does not allow'} dogs."
    elif attribute == 'Corkage':
        has_corkage = convert_to_bool(value)
        return f"{restaurant_name} {'has' if has_corkage else 'does not have'} a corkage fee."
    elif attribute == 'BusinessAcceptsBitcoin':
        accepts_bitcoin = convert_to_bool(value)
        return f"{restaurant_name} {'accepts' if accepts_bitcoin else 'does not accept'} Bitcoin."
    elif attribute == 'BYOBCorkage':
        if str(value).lower() == "'no'" or value == 'no':
            return f"{restaurant_name} does not allow BYOB."
        elif str(value).lower() == "'yes_free'" or value == 'yes_free':
            return f"{restaurant_name} allows BYOB with no corkage fee."
        else:
            return f"{restaurant_name} has BYOB policy: {value}"
    elif attribute == 'GoodForDancing':
        has_dancing = convert_to_bool(value)
        return f"{restaurant_name} {'is' if has_dancing else 'is not'} good for dancing."
    elif attribute == 'DriveThru':
        has_drive_thru = convert_to_bool(value)
        return f"{restaurant_name} {'has' if has_drive_thru else 'does not have'} a drive-thru."
    elif attribute == 'CoatCheck':
        has_coat_check = convert_to_bool(value)
        return f"{restaurant_name} {'offers' if has_coat_check else 'does not offer'} coat check service."
    elif attribute == 'Smoking':
        if value is None:
            return f"Smoking policy at {restaurant_name} is not specified."
        value = str(value).lower().replace("u'", "'").replace("'", "")
        if value == 'no':
            return f"{restaurant_name} does not allow smoking."
        elif value == 'outdoor':
            return f"{restaurant_name} allows smoking outdoors only."
        else:
            return f"{restaurant_name} has smoking policy: {value}"
    elif attribute == 'RestaurantsPriceRange2':
        price_descriptions = {
            '1': 'inexpensive',
            '2': 'moderately priced',
            '3': 'expensive',
            '4': 'very expensive'
        }
        return f"{restaurant_name} is {price_descriptions.get(str(value), 'undefined price range')}."

    if isinstance(value, bool) or value is None:
        verb = 'does' if value is False else 'does not' if value is None else ''
        feature = ' '.join(filter(None, [word for word in attribute if word.isupper() or word.islower()])).lower()
        return f"{restaurant_name} {verb + ' not ' if verb else ''}{feature}."

    return f"{restaurant_name} has {attribute}: {value}."

def process_data(df):
    print("Starting data processing...")
    print(f"Number of restaurants: {len(df)}")

    result_df = process_restaurant_attributes(df)

    print(f"Processing complete. Generated {len(result_df)} attribute descriptions.")

    if not result_df.empty:
        print("\nSample of generated descriptions:")
        print(result_df.head())

    return result_df

In [7]:
attribute_df = process_data(df)

Starting data processing...
Number of restaurants: 49
Processing complete. Generated 1158 attribute descriptions.

Sample of generated descriptions:
              business_id                           name  \
0  4e9tguRcMc4S5hzBlfC62A  The Farm and Fisherman Tavern   
1  4e9tguRcMc4S5hzBlfC62A  The Farm and Fisherman Tavern   
2  4e9tguRcMc4S5hzBlfC62A  The Farm and Fisherman Tavern   
3  4e9tguRcMc4S5hzBlfC62A  The Farm and Fisherman Tavern   
4  4e9tguRcMc4S5hzBlfC62A  The Farm and Fisherman Tavern   

                    attribute      value  \
0                 GoodForKids       True   
1    RestaurantsGoodForGroups       True   
2     RestaurantsReservations       True   
3  BusinessAcceptsCreditCards       True   
4           RestaurantsAttire  u'casual'   

                                              review  
0    The Farm and Fisherman Tavern is good for kids.  
1  The Farm and Fisherman Tavern is good for grou...  
2  The Farm and Fisherman Tavern accepts reservat...  
3  Th

In [None]:
attribute_df[['business_id', 'name', 'review']].to_csv('attributes.csv')

In [None]:
def format_time(time_str):
    """
    Formats time string into readable format.
    """
    if time_str == '0:0':
        return 'Closed'

    hours, minutes = map(int, time_str.split(':'))
    time = datetime.strptime(f"{hours}:{minutes}", "%H:%M")
    return time.strftime("%I:%M %p").lstrip("0").lower()

def process_hours(restaurant_name, hours_dict):
    """
    Creates individual descriptions for each day.
    Returns a list of descriptions, one for each day.
    """
    if not hours_dict:
        return [f"{restaurant_name}'s hours are not available."]

    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    descriptions = []

    for day in days:
        if day not in hours_dict or hours_dict[day] == '0:0-0:0':
            descriptions.append(f"{restaurant_name} is closed on {day}.")
        else:
            start, end = hours_dict[day].split('-')
            if end == '0:0':
                end = '24:0'
            descriptions.append(
                f"{restaurant_name} is open on {day} between {format_time(start)} and {format_time(end)}."
            )

    return descriptions

def create_hours_descriptions(df):
    """
    Creates a new DataFrame with separate rows for each day's hours description.
    """
    processed_rows = []

    for _, row in df.iterrows():
        try:
            if isinstance(row['hours'], str):
                hours_dict = ast.literal_eval(row['hours'])
            else:
                hours_dict = row['hours']

            daily_descriptions = process_hours(row['name'], hours_dict)

            for description in daily_descriptions:
                processed_rows.append({
                    'business_id': row['business_id'],
                    'name': row['name'],
                    'hours': hours_dict,
                    'review': description
                })

        except Exception as e:
            print(f"Error processing hours for {row['name']}: {str(e)}")
            continue

    return pd.DataFrame(processed_rows)

def process_restaurant_hours(input_df):
    """
    Process restaurant hours and create formatted DataFrame.
    """
    print("Processing restaurant hours...")
    print(f"Input DataFrame shape: {input_df.shape}")

    result_df = create_hours_descriptions(input_df)

    print(f"Processing complete. Output DataFrame shape: {result_df.shape}")
    print("\nSample output:")
    print(result_df[['name', 'review']].head())

    return result_df

In [None]:
hours_df = process_restaurant_hours(df)

Processing restaurant hours...
Input DataFrame shape: (49, 18)
Error processing hours for The Jug Handle Inn: time data '24:0' does not match format '%H:%M'
Error processing hours for Yard House: time data '24:0' does not match format '%H:%M'
Error processing hours for Vincentown Diner: time data '24:0' does not match format '%H:%M'
Error processing hours for Silver Diner: time data '24:0' does not match format '%H:%M'
Error processing hours for Bahama Breeze: time data '24:0' does not match format '%H:%M'
Error processing hours for The Pub: time data '24:0' does not match format '%H:%M'
Error processing hours for Miller's Ale House: time data '24:0' does not match format '%H:%M'
Error processing hours for Cinder Bar: time data '24:0' does not match format '%H:%M'
Processing complete. Output DataFrame shape: (287, 4)

Sample output:
                            name  \
0  The Farm and Fisherman Tavern   
1  The Farm and Fisherman Tavern   
2  The Farm and Fisherman Tavern   
3  The Farm

In [None]:
hours_df.to_csv('hours.csv')

In [None]:
def clean_tips_string(tips_str):
    """
    Cleans and extracts tip texts from a string representation of tips list.

    Args:
        tips_str: String representation of tips list
    Returns:
        list: List of tip texts
    """
    try:
        if isinstance(tips_str, str):
            tips_str = tips_str.replace("'", '"').replace('u"', '"')
            tips = json.loads(tips_str)
        else:
            tips = tips_str

        return [tip['text'] for tip in tips]
    except Exception as e:
        try:
            tips = ast.literal_eval(str(tips_str))
            return [tip['text'] for tip in tips]
        except:
            print(f"Error processing tips: {tips_str}")
            return []

def create_tips_rows(df):
    """
    Creates a new DataFrame with separate rows for each tip text.

    Args:
        df: Input DataFrame with business info and tips column
    Returns:
        DataFrame: New DataFrame with separate rows for each tip
    """
    processed_rows = []

    for _, row in df.iterrows():
        try:
            tip_texts = clean_tips_string(row['tips'])

            for text in tip_texts:
                processed_rows.append({
                    'business_id': row['business_id'],
                    'name': row['name'],
                    'review': text
                })

        except Exception as e:
            print(f"Error processing row for {row['name']}: {str(e)}")
            continue

    return pd.DataFrame(processed_rows)

In [None]:
tips_df = create_tips_rows(df)

In [None]:
tips_df.to_csv('tips.csv')

In [None]:
df[['name', 'business_id', 'menu']].to_csv('menu.csv')

In [None]:
df.categories.unique()

array(['Restaurants, American (New)',
       'Tea Rooms, Korean, Asian Fusion, Chicken Wings, Food, Restaurants, Specialty Food',
       'Chicken Wings, American (Traditional), Restaurants, Nightlife, Bars, American (New)',
       'Bars, French, Restaurants, Nightlife, Breakfast & Brunch',
       'Food, American (New), Breakfast & Brunch, Restaurants, Diners, Desserts, Caterers, Event Planning & Services',
       'American (New), Sushi Bars, Restaurants, Japanese',
       'American (Traditional), Restaurants, Cafes, Comfort Food, Vegetarian, Breakfast & Brunch, American (New)',
       'Restaurants, Diners, American (New), American (Traditional), Salad',
       'Active Life, Restaurants, Venues & Event Spaces, Music Venues, Botanical Gardens, Museums, Event Planning & Services, Religious Organizations, Parks, Arts & Entertainment, French, Nightlife',
       'American (New), Vegetarian, Restaurants, Bars, Nightlife, Beer Bar',
       'Steakhouses, Vegetarian, Tacos, Seafood, Mexican, Veg

In [None]:
def transform_categories_to_reviews(df):
    rows = []
    for _, row in df.iterrows():
        restaurant_name = row['name']
        categories = row['categories'].split(', ')
        for category in categories:
            review = f"{restaurant_name} offers excellent options for {category}."
            rows.append({'restaurant_name': restaurant_name, 'category': category, 'review': review})

    new_df = pd.DataFrame(rows)
    return new_df

transformed_df = transform_categories_to_reviews(df)

In [None]:
transformed_df.to_csv('categories.csv')