In [1]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

def extract_hotel_info(url):
    # Send a request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract hotel name from the meta tag
        hotel_name = soup.find('meta', {'name': 'description'}).get('content').split('-')[0].strip()

        # Extract JSON data from the specified script tag
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        if script_tag:
            json_data = json.loads(script_tag.string)
            location = json_data.get('address', {}).get('streetAddress')
        else:
            print('JSON data not found in the specified script tag.')
            location = None

        # Extract amenities
        amenities_tag = soup.find('ul', {'style': 'padding-left:0;list-style-type:circle'})
        if amenities_tag:
            amenities = amenities_tag.find_all('div', class_='css-901oao')
            amenities_list = [amenity.text for amenity in amenities]
            amenities_str = ', '.join(amenities_list)
        else:
            amenities_str = None

        # Create a DataFrame for hotel information
        data = {
            'Hotel Name': [hotel_name],
            'Location': [location],
            'Amenities': [amenities_str]
        }
        df = pd.DataFrame(data)

        return df
    else:
        print(f'Failed to retrieve data from {url}. Status Code: {response.status_code}')
        return None

# Example usage:
url = 'https://www.traveloka.com/en-ph/hotel/philippines/diamond-hotel-philippines-1000000143368?contexts=%7B%22accessCode%22%3A%2257281PHSEO41911%22%7D'
hotel_df = extract_hotel_info(url)

# Display the DataFrame
print(hotel_df)


                  Hotel Name  \
0  Diamond Hotel Philippines   

                                            Location  \
0  Roxas Boulevard corner Dr. J. Quintos Street, ...   

                                           Amenities  
0  Bellhop, Welcoming drinks, Concierge, Money ch...  


In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_hotel_info(url):
    # Send a request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract hotel name from the specified h1 tag
        hotel_name_tag = soup.find('h1', class_='x_AX-hotel-name')
        hotel_name = hotel_name_tag.get_text(strip=True) if hotel_name_tag else None
        
        # Extract location from the specified div tag
        location_tag = soup.find('div', class_='x_AX-address')
        location = ', '.join(element.text.strip() for element in location_tag.find_all('span')) if location_tag else None

        # Extract amenities
        amenities_list = soup.select('.t8Xi-amenity-name')
        amenities = ', '.join([amenity.text.strip() for amenity in amenities_list]) if amenities_list else None

        # Extract price
        price_tag = soup.find('span', class_='iWwa-data')
        price = price_tag.text.strip() if price_tag else None
        
        # Extract number of available rooms
        rooms_span = soup.find('span', class_='G2iq-displayText')
        rooms_info = rooms_span['aria-label'] if rooms_span else None

        # Extract check-in and check-out information
        check_in_out_td = soup.find('td', class_='Q6C3-description')
        check_in_out_info = check_in_out_td.text.strip() if check_in_out_td else None

       # Extract customer review information
        review_content_div = soup.find('div', class_='c2oma-review-content')

        if review_content_div:
            review_rating = review_content_div.find('div', class_='c2oma-rating').text.strip() if review_content_div.find('div', class_='c2oma-rating') else None
            review_user_info = review_content_div.find('div', class_='c2oma-user-info').text.strip() if review_content_div.find('div', class_='c2oma-user-info') else None
            review_text_span = review_content_div.find('span', class_='c2oma-review-type-label')

            if review_text_span:
                review_type_label = review_text_span.text.strip()
                review_text = review_text_span.find_next('span').text.strip()
            else:
                review_type_label = None
                review_text = None
        else:
            review_rating = None
            review_user_info = None
            review_type_label = None
            review_text = None

        # Create a DataFrame for hotel information
        data = {
        'Hotel Name': [hotel_name],
        'Location': [location],
        'Amenities': [amenities],
        'Price': [price],
        'Number of Available Rooms': [rooms_info],
        'Other Data': [check_in_out_info],
        'Customer Review Rating': [review_rating],
        'Customer Review User Info': [review_user_info],
        'Customer Review Type Label': [review_type_label],
        'Customer Review Text': [review_text],
        }

        df = pd.DataFrame(data)
        return df
    else:
        print(f'Failed to retrieve data from {url}. Status Code: {response.status_code}')
        return None

# Example usage:
url = 'https://www.kayak.com.ph/Manila-Hotels-Summit-Hotel-Magnolia.2246932.ksp'
hotel_df = extract_hotel_info(url)

# Display the DataFrame
print(hotel_df)


              Hotel Name                                           Location  \
0  Summit Hotel Magnolia  Doña Hemady St. Cor. Aurora Boulevard, Manila,...   

                                           Amenities   Price  \
0  Free Wi-Fi, Tea/coffee maker, Free parking, Bu...  ₱2,270   

  Number of Available Rooms  \
0          1 room, 2 guests   

                                          Other Data Customer Review Rating  \
0  Check in anytime after 2:00 PM, check out anyt...                   None   

  Customer Review User Info Customer Review Type Label Customer Review Text  
0                      None                       None                 None  


In [10]:
import json
import pandas as pd

# Replace 'your_json_string' with the actual JSON string
json_string = '''
{"@context":"http://schema.org","@type":"ItemList","itemListOrder":"http://schema.org/ItemListOrderAscending","description":"16 best hotels in Manila","itemListElement":[{"@type":"ListItem","position":1,"url":"https://www.kayak.com.ph/Manila-Hotels-Manila-Manor-Hotel.334804.ksp"},{"@type":"ListItem","position":2,"url":"https://www.kayak.com.ph/Manila-Hotels-The-Bayleaf-Intramuros.653348.ksp"},{"@type":"ListItem","position":3,"url":"https://www.kayak.com.ph/Manila-Hotels-Amelie-Hotel-Manila.2224980.ksp"},{"@type":"ListItem","position":4,"url":"https://www.kayak.com.ph/Manila-Hotels-Riviera-Mansion-Hotel.371833.ksp"},{"@type":"ListItem","position":5,"url":"https://www.kayak.com.ph/Manila-Hotels-Palm-Grove-Hotel.185266.ksp"},{"@type":"ListItem","position":6,"url":"https://www.kayak.com.ph/Manila-Hotels-Summit-Hotel-Magnolia.2246932.ksp"},{"@type":"ListItem","position":7,"url":"https://www.kayak.com.ph/Manila-Hotels-Malate-Pensionne.267453.ksp"},{"@type":"ListItem","position":8,"url":"https://www.kayak.com.ph/Manila-Hotels-Ramada-by-Wyndham-Manila-Central.417087.ksp"},{"@type":"ListItem","position":9,"url":"https://www.kayak.com.ph/Manila-Hotels-Rizal-Park-Hotel.2985701.ksp"},{"@type":"ListItem","position":10,"url":"https://www.kayak.com.ph/Manila-Hotels-Red-Planet-Manila-Ortigas.2041353.ksp"},{"@type":"ListItem","position":11,"url":"https://www.kayak.com.ph/Manila-Hotels-Hotel-Kimberly-Manila.156655.ksp"},{"@type":"ListItem","position":12,"url":"https://www.kayak.com.ph/Manila-Hotels-Octagon-Mansion-Hotel.316030.ksp"},{"@type":"ListItem","position":13,"url":"https://www.kayak.com.ph/Manila-Hotels-City-Garden-Suites-Manila.115786.ksp"},{"@type":"ListItem","position":14,"url":"https://www.kayak.com.ph/Manila-Hotels-Eurotel-Makati.334631.ksp"},{"@type":"ListItem","position":15,"url":"https://www.kayak.com.ph/Manila-Hotels-Manila-Marriott-Hotel.315862.ksp"},{"@type":"ListItem","position":16,"url":"https://www.kayak.com.ph/Manila-Hotels-Citystate-Tower-Hotel.681300.ksp"}]}
'''

# Parse the JSON string
json_data = json.loads(json_string)

# Extract URLs using list comprehension
urls = [item['url'] for item in json_data['itemListElement']]

# Create a DataFrame
consolidated_url = pd.DataFrame({'Hotel_URL': urls})

# Print the DataFrame
print(consolidated_url)

# Convert DataFrame to a list
url_list = consolidated_url['Hotel_URL'].tolist()

# Print the list
print(url_list)


                                            Hotel_URL
0   https://www.kayak.com.ph/Manila-Hotels-Manila-...
1   https://www.kayak.com.ph/Manila-Hotels-The-Bay...
2   https://www.kayak.com.ph/Manila-Hotels-Amelie-...
3   https://www.kayak.com.ph/Manila-Hotels-Riviera...
4   https://www.kayak.com.ph/Manila-Hotels-Palm-Gr...
5   https://www.kayak.com.ph/Manila-Hotels-Summit-...
6   https://www.kayak.com.ph/Manila-Hotels-Malate-...
7   https://www.kayak.com.ph/Manila-Hotels-Ramada-...
8   https://www.kayak.com.ph/Manila-Hotels-Rizal-P...
9   https://www.kayak.com.ph/Manila-Hotels-Red-Pla...
10  https://www.kayak.com.ph/Manila-Hotels-Hotel-K...
11  https://www.kayak.com.ph/Manila-Hotels-Octagon...
12  https://www.kayak.com.ph/Manila-Hotels-City-Ga...
13  https://www.kayak.com.ph/Manila-Hotels-Eurotel...
14  https://www.kayak.com.ph/Manila-Hotels-Manila-...
15  https://www.kayak.com.ph/Manila-Hotels-Citysta...
['https://www.kayak.com.ph/Manila-Hotels-Manila-Manor-Hotel.334804.ksp', 'https://

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_hotel_info(url):
    # Send a request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract hotel name from the specified h1 tag
        hotel_name_tag = soup.find('h1', class_='x_AX-hotel-name')
        hotel_name = hotel_name_tag.get_text(strip=True) if hotel_name_tag else None
        
        # Extract location from the specified div tag
        location_tag = soup.find('div', class_='x_AX-address')
        location = ', '.join(element.text.strip() for element in location_tag.find_all('span')) if location_tag else None

        # Extract amenities
        amenities_list = soup.select('.t8Xi-amenity-name')
        amenities = ', '.join([amenity.text.strip() for amenity in amenities_list]) if amenities_list else None

        # Extract price
        price_tag = soup.find('span', class_='iWwa-data')
        price = price_tag.text.strip() if price_tag else None
        
        # Extract number of available rooms
        rooms_span = soup.find('span', class_='G2iq-displayText')
        rooms_info = rooms_span['aria-label'] if rooms_span else None

        # Extract check-in and check-out information
        check_in_out_td = soup.find('td', class_='Q6C3-description')
        check_in_out_info = check_in_out_td.text.strip() if check_in_out_td else None

       # Extract customer review information
        review_content_div = soup.find('div', class_='c2oma-review-content')

        if review_content_div:
            review_rating = review_content_div.find('div', class_='c2oma-rating').text.strip() if review_content_div.find('div', class_='c2oma-rating') else None
            review_user_info = review_content_div.find('div', class_='c2oma-user-info').text.strip() if review_content_div.find('div', class_='c2oma-user-info') else None
            review_text_span = review_content_div.find('span', class_='c2oma-review-type-label')

            if review_text_span:
                review_type_label = review_text_span.text.strip()
                review_text = review_text_span.find_next('span').text.strip()
            else:
                review_type_label = None
                review_text = None
        else:
            review_rating = None
            review_user_info = None
            review_type_label = None
            review_text = None

        # Create a DataFrame for hotel information
        data = {
        'Hotel Name': [hotel_name],
        'Location': [location],
        'Amenities': [amenities],
        'Price': [price],
        'Number of Available Rooms': [rooms_info],
        'Other Data': [check_in_out_info],
        'Customer Review Rating': [review_rating],
        'Customer Review User Info': [review_user_info],
        'Customer Review Type Label': [review_type_label],
        'Customer Review Text': [review_text],
        }

        df = pd.DataFrame(data)
        return df
    else:
        print(f'Failed to retrieve data from {url}. Status Code: {response.status_code}')
        return None

# Replace 'your_json_string' with the actual JSON string
json_string = '''
{"@context":"http://schema.org","@type":"ItemList","itemListOrder":"http://schema.org/ItemListOrderAscending","description":"16 best hotels in Manila","itemListElement":[{"@type":"ListItem","position":1,"url":"https://www.kayak.com.ph/Manila-Hotels-Manila-Manor-Hotel.334804.ksp"},{"@type":"ListItem","position":2,"url":"https://www.kayak.com.ph/Manila-Hotels-The-Bayleaf-Intramuros.653348.ksp"},{"@type":"ListItem","position":3,"url":"https://www.kayak.com.ph/Manila-Hotels-Amelie-Hotel-Manila.2224980.ksp"},{"@type":"ListItem","position":4,"url":"https://www.kayak.com.ph/Manila-Hotels-Riviera-Mansion-Hotel.371833.ksp"},{"@type":"ListItem","position":5,"url":"https://www.kayak.com.ph/Manila-Hotels-Palm-Grove-Hotel.185266.ksp"},{"@type":"ListItem","position":6,"url":"https://www.kayak.com.ph/Manila-Hotels-Summit-Hotel-Magnolia.2246932.ksp"},{"@type":"ListItem","position":7,"url":"https://www.kayak.com.ph/Manila-Hotels-Malate-Pensionne.267453.ksp"},{"@type":"ListItem","position":8,"url":"https://www.kayak.com.ph/Manila-Hotels-Ramada-by-Wyndham-Manila-Central.417087.ksp"},{"@type":"ListItem","position":9,"url":"https://www.kayak.com.ph/Manila-Hotels-Rizal-Park-Hotel.2985701.ksp"},{"@type":"ListItem","position":10,"url":"https://www.kayak.com.ph/Manila-Hotels-Red-Planet-Manila-Ortigas.2041353.ksp"},{"@type":"ListItem","position":11,"url":"https://www.kayak.com.ph/Manila-Hotels-Hotel-Kimberly-Manila.156655.ksp"},{"@type":"ListItem","position":12,"url":"https://www.kayak.com.ph/Manila-Hotels-Octagon-Mansion-Hotel.316030.ksp"},{"@type":"ListItem","position":13,"url":"https://www.kayak.com.ph/Manila-Hotels-City-Garden-Suites-Manila.115786.ksp"},{"@type":"ListItem","position":14,"url":"https://www.kayak.com.ph/Manila-Hotels-Eurotel-Makati.334631.ksp"},{"@type":"ListItem","position":15,"url":"https://www.kayak.com.ph/Manila-Hotels-Manila-Marriott-Hotel.315862.ksp"},{"@type":"ListItem","position":16,"url":"https://www.kayak.com.ph/Manila-Hotels-Citystate-Tower-Hotel.681300.ksp"}]}
'''

# Parse the JSON string
json_data = json.loads(json_string)

# Extract URLs using list comprehension
urls = [item['url'] for item in json_data['itemListElement']]

# List to store DataFrames for each hotel
hotel_dfs = []

# Iterate through each URL and call the extract_hotel_info function
for url in urls:
    hotel_df = extract_hotel_info(url)
    if hotel_df is not None:
        hotel_dfs.append(hotel_df)

# Concatenate the list of DataFrames into a single DataFrame
result_df = pd.concat(hotel_dfs, ignore_index=True)

# Display the final DataFrame
print(result_df)


                          Hotel Name  \
0                 Manila Manor Hotel   
1                               None   
2                Amelie Hotel Manila   
3              Riviera Mansion Hotel   
4                  Palm Hotel Manila   
5              Summit Hotel Magnolia   
6                   Malate Pensionne   
7   Ramada by Wyndham Manila Central   
8                               None   
9                 Red Planet Ortigas   
10             Hotel Kimberly Manila   
11             Octagon Mansion Hotel   
12         City Garden Suites Manila   
13                              None   
14             Manila Marriott Hotel   
15             Citystate Tower Hotel   

                                             Location  \
0   1660 Jorge Bocobo Street, Manila, 1004, Philip...   
1                                                None   
2       1667 Bocobo Street, Manila, 1004, Philippines   
3          1638 A. Mabini Street, Manila, Philippines   
4   524 Pedro Gil Cor, Adriatico S

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json


def extract_hotel_info(url):
    # Send a request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract hotel name from the specified h1 tag
        hotel_name_tag = soup.find('h1', class_='x_AX-hotel-name')
        hotel_name = hotel_name_tag.get_text(strip=True) if hotel_name_tag else None
        
        # Extract location from the specified div tag
        location_tag = soup.find('div', class_='x_AX-address')
        location = ', '.join(element.text.strip() for element in location_tag.find_all('span')) if location_tag else None

        # Extract amenities
        amenities_list = soup.select('.t8Xi-amenity-name')
        amenities = ', '.join([amenity.text.strip() for amenity in amenities_list]) if amenities_list else None

        # Extract price
        price_tag = soup.find('span', class_='iWwa-data')
        price = price_tag.text.strip() if price_tag else None
        
        # Extract number of available rooms
        rooms_span = soup.find('span', class_='G2iq-displayText')
        rooms_info = rooms_span['aria-label'] if rooms_span else None

        # Extract check-in and check-out information
        check_in_out_td = soup.find('td', class_='Q6C3-description')
        check_in_out_info = check_in_out_td.text.strip() if check_in_out_td else None

       # Extract customer review information
        review_content_div = soup.find('div', class_='c2oma-review-content')

        if review_content_div:
            review_rating = review_content_div.find('div', class_='c2oma-rating').text.strip() if review_content_div.find('div', class_='c2oma-rating') else None
            review_user_info = review_content_div.find('div', class_='c2oma-user-info').text.strip() if review_content_div.find('div', class_='c2oma-user-info') else None
            review_text_span = review_content_div.find('span', class_='c2oma-review-type-label')

            if review_text_span:
                review_type_label = review_text_span.text.strip()
                review_text = review_text_span.find_next('span').text.strip()
            else:
                review_type_label = None
                review_text = None
        else:
            review_rating = None
            review_user_info = None
            review_type_label = None
            review_text = None

        # Create a DataFrame for hotel information
        data = {
        'Hotel Name': [hotel_name],
        'Location': [location],
        'Amenities': [amenities],
        'Price': [price],
        'Number of Available Rooms': [rooms_info],
        'Other Data': [check_in_out_info],
        'Customer Review Rating': [review_rating],
        'Customer Review User Info': [review_user_info],
        'Customer Review Type Label': [review_type_label],
        'Customer Review Text': [review_text],
        }

        df = pd.DataFrame(data)
        return df
    else:
        print(f'Failed to retrieve data from {url}. Status Code: {response.status_code}')
        return None

def extract_urls_from_json(json_string):
    json_data = json.loads(json_string)
    return [item['url'] for item in json_data['itemListElement']]

def append_new_urls(existing_urls, additional_urls):
    return existing_urls + additional_urls

def run_all_and_display(json_string, existing_urls=None):
    # Extract URLs from JSON
    json_urls = extract_urls_from_json(json_string)

    # If existing_urls is not provided, initialize it as an empty list
    existing_urls = existing_urls or []

    # Append new URLs
    all_urls = append_new_urls(existing_urls, json_urls)

    # List to store DataFrames for each hotel
    hotel_dfs = []

    # Iterate through each URL and call the extract_hotel_info function
    for url in all_urls:
        hotel_df = extract_hotel_info(url)
        if hotel_df is not None:
            hotel_dfs.append(hotel_df)

    # Load the existing DataFrame from the CSV file
    csv_file_path = 'output.csv'
    try:
        existing_df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        existing_df = pd.DataFrame()

    # Concatenate the list of DataFrames into a single DataFrame
    result_df = pd.concat([existing_df] + hotel_dfs, ignore_index=True)

    # Save the updated DataFrame to the CSV file
    result_df.to_csv(csv_file_path, index=False)
    print(f"DataFrame updated and saved to {csv_file_path}")

    # Automatically append new URLs to existing ones for subsequent runs
    existing_urls.extend(json_urls)

#execution
json_string = '''
{"@context":"http://schema.org","@type":"ItemList","itemListOrder":"http://schema.org/ItemListOrderAscending","description":"16 best hotels in Antipolo","itemListElement":[{"@type":"ListItem","position":1,"url":"https://www.kayak.com.ph/Pasay-Hotels-Urban-Travellers-Hotel.618466.ksp"},{"@type":"ListItem","position":2,"url":"https://www.kayak.com.ph/Paranaque-Hotels-Go-Hotels-Manila-Airport-Road.2677731.ksp"},{"@type":"ListItem","position":3,"url":"https://www.kayak.com.ph/Makati-Hotels-Kl-Serviced-Residences.720304.ksp"},{"@type":"ListItem","position":4,"url":"https://www.kayak.com.ph/Makati-Hotels-Makati-Crown-Regency-Hotel.244296.ksp"},{"@type":"ListItem","position":5,"url":"https://www.kayak.com.ph/Makati-Hotels-Makati-Shangri-La-Hotel-Manila.13762.ksp"},{"@type":"ListItem","position":6,"url":"https://www.kayak.com.ph/Pasay-Hotels-Kabayan-Hotel.319385.ksp"},{"@type":"ListItem","position":7,"url":"https://www.kayak.com.ph/Makati-Hotels-The-Picasso-Boutique-Serviced-Residences.335450.ksp"},{"@type":"ListItem","position":8,"url":"https://www.kayak.com.ph/Makati-Hotels-Guijo-Suites-Makati.639611.ksp"},{"@type":"ListItem","position":9,"url":"https://www.kayak.com.ph/Makati-Hotels-Raffles-Makati.485041.ksp"},{"@type":"ListItem","position":10,"url":"https://www.kayak.com.ph/Makati-Hotels-Valero-Grand-Suites-By-Swiss-Belhotel-Makati.2985932.ksp"},{"@type":"ListItem","position":11,"url":"https://www.kayak.com.ph/Pasay-Hotels-Park-Bed-Breakfast.88383.ksp"},{"@type":"ListItem","position":12,"url":"https://www.kayak.com.ph/Pasay-Hotels-Conrad-Manila.2588129.ksp"},{"@type":"ListItem","position":13,"url":"https://www.kayak.com.ph/Pasay-Hotels-Midas-Hotel-Casino.621294.ksp"},{"@type":"ListItem","position":14,"url":"https://www.kayak.com.ph/Makati-Hotels-Fairmont-Makati.462424.ksp"},{"@type":"ListItem","position":15,"url":"https://www.kayak.com.ph/Pasay-Hotels-Dg-Budget-Hotel-Salem.627120.ksp"},{"@type":"ListItem","position":16,"url":"https://www.kayak.com.ph/Pasay-Hotels-Sofitel-Philippine-Plaza-Manila.324413.ksp"}]}'''

# Call the function to run all and display the result
run_all_and_display(json_string)


DataFrame updated and saved to output.csv
