In [425]:
import time
from random import randint
from bs4 import BeautifulSoup
from bs4 import NavigableString
import requests
import pandas as pd
import time
import random
import time

In [329]:
def scrape_hotel_urls(
    base_url='https://www.hotelleriesuisse.ch/de/branche-und-politik/branchenverzeichnis/hotel-page-',
    total_pages=365,
    output_path='hotel_urls.feather',
    site_root='https://www.hotelleriesuisse.ch',
    timeout=30,
):
    """
    Scrape hotel detail-page URLs from the paginated directory and save them to a Feather file.

    Pages ``1..total_pages`` are fetched by appending the page number to
    ``base_url``. On each page, the first ``<a href=...>`` inside every
    ``<li class="CardGrid--grid-item">`` is collected. Relative links are
    prefixed with ``site_root``; links that are already absolute are kept
    unchanged. The result is written as a single-column (``Link``) DataFrame.

    Args:
        base_url (str): Listing URL without the trailing page number.
        total_pages (int): Number of listing pages to request.
        output_path (str): Destination of the Feather file.
        site_root (str): Scheme and host prepended to relative links.
        timeout (float): Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        None

    Notes:
        A page that cannot be fetched (network error, timeout, HTTP error
        status) is reported via ``print`` and skipped, so one bad page does
        not discard the links already collected.
    """
    data = []

    for page in range(1, total_pages + 1):
        url = f"{base_url}{page}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while requesting page {page}: {e}")
        else:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Each hotel card is a list item with class 'CardGrid--grid-item'
            for item in soup.find_all('li', class_='CardGrid--grid-item'):
                # href=True skips cards without a usable anchor instead of
                # raising TypeError on `None['href']`
                anchor = item.find('a', href=True)
                if anchor is None:
                    continue
                link = anchor['href']
                if not link.startswith(('http://', 'https://')):
                    link = site_root + link
                data.append({'Link': link})

        # Random sleep timer between 1 and 3 seconds (also after a failed page)
        time.sleep(randint(1, 3))

    # Passing `columns` keeps the 'Link' column present even when nothing was scraped
    df = pd.DataFrame(data, columns=['Link'])

    # Save DataFrame to a Feather file
    df.to_feather(output_path)



In [325]:

# Load the previously saved frame; its `Link` column is used below to pick the URLs to scrape.
# NOTE(review): scrape_hotel_urls() writes 'hotel_urls.feather', not 'scraped_data.feather' —
# confirm which file is intended so the notebook survives Restart & Run All.
df = pd.read_feather('scraped_data.feather')

In [422]:
# Row count of the loaded frame.
len(df)

4379

In [455]:
# Development sample: rows 30..39 of the `Link` column.
links_to_srape = df["Link"][30:40]

In [459]:
def scrape_url(url, timeout=30):
    """
    Scrape one hotel detail page and return the extracted fields.

    Args:
        url (str): The URL of the hotel page to scrape.
        timeout (float): Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        dict | None: A dictionary with the extracted fields, or ``None`` when
        the request fails or an unexpected error occurs while parsing (the
        error is printed in both cases). Fields that cannot be found on the
        page are ``None``. ``"Adresse_not_mapped"`` holds the raw address
        text when the address block exists but does not have at least three
        text lines (name, street, "PLZ Ort").
    """
    # User-agent headers to rotate between; one is picked at random per call
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ]
    headers = {"User-Agent": random.choice(user_agents)}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making the request: {e}")
        return None

    try:
        soup = BeautifulSoup(response.content, "html.parser")

        def text_or_none(element):
            # Stripped text of a matched element, or None when nothing matched
            return element.get_text(strip=True) if element is not None else None

        hotel_name = text_or_none(soup.find('title'))
        hotel_summary = text_or_none(soup.select_one(".Text--copy.richtext"))
        contact_name = text_or_none(soup.select_one(".Avatar--name"))

        # The address is read from the direct text nodes of the paragraph;
        # child tags (e.g. <br>) are skipped and only act as line separators.
        address_element = soup.select_one(".Button--label p")
        address_lines = []
        if address_element is not None:
            for content in address_element.contents:
                if isinstance(content, NavigableString):
                    line = content.strip()
                    if line:
                        address_lines.append(line)

        hotelname = strasse = plz = ort = adresse_not_mapped = None
        # Three lines are required because index 2 ("PLZ Ort") is read below;
        # the former `>= 2` guard raised IndexError for two-line addresses.
        if len(address_lines) >= 3:
            hotelname = address_lines[0]
            strasse = address_lines[1]
            plz_ort = address_lines[2].split(" ", 1)
            plz = plz_ort[0]
            ort = plz_ort[1] if len(plz_ort) > 1 else None
        elif address_element is not None:
            # Keep the unparsed address as plain text rather than a live Tag,
            # which would keep the whole parse tree referenced from the result.
            adresse_not_mapped = address_element.get_text(" ", strip=True) or None

        hotel_features = [
            tag.get_text(strip=True)
            for tag in soup.select(".TagList--list--item .BlockLink.active")
        ]
        # NOTE(review): "Activities" mirrors "Hotel Features" (same selector) —
        # confirm whether a separate selector was intended. A copy is stored so
        # the two fields do not share one mutable list.
        activities = list(hotel_features)

        # Additional hotel information: each <strong> value is keyed by the
        # text node that immediately precedes it.
        variables = {}
        for richtext_div in soup.select('div.richtext:not(.Text--copy)'):
            for strong_tag in richtext_div.find_all('strong'):
                label = strong_tag.previous_sibling
                # previous_sibling may be None or a Tag; only text nodes are labels
                if isinstance(label, NavigableString):
                    key = label.strip()
                    if key:
                        variables[key] = strong_tag.text.strip()

        check_in = variables.get('Check-In')
        check_out = variables.get('Check-Out')
        zimmer_apartments = variables.get('Zimmer/Apartments')
        betten = variables.get('Betten')
        seminare_bis = variables.get('Seminare bis')
        bankette_bis = variables.get('Bankette bis')

        # Create a dictionary with all the extracted information
        hotel_info = {
            "Hotel Name": hotel_name,
            "Summary": hotel_summary,
            "Contact Name": contact_name,
            "Hotelname": hotelname,
            "Strasse": strasse,
            "PLZ": plz,
            "Ort": ort,
            "Adresse_not_mapped" : adresse_not_mapped,
            "Hotel Features": hotel_features,
            "Activities": activities,
            "Zimmer/Apartments": zimmer_apartments,
            "Betten": betten,
            "Check-In": check_in,
            "Check-Out": check_out,
            "Maximale Seminargrösse in Personen": seminare_bis,
            "Maximale Bankettgrösse in Personen": bankette_bis
        }

        return hotel_info
    except Exception as e:
        # Best-effort scraping: report and skip a page that cannot be parsed
        print(f"An error occurred during scraping: {e}")
        return None





SyntaxError: invalid syntax (811899867.py, line 91)

In [457]:
# Set a delay range between requests (in seconds)
min_delay = 0  # Minimum delay
max_delay = 0.1  # Maximum delay

# Scrape information from each URL in the list
scraped_data = []
for url in links_to_srape:
    data = scrape_url(url)
    # scrape_url returns None on failure; keep only real records so the
    # DataFrame below is built from dictionaries alone
    if data is not None:
        scraped_data.append(data)

    # Randomize the delay between requests: draw a fresh value on every
    # iteration instead of reusing one value drawn before the loop
    delay = random.uniform(min_delay, max_delay)
    time.sleep(delay)

# Create a DataFrame from the scraped data
df_hotels = pd.DataFrame(scraped_data)

In [458]:
# Display the scraped hotel records.
df_hotels

Unnamed: 0,Hotel Name,Summary,Contact Name,Hotelname,Strasse,PLZ,Ort,Hotel Features,Activities,Zimmer/Apartments,Betten,Check-In,Check-Out,Maximale Seminargrösse in Personen,Maximale Bankettgrösse in Personen
0,airporthotel Grenchen,Willkommen - Bienvenue - Benvenuti - Welcome,,,,,,[​ Ambassador Swiss Hotels],[​ Ambassador Swiss Hotels],41,44,,,,
1,aja Zürich. Das City-Resort,Städtetrip und Entspannungsurlaub in einem Ih...,Sven Lehmann,aja Zürich. Das City-Resort,Vulkanstrasse 108b,8048.0,Zürich,"[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...","[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...",318,636,24-Stunden Check-in,24-Stunden Check-out,8.0,60.0
2,Aktiv Hotel & Spa Hannigalp,Willkommen in Grächen Aktiv in den Walliser Be...,Olivier Andenmatten,Aktiv Hotel & Spa Hannigalp,Heiminen 468,3925.0,Grächen,"[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...","[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...",29,75,14:00\n ...,08:00\n ...,10.0,100.0
3,Aktivhostel HängeMatt,"Unser kleines, familienbetriebenes Hostel biet...",,,,,,"[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...","[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...",6,16,17:00\n ...,07:30\n ...,,
4,Al Ponte Albergo - Ristorante,Traumhaft gelegen an einem Südhang in Cademari...,Patric Gatti,Al Ponte Albergo - Ristorante,Via Cantonale di Sopra,6936.0,Cademario,"[​ Öffentliches Restaurant, ​ Aussichtsrestaur...","[​ Öffentliches Restaurant, ​ Aussichtsrestaur...",12,26,14:00\n ...,08:00\n ...,,
5,Alaïa Lodge,Willkommen - Bienvenue - Benvenuti - Welcome,,,,,,"[​ Rollstuhlgängige Toilette, ​ Betthöhe 45-50...","[​ Rollstuhlgängige Toilette, ​ Betthöhe 45-50...",36,71,,,,
6,Albana Hotel,Das neu renovierte Albana Hotel & Suites Silva...,Daniel Bosshard-Jürisaar,Albana Hotel,Via vers Mulins 5,7513.0,Silvaplana,"[​ Öffentliches Restaurant, ​ Aussichtsrestaur...","[​ Öffentliches Restaurant, ​ Aussichtsrestaur...",33,75,15:00\n ...,08:00\n ...,40.0,70.0
7,Albergo Al Giardinetto,Das Hotel Restaurant Al Giardinetto in Biasca ...,,,,,,"[​ Öffentliches Restaurant, ​ Aussichtsrestaur...","[​ Öffentliches Restaurant, ​ Aussichtsrestaur...",23,67,,,120.0,120.0
8,Albergo Altavilla,Unser familienfreundliches Hotel liegt an eine...,,,,,,"[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...","[​ Öffentliches Restaurant, ​ Sitzungszimmer, ...",9,22,14:00\n ...,10:30\n ...,60.0,60.0
9,Albergo Bellavista,Willkommen - Bienvenue - Benvenuti - Welcome,Gabriela Niggeler,Albergo Bellavista,Via Costa die Mezzo 77,6614.0,Brissago,[],[],8,16,,,,
