## Webscraping and Applied ML - Project
_Authors_: Alessia SARRITZU, Alberto MARTINELLI

### Step 1: Get hotel data from Foursquare
* Import the necessary libraries
* Define the necessary functions for the API calls and data retrieval
* Retrieve and save the data in .csv format

In [25]:
from dotenv import load_dotenv
import os
import pandas as pd
import requests

In [26]:
def get_api_key():
    """Return the API key stored in the ``API_KEY`` environment variable.

    The ``.env`` file is loaded first with ``load_dotenv()`` so a key defined
    there is visible through ``os.getenv``.

    Returns:
        str: The non-empty value of ``API_KEY``.

    Raises:
        SystemExit: If ``API_KEY`` is unset or empty (exit status 1).
    """
    load_dotenv()

    api_key = os.getenv('API_KEY')

    # An empty string is as unusable as a missing variable.
    if not api_key:
        print("API key is missing in the .env file!")
        # Raise SystemExit directly: the bare ``exit()`` helper is injected by
        # the ``site`` module (not always present) and would report status 0.
        raise SystemExit(1)
    return api_key

def _flatten_place(place):
    """Build one flat row from a Type 1 item (a place carrying ``'fsq_id'``)."""
    # Optional sub-objects default to {} so absent fields become None
    # instead of raising KeyError.
    location = place.get('location') or {}
    main_geocode = (place.get('geocodes') or {}).get('main') or {}
    return {
        'fsq_id': place['fsq_id'],
        'name': place.get('name'),
        'address': location.get('address'),
        'locality': location.get('locality'),
        'country': location.get('country'),
        'formatted_address': location.get('formatted_address'),
        'latitude': main_geocode.get('latitude'),
        'longitude': main_geocode.get('longitude'),
        'distance': place.get('distance'),
        'link': place.get('link'),
        # Keep only the category names.
        'categories': [
            category['name']
            for category in place.get('categories') or []
            if 'name' in category
        ],
    }


def _flatten_tip(tip):
    """Build one flat row from a Type 2 item (carrying ``'id'`` and ``'text'``)."""
    return {
        'id': tip['id'],
        'created_at': tip.get('created_at'),
        'text': tip.get('text'),
    }


def convert_json_to_df(data):
    """Flatten a parsed JSON payload into a pandas DataFrame.

    Two payload shapes are recognised:

    * Type 1: a dict with a ``'results'`` list whose items carry ``'fsq_id'``.
    * Type 2: a plain list whose items carry ``'id'``, ``'created_at'`` and
      ``'text'``.

    Fields missing from an item are stored as ``None``. Items that match
    neither shape are skipped.

    Args:
        data: Parsed JSON (dict or list). A falsy value means "no data".

    Returns:
        pandas.DataFrame: One row per recognised item, or ``None`` (after
        printing a message) when ``data`` is falsy.
    """
    if not data:
        print("No data available to save.")
        return None

    # Type 1 wraps the items in a 'results' key; Type 2 is the list itself.
    if isinstance(data, dict) and 'results' in data:
        places = data['results']
    else:
        places = data

    flattened_data = []
    for place in places:
        if not isinstance(place, dict):
            continue
        if 'fsq_id' in place:
            flattened_data.append(_flatten_place(place))
        elif 'id' in place:
            flattened_data.append(_flatten_tip(place))
        # Anything else is skipped so that no stale row is appended.

    return pd.DataFrame(flattened_data)

def convert_df_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=filename, index=False)

def get_json_data_from_api(url, api_calls):
    """GET ``url`` with the API key and return the parsed JSON body.

    Args:
        url: Full request URL.
        api_calls: One-element list used as a mutable counter;
            ``api_calls[0]`` is incremented once per request sent.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise ``None`` (the status
        code is printed).

    Raises:
        SystemExit: Propagated from ``get_api_key()`` when no key is configured.
        requests.exceptions.RequestException: On network errors or timeout.
    """
    headers = {
        "Accept": "application/json",
        "Authorization": get_api_key(),
    }

    # A timeout keeps a stalled connection from blocking forever.
    response = requests.get(url, headers=headers, timeout=30)
    api_calls[0] += 1

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # Return explicitly: there is no payload to hand back on failure.
        return None

    return response.json()

In [27]:
# One-element list so get_json_data_from_api can increment the counter in place.
api_calls = [0]
output_file = 'hotel_reviews_API.csv'

# Search for hotels in New York near 124 Hudson Street (up to 50 results).
# Variant restricted to a radius of 150 meters, kept for reference:
# url = "https://api.foursquare.com/v3/places/search?query=hotel&ll=40.720276093678535%2C-74.00855578601094&radius=150"
url = "https://api.foursquare.com/v3/places/search?query=hotel&ll=40.720276093678535%2C-74.00855578601094&limit=50"
data = get_json_data_from_api(url, api_calls)

df = convert_json_to_df(data)

if df is None or 'fsq_id' not in df.columns:
    # Without place ids there is nothing to enrich or to save.
    print("No hotel data retrieved; nothing to save.")
else:
    # For each place id, make one more API call to fetch its tips and store
    # them, comma-joined, in a 'tips' column.
    for fsq_id in df['fsq_id']:
        url = f"https://api.foursquare.com/v3/places/{fsq_id}/tips"
        tips = get_json_data_from_api(url, api_calls)

        # Tolerate a missing/empty tips payload and entries without 'text'.
        tips_text = [
            tip['text']
            for tip in (tips or [])
            if isinstance(tip, dict) and 'text' in tip
        ]

        df.loc[df['fsq_id'] == fsq_id, 'tips'] = ', '.join(tips_text)

    print(df.head())

    convert_df_to_csv(df, output_file)
    # Report the file that was actually written.
    print(f"CSV file has been saved as '{output_file}'")

print("API calls made:", api_calls[0])

                     fsq_id                                   name  \
0  3fd66200f964a52053eb1ee3                       Soho Grand Hotel   
1  54c0151d498e4c827296cd41  Hotel Indigo Lower East Side New York   
2  4a0e0f85f964a520bf751fe3                     Ace Hotel New York   
3  4adfb640f964a520e57c21e3        Lobby Bar at Ace Hotel New York   
4  5ffdc960d85c9a288b3b391f                     Ace Hotel Brooklyn   

               address  locality country  \
0       310 W Broadway  New York      US   
1        171 Ludlow St  New York      US   
2         20 W 29th St  New York      US   
3        1186 Broadway  New York      US   
4  252 Schermerhorn St  Brooklyn      US   

                                               formatted_address   latitude  \
0                  310 W Broadway (Grand St), New York, NY 10013  40.722011   
1  171 Ludlow St (btwn Houston & Stanton St), New York, NY 10002  40.721759   
2                 20 W 29th St (at Broadway), New York, NY 10001  40.745702  

### Step 2: Get hotel data from KAYAK - Web Scraping
* Import the necessary libraries
* Define the necessary functions for Playwright scraping and data retrieval
* Retrieve and save the data in .csv format

In [28]:
from playwright.sync_api import sync_playwright

In [29]:
def scrape_reviews_with_playwright(hotel_url):
    """Scrape the reviews shown on a hotel details page (synchronous API).

    Opens ``hotel_url`` in Chromium, clicks the cookie button if it is
    visible, waits for the reviews header and reads every ``.acD_`` element.

    Args:
        hotel_url: URL of the hotel details page.

    Returns:
        list[dict]: One dict per review with the keys ``'Rating'``,
        ``'Score Description'``, ``'User'``, ``'Pros'`` and ``'Full Review'``.
        A value is ``None`` when its element is absent. The list is empty if
        the reviews section could not be loaded.
    """
    # Output key -> CSS selector, resolved relative to each review container.
    # NOTE(review): the async variant names the third key 'User and date' and
    # looks for an 'Accept' button; both are kept as-is here so existing
    # callers of this function are not affected.
    field_selectors = {
        'Rating': ".wdjx-positive",
        'Score Description': ".acD_-score-description",
        'User': ".acD_-userName",
        'Pros': ".acD_-pros",
        'Full Review': "span[id^='showMoreText']",
    }

    reviews = []
    with sync_playwright() as p:
        # headless=False opens a visible browser window (useful for debugging).
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()

        # Navigate to the hotel details page
        page.goto(hotel_url)

        try:
            # Accept cookies if the cookie banner is present
            cookie_button = page.locator("button:has-text('Accetta')")
            if cookie_button.is_visible():
                cookie_button.click()
                print("Cookies accepted.")

            # Wait for the reviews section to load
            page.wait_for_selector(".acD_-reviews-row-header", timeout=30000)

            for review in page.query_selector_all(".acD_"):
                try:
                    row = {}
                    for field, selector in field_selectors.items():
                        # Query once and reuse the handle: a second lookup
                        # could return None if the DOM changed in between.
                        element = review.query_selector(selector)
                        row[field] = element.inner_text().strip() if element else None
                    reviews.append(row)
                except Exception as e:
                    # Best effort: a broken review must not abort the others.
                    print(f"Error processing review: {e}")

        except Exception as e:
            print(f"Error loading reviews: {e}")
        finally:
            browser.close()

    return reviews

In [30]:
import nest_asyncio
import asyncio
import atexit
from playwright.async_api import async_playwright

# Allow nested event loops
nest_asyncio.apply()

async def scrape_reviews_with_playwright_async(hotel_url):
    """Scrape the reviews shown on a hotel details page (async API).

    Opens ``hotel_url`` in Chromium, clicks the cookie button if it is
    visible, waits for the reviews header and reads every ``.acD_`` element.
    The browser and the Playwright driver are both shut down before the
    function returns, whether or not scraping succeeded.

    Args:
        hotel_url: URL of the hotel details page.

    Returns:
        list[dict]: One dict per review with the keys ``'Rating'``,
        ``'Score Description'``, ``'User and date'``, ``'Pros'`` and
        ``'Full Review'``. A value is ``None`` when its element is absent.
        The list is empty if the reviews section could not be loaded.
    """
    # Output key -> CSS selector, resolved relative to each review container.
    field_selectors = {
        'Rating': ".wdjx-positive",
        'Score Description': ".acD_-score-description",
        'User and date': ".acD_-userName",
        'Pros': ".acD_-pros",
        'Full Review': "span[id^='showMoreText']",
    }

    reviews = []

    # ``async with`` stops the Playwright driver on every exit path. This
    # replaces the former per-call atexit hook, which ran on a different event
    # loop and tried to close an already closed browser.
    async with async_playwright() as pw:
        # headless=False opens a visible browser window (useful for debugging).
        browser = await pw.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # Navigate to the hotel details page
            await page.goto(hotel_url)

            try:
                # Accept cookies if the cookie banner is present
                cookie_button = page.locator("button:has-text('Accept')")
                if await cookie_button.is_visible():
                    await cookie_button.click()
                    print("Cookies accepted.")

                # Wait for the reviews section to load
                await page.wait_for_selector(".acD_-reviews-row-header", timeout=30000)

                for review in await page.query_selector_all(".acD_"):
                    try:
                        row = {}
                        for field, selector in field_selectors.items():
                            element = await review.query_selector(selector)
                            row[field] = (await element.inner_text()).strip() if element else None
                        reviews.append(row)
                    except Exception as e:
                        # Best effort: a broken review must not abort the others.
                        print(f"Error processing review: {e}")

            except Exception as e:
                print(f"Error loading reviews: {e}")
        finally:
            # Runs even when navigation fails, so the browser never leaks.
            await browser.close()

    return reviews

# Run the async scraper from a notebook cell with a direct `await` instead of asyncio.run():
# await scrape_reviews_with_playwright_async(hotel_url)

In [31]:
hotel_url = "https://www.kayak.com/hotels/InterContinental-New-York-Barclay,New-York-p59560-h14931-details"
print("Scraping reviews for the hotel...")
reviews_data = await scrape_reviews_with_playwright_async(hotel_url)

# Save reviews to CSV
if reviews_data:
    reviews_df = pd.DataFrame(reviews_data)
    # process the User and date column to remove the user name and keep only the date (anonymization)
    reviews_df['User and date'] = reviews_df['User and date'].str.split(pat=', ')
    reviews_df['User and date'] = reviews_df['User and date'].transform(lambda l: l[1])
    reviews_df.rename(columns={'User and date': 'Date'}, inplace=True)

    # convert the date in datetime format
    reviews_df['Date'] = pd.to_datetime(reviews_df['Date'], format='%b %Y', errors='coerce')

    reviews_df.to_csv("hotel_reviews_webscraping.csv", index=False)
    print("Reviews saved to hotel_reviews_webscraping.csv.")
else:
    print("No reviews found.")

Scraping reviews for the hotel...
Reviews saved to hotel_reviews_webscraping.csv.


### Step 3: Visualize a data preview

In [32]:
from tabulate import tabulate
def display_data_preview(filename):
    """Print the contents of the CSV file ``filename`` as a psql-style table.

    Also raises the pandas display limits (columns, rows, column width) to 200.
    """
    for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
        pd.set_option(option_name, 200)

    # Read the CSV back into memory
    frame = pd.read_csv(filename)

    print("Dataframe preview:")
    rendered = tabulate(frame, headers='keys', tablefmt='psql')
    print(rendered)

# Preview both generated CSV files, API data first.
for csv_name in ('hotel_reviews_API.csv', 'hotel_reviews_webscraping.csv'):
    display_data_preview(csv_name)

Dataframe preview:
+----+--------------------------+--------------------------------------------------------+-----------------------------+-------------+-----------+----------------------------------------------------------------------------+------------+-------------+------------+-------------------------------------+------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------