## Webscraping and Applied ML - Project
_Authors_: Alessia SARRITZU, Alberto MARTINELLI

### Step 1: Get hotel data from Foursquare
* Import the necessary libraries
* Define the necessary functions for the API calls and data retrival
* Retrieve and save the data in .csv format

In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
import requests

In [None]:
def get_api_key():
    # Load environment variables from the .env file
    load_dotenv()

    # Get the API key from the environment variable
    api_key = os.getenv('API_KEY')  # Fetch the API key

    if api_key is None:
        print("API key is missing in the .env file!")
        exit()
    return api_key

def convert_json_to_df(data):
    if data:
        flattened_data = []

        # Check if 'results' key exists (Type 1 JSON structure)
        if isinstance(data, dict) and 'results' in data:
            places = data['results']
        # Otherwise, assume it's a list (Type 2 JSON structure)
        else:
            places = data

        # Iterate over each place in 'places' (works for both Type 1 and Type 2)
        for place in places:
            # Extract the required fields based on Type 1 or Type 2 structure
            if 'fsq_id' in place:  # Type 1 (place has 'fsq_id' and 'categories')
                place_data = {
                    'fsq_id': place['fsq_id'],
                    'name': place['name'],
                    'address': place['location']['address'] if 'location' in place else None,
                    'locality': place['location']['locality'] if 'location' in place else None,
                    'country': place['location']['country'] if 'location' in place else None,
                    'formatted_address': place['location']['formatted_address'] if 'location' in place else None,
                    'latitude': place['geocodes']['main']['latitude'] if 'geocodes' in place else None,
                    'longitude': place['geocodes']['main']['longitude'] if 'geocodes' in place else None,
                    'distance': place['distance'] if 'distance' in place else None,
                    'link': place['link'],
                    'categories': [category['name'] for category in place['categories']]  # Extract category names
                }
            elif 'id' in place:  # Type 2 (place has 'id' and 'text')
                place_data = {
                    'id': place['id'],
                    'created_at': place['created_at'],
                    'text': place['text']
                }
            flattened_data.append(place_data)

        # Convert the list of flattened data into a DataFrame
        df = pd.DataFrame(flattened_data)
        return df
    else:
        print("No data available to save.")

def convert_df_to_csv(df, filename):
    # Save the DataFrame to CSV
    df.to_csv(filename, index=False)
    # print("CSV file has been saved as 'output.csv'")

def get_json_data_from_api(url, api_calls):
    api_key = get_api_key()

    headers = {
        "Accept": "application/json",
        "Authorization": api_key
    }

    # Make the GET request to the API
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()  # Parse the JSON response
        # print(data)
    else:
        print(f"Error: {response.status_code}")

    api_calls[0] += 1
    return data

In [None]:
api_calls = [0]

# Get the HOTELS in newyork near 124 Hudson Street in a radiues of 150 meters
url = "https://api.foursquare.com/v3/places/search?query=hotel&ll=40.720276093678535%2C-74.00855578601094&radius=150"
data = get_json_data_from_api(url, api_calls)

df = convert_json_to_df(data)

#for each fsq_id value in the fsq_id column, make an API call to get the tips for that place
# and then add the tips to the dataframe
for fsq_id in df['fsq_id']:
    url = f"https://api.foursquare.com/v3/places/{fsq_id}/tips"
    tips = get_json_data_from_api(url, api_calls)

    tips_text = [tip['text'] for tip in tips]  # Get the 'text' from each tip

    # Assign the list of tip texts to the dataframe
    df.loc[df['fsq_id'] == fsq_id, 'tips'] = ', '.join(tips_text)


print(df.head())

convert_df_to_csv(df, 'hotel_reviews_API.csv')
print("CSV file has been saved as 'output.csv'")
print("API calls made:", api_calls[0])

### Step 2: Get hotel data from KAYAK - Web Scraping
* Import the necessary libraries
* Define the necessary function for the Playwright and data retrival
* Retrieve and save the data in .csv format

In [None]:
from playwright.sync_api import sync_playwright

In [None]:
def scrape_reviews_with_playwright(hotel_url):
    reviews = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Set to False for debugging
        context = browser.new_context()
        page = context.new_page()

        # Navigate to the hotel details page
        page.goto(hotel_url)

        try:
            # Accept cookies if the cookie banner is present
            if page.locator("button:has-text('Accetta')").is_visible():
                page.locator("button:has-text('Accetta')").click()
                print("Cookies accepted.")

            # Wait for the reviews section to load
            page.wait_for_selector(".acD_-reviews-row-header", timeout=30000)

            # Locate review containers
            review_elements = page.query_selector_all(".acD_")

            for review in review_elements:
                try:
                    # Extract rating
                    rating = review.query_selector(".wdjx-positive").inner_text().strip() if review.query_selector(".wdjx-positive") else None
                    # Extract score description
                    score_description = review.query_selector(".acD_-score-description").inner_text().strip() if review.query_selector(".acD_-score-description") else None
                    # Extract user name and date
                    user_name_date = review.query_selector(".acD_-userName").inner_text().strip() if review.query_selector(".acD_-userName") else None
                    # Extract pros/advantages
                    pros = review.query_selector(".acD_-pros").inner_text().strip() if review.query_selector(".acD_-pros") else None
                    # Extract the full review text
                    full_review = review.query_selector("span[id^='showMoreText']").inner_text().strip() if review.query_selector("span[id^='showMoreText']") else None

                    # Append review data
                    reviews.append({
                        'Rating': rating,
                        'Score Description': score_description,
                        'User': user_name_date,
                        'Pros': pros,
                        'Full Review': full_review
                    })
                except Exception as e:
                    print(f"Error processing review: {e}")

        except Exception as e:
            print(f"Error loading reviews: {e}")
        finally:
            browser.close()

    return reviews

In [None]:
hotel_url = "https://www.kayak.it/hotels/InterContinental-New-York-Barclay,New-York-p59560-h14931-details"
print("Scraping reviews for the hotel...")
reviews_data = scrape_reviews_with_playwright(hotel_url)

# Save reviews to CSV
if reviews_data:
    reviews_df = pd.DataFrame(reviews_data)
    reviews_df.to_csv("hotel_reviews_webscraping.csv", index=False)
    print("Reviews saved to hotel_reviews_playwright.csv.")
else:
    print("No reviews found.")

### Step 3: Visualize a data preview

In [4]:
def display_data_preview(filename):
    pd.set_option('display.max_columns', 200)
    pd.set_option('display.max_rows', 200)
    pd.set_option('display.max_colwidth', 200)

    # Load the CSV file into a DataFrame
    df = pd.read_csv(filename)

    print("Dataframe preview:")
    df.head()

display_data_preview('hotel_reviews_API.csv')
display_data_preview('hotel_reviews_webscraping.csv')

Dataframe preview:


Unnamed: 0,fsq_id,name,address,locality,country,formatted_address,latitude,longitude,distance,link,categories,tips
0,4ab6c985f964a5200a7920e3,The Greenwich Hotel,377 Greenwich St,New York,US,"377 Greenwich St (N. Moore), New York, NY 10013",40.719919,-74.009957,125,/v3/places/4ab6c985f964a5200a7920e3,['Hotel'],"This Robert de Niro-owned boutique hotel has great service, great food (be sure to check out Locanda Verde), and great beds. What more could we want in a hotel?, For the love of this iconic hotel’..."
1,4d8e0ccfd00a6ea8839faf4f,Shibui Spa,377 Greenwich St,New York,US,"377 Greenwich St, New York, NY 10013",40.719867,-74.009911,150,/v3/places/4d8e0ccfd00a6ea8839faf4f,"['Spa', 'Hotel']","Shibui Spa houses a lantern-lit swimming pool and lounge area. It's roofed under a 250-year old wood and bamboo farmhouse that was reconstructed in the Greenwich by Japanese craftsmen., A beautifu..."
2,4a149749f964a52055781fe3,Locanda Verde,377 Greenwich St,New York,US,"377 Greenwich St (at N Moore St), New York, NY 10013",40.719919,-74.009957,126,/v3/places/4a149749f964a52055781fe3,['Italian Restaurant'],"I like the ricotta crostini, blue crab crostini, lamb sliders and grandmother’s ravioli! Andrew Carmellini worked with me at Café Boulud for many years and is an excellent chef., My Grandmother's ..."
