## Get capital cities used in Global Property SQM data
The Airbnb data covers more cities than the Global Property Guide, so instead of downloading every Airbnb data set, I will limit the downloads to cities contained in the SQM data.

In [86]:
import csv
import pandas as pd

capital_city_csv_path = r"data/country_capital_city.csv"
global_sqm_csv_path = r"data/global_sqm_prices.csv"

# Build a country -> capital city lookup from the reference CSV.
# newline='' is the mode the csv module documents for files passed to a reader.
with open(capital_city_csv_path, 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    country_capital_dict = {row['Country']: row['Capital City'] for row in csv_reader}

# Hand-picked capitals that take precedence over whatever the CSV lookup yields
# for these 'Country/City' values.
capital_city_overrides = {
    'Hong Kong': 'Hong Kong',
    'Taiwan': 'Taipei',
    'Puerto Rico': 'San Juan',
    'Czech Republic': 'Prague',
    'Turkey': 'Ankara',
}

df = pd.read_csv(global_sqm_csv_path)
df['Capital City'] = df['Country/City'].map(
    {**country_capital_dict, **capital_city_overrides}
)

# Rows whose 'Country/City' value has no entry in the lookup are mapped to NaN.
# Drop them so the list holds only city-name strings that are safe to call
# string methods on.
capital_cities_in_global_sqm = df['Capital City'].dropna().tolist()

I will use this function to check whether a city is in the capital cities list.

In [81]:
def is_word_in_string_list(word, string_list):
    """Return True if `word` occurs inside any string of `string_list`.

    The comparison is a case-insensitive substring test. Hyphens are treated
    as spaces on both sides, so a URL slug such as "mexico-city" matches the
    entry "Mexico City". Entries that are not strings (for example NaN values
    coming from a pandas column) are ignored instead of raising.

    Parameters
    ----------
    word : str
        Text to look for.
    string_list : iterable
        Candidate entries to search in.

    Returns
    -------
    bool
        True when at least one string entry contains `word`, False otherwise
        (including when `string_list` is empty).
    """
    def _normalise(text):
        # Lower-case and unify the hyphen/space separators before comparing.
        return text.lower().replace("-", " ")

    needle = _normalise(word)
    return any(
        needle in _normalise(candidate)
        for candidate in string_list
        if isinstance(candidate, str)
    )

## Scraping Airbnb data

### Finding download links for files

In [4]:
import requests
from bs4 import BeautifulSoup
import os

# URL of the website
base_url = "http://insideairbnb.com/get-the-data/"

# Create a directory to save the downloaded files (no-op if it already exists)
os.makedirs("listings", exist_ok=True)

# Make a request to the webpage and parse it with BeautifulSoup.
# The timeout stops the cell from hanging forever, and raise_for_status()
# surfaces an HTTP error instead of silently parsing an error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Find all links on the webpage
links = soup.find_all("a")

# Loop through the links and print the ones that point to a listings archive
# (nothing is downloaded in this cell).
for link in links:
    href = link.get("href")
    if href and "listings.csv.gz" in href:
        print(href)

http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2023-06-05/data/listings.csv.gz
http://data.insideairbnb.com/belgium/vlg/antwerp/2023-06-28/data/listings.csv.gz
http://data.insideairbnb.com/united-states/nc/asheville/2023-06-18/data/listings.csv.gz
http://data.insideairbnb.com/greece/attica/athens/2023-06-25/data/listings.csv.gz
http://data.insideairbnb.com/united-states/tx/austin/2023-06-10/data/listings.csv.gz
http://data.insideairbnb.com/thailand/central-thailand/bangkok/2023-06-26/data/listings.csv.gz
http://data.insideairbnb.com/spain/catalonia/barcelona/2023-06-10/data/listings.csv.gz
http://data.insideairbnb.com/australia/sa/barossa-valley/2023-06-27/data/listings.csv.gz
http://data.insideairbnb.com/australia/vic/barwon-south-west-vic/2023-06-28/data/listings.csv.gz
http://data.insideairbnb.com/china/beijing/beijing/2023-06-27/data/listings.csv.gz
http://data.insideairbnb.com/belize/bz/belize/2023-06-28/data/listings.csv.gz
http://data.insideairbnb.com/italy

### Parsing URLs for important information to use in the filename

In [65]:
# Sample link to a city-level data set; splitting it on "/" shows which
# position of the URL holds which piece of information.
href_test = "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2023-06-05/data/listings.csv.gz"
href_test.split(sep="/")

['http:',
 '',
 'data.insideairbnb.com',
 'the-netherlands',
 'north-holland',
 'amsterdam',
 '2023-06-05',
 'data',
 'listings.csv.gz']

In [66]:
# Second sample link, this time with a hyphenated name in the slot just
# before the date, split the same way for comparison.
href_test = "http://data.insideairbnb.com/australia/sa/barossa-valley/2023-06-27/data/listings.csv.gz"
href_test.split(sep="/")

['http:',
 '',
 'data.insideairbnb.com',
 'australia',
 'sa',
 'barossa-valley',
 '2023-06-27',
 'data',
 'listings.csv.gz']

From the two list splits above (as well as the full list of URLs), a pattern can be established for reliably obtaining the country (index 3 in the list) and the date (index -3). City names always come directly before the date; however, some URLs contain a region rather than a city name. This isn't a problem, though, as we will filter out non-city entries by checking each name against the capital cities list.

In [67]:
# Name slot of the URL: the fourth "/"-separated piece counted from the right,
# obtained by splitting from the right-hand side only as far as needed.
city_name = href_test.rsplit("/", 4)[-4]
city_name

'barossa-valley'

In [68]:
# Country slot of the URL: the piece at index 3 counted from the left,
# obtained by splitting from the left-hand side only as far as needed.
country = href_test.split("/", 4)[3]
country

'australia'

In [69]:
# Date slot of the URL: the third "/"-separated piece counted from the right.
date_of_data = href_test.rsplit("/", 3)[-3]
date_of_data

'2023-06-27'

In [70]:
# Assemble the local file name from the extracted pieces, joined by underscores.
file_name = "_".join((city_name, country, date_of_data, "listings.csv.gz"))
file_name

'barossa-valley_australia_2023-06-27_listings.csv.gz'

### Putting it all together: Downloading data and saving them with desired name

In [90]:
import requests
from bs4 import BeautifulSoup
import os

# URL of the website
base_url = "http://insideairbnb.com/get-the-data/"

output_dir = "data/listings"

# Seconds to wait for the server before giving up on a request
request_timeout = 60

# Create a directory to save the downloaded files (no-op if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Make a request to the webpage and parse it with BeautifulSoup.
# raise_for_status() stops the cell on an HTTP error instead of parsing an error page.
response = requests.get(base_url, timeout=request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Find all links on the webpage
links = soup.find_all("a")

# Loop through the links and download the files
for link in links:
    href = link.get("href")
    if not (href and "listings.csv.gz" in href):
        continue

    url_parts = href.split("/")
    city_name = url_parts[-4]  # Extract city name from the URL
    country = url_parts[3]  # Extract country name from URL
    date_of_data = url_parts[-3]  # Extract date of data from URL

    if not is_word_in_string_list(city_name, capital_cities_in_global_sqm):
        print(f"{city_name} not in Global Property SQM data. Skipping download...")
        continue

    file_url = href
    file_name = f"{city_name}_{country}_{date_of_data}_listings.csv.gz"
    file_path = os.path.join(output_dir, file_name)

    # The file name carries the snapshot date, so a file that is already on
    # disk does not need to be fetched again when the cell is re-run.
    if os.path.exists(file_path):
        print(f"Already downloaded: {file_name}")
        continue

    # Stream into a temporary ".part" file and rename it only after the
    # download succeeded, so an interrupted or failed request never leaves a
    # truncated or HTML-error file behind under the final name.
    partial_path = file_path + ".part"
    try:
        with requests.get(file_url, stream=True, timeout=request_timeout) as file_response:
            file_response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
    except requests.RequestException as error:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download {file_name}: {error}")
        continue
    os.replace(partial_path, file_path)

    print(f"Downloaded: {file_name}")

print("Download completed.")


Downloaded: amsterdam_the-netherlands_2023-06-05_listings.csv.gz
antwerp not in Global Property SQM data. Skipping download...
asheville not in Global Property SQM data. Skipping download...
Downloaded: athens_greece_2023-06-25_listings.csv.gz
austin not in Global Property SQM data. Skipping download...
Downloaded: bangkok_thailand_2023-06-26_listings.csv.gz
barcelona not in Global Property SQM data. Skipping download...
barossa-valley not in Global Property SQM data. Skipping download...
barwon-south-west-vic not in Global Property SQM data. Skipping download...
beijing not in Global Property SQM data. Skipping download...
belize not in Global Property SQM data. Skipping download...
bergamo not in Global Property SQM data. Skipping download...
Downloaded: berlin_germany_2023-06-22_listings.csv.gz
bologna not in Global Property SQM data. Skipping download...
bordeaux not in Global Property SQM data. Skipping download...
boston not in Global Property SQM data. Skipping download...
bozem