# Data extraction

This notebook dynamically downloads Airbnb datasets from the Inside Airbnb site.

In [None]:
import requests, re, pickle
from datetime import datetime
import gzip
from tqdm import tqdm
import tempfile, shutil, os
from dataio.utils import *
from dataio.datasets import Datasets

# Data selection

Let's select all those dataset links belonging to the United States.

In [None]:
# Match data URLs listed on the Inside Airbnb "get the data" page
content = requests.get('http://insideairbnb.com/get-the-data.html')
# Fail loudly on an HTTP error instead of silently scraping an error page
content.raise_for_status()

# Raw strings keep the regex escapes intact, and [^"]* keeps every match inside
# a single href attribute even when several links share one line of HTML.
matches = re.findall(r'<a href="([^"]*\.gz)"', content.text)  # Compressed data files of all cities
matches_geo = re.findall(r'<a href="([^"]*/visualisations/neighbourhoods\.geojson)', content.text)  # Neighbourhood GeoJSON files
matches = matches + matches_geo

Now let's extract the 3 data files (listings, reviews, calendar) for each city. Since older versions may also exist, we take the latest version available for each city.

In [None]:
def get_geo_info(url):
    """ Returns, lower-cased and in this order, the country, state and city
    pieces of the url (its 4th, 5th and 6th '/'-separated components) """
    pieces = url.split('/')
    return tuple(pieces[position].lower() for position in (3, 4, 5))

def get_data(url):
    """ Returns the kind of data the url points to, taken from the file name
    up to its first dot. Raises ValueError for any kind other than listings,
    calendar, reviews or neighbourhoods """
    known_kinds = ('listings', 'calendar', 'reviews', 'neighbourhoods')
    file_name = url.split('/')[-1]
    kind = file_name.split('.')[0]
    if kind in known_kinds:
        return kind
    raise ValueError('Unkown data frame found')

def get_time(url):
    """ Returns, as a datetime, the YYYY-MM-DD date found in the 7th
    '/'-separated component of the given link """
    date_format = '%Y-%m-%d'
    pieces = url.split('/')
    return datetime.strptime(pieces[6], date_format)


# Store, for each city encountered, the most recent URL of every kind of data.
# Each city entry maps the kind ('listings', 'calendar', 'reviews',
# 'neighbourhoods') to its URL and also holds 'country', 'state' and 'timestamp'.
datasets = {}
for m in matches:
    country, state, city = get_geo_info(m)
    data, timestamp = get_data(m), get_time(m)

    city_entry = datasets.setdefault(city, {})

    # Compare against the date of the link already kept for this SAME kind of
    # data. A single timestamp shared by every kind can be overwritten by a
    # different, older file, which would then let an older link replace a newer one.
    if data in city_entry and get_time(city_entry[data]) > timestamp:
        # Found older data for the given city, we want the most updated one
        print('Found older %s for %s' % (data, city))
        continue

    city_entry[data] = m
    city_entry['country'] = country
    city_entry['state'] = state
    # The city timestamp tracks the most recent date among the stored links
    if 'timestamp' not in city_entry or city_entry['timestamp'] < timestamp:
        city_entry['timestamp'] = timestamp

Now let's take a look at the links extracted

In [None]:
# Show, city by city, the link selected for each kind of data
link_labels = (
    ('Listings', 'listings'),
    ('Calendar', 'calendar'),
    ('Reviews', 'reviews'),
    ('Neighbourhoods', 'neighbourhoods'),
)
for city_name, city_links in datasets.items():
    print('City %s:' % city_name)
    for label, kind in link_labels:
        print('\t - %s: %s' % (label, city_links[kind]))

Make sure the data is consistent and get the earliest and latest dates.

In [None]:
def check_same_year(data):
    """ Checks that, for every city, the listings, reviews, calendar and
    neighbourhoods links all carry the same date, and returns the earliest
    and latest of those dates across all cities.

    Args:
        data: dict mapping each city to a dict holding the 'listings',
            'reviews', 'calendar' and 'neighbourhoods' links.

    Returns:
        Tuple (min_date, max_date) of datetimes, or (None, None) when data
        is empty.

    Raises:
        ValueError: if the links of a city do not share the same date.
        KeyError: if a city lacks one of the four kinds of link.
    """
    kinds = ('listings', 'reviews', 'calendar', 'neighbourhoods')

    city_dates = []
    for c, links in data.items():
        # Get all dates for this city
        dates = [get_time(links[kind]) for kind in kinds]

        if any(date != dates[0] for date in dates[1:]):
            raise ValueError('Unconsistent dates for city %s' % c)

        # At this point we know all datasets of the city share the same date
        city_dates.append(dates[0])

    if not city_dates:
        return None, None
    return min(city_dates), max(city_dates)

In [None]:
# Validate the selected links and report the range of dates they cover
date_range = check_same_year(datasets)
min_date, max_date = date_range
print('Datasets are enclosed between {} and {}'.format(*date_range))

# Data download

Download datasets into the data folder.

In [None]:
def download_url(url):
    """ Requests the given url in streaming mode and returns the response.

    Args:
        url: address to request, as a str.

    Returns:
        The requests response, whose body can be read from `response.raw`.

    Raises:
        RuntimeError: if the response status code is not 200.
    """
    # NOTE(review): the url is sent as latin-1 encoded bytes, as before — confirm this is still needed
    response = requests.get(url.encode('latin-1'), stream=True)
    if response.status_code != 200:
        # A streamed response keeps its connection until it is closed
        response.close()
        raise RuntimeError('Unexpected error: Response status from %s is %d'
                           % (url, response.status_code))
    response.raw.decode_content = True
    return response


def download_and_untar(folder, url, tmp_path):
    """ Downloads the gzip file at url into a temporary file and uncompresses
    it into the given folder.

    Args:
        folder: existing directory where the uncompressed file is written.
        url: link to a '.gz' file; the output keeps the remote file name
            without its last three characters ('.gz').
        tmp_path: directory used for the temporary download. When it is
            empty or not an existing directory the system default is used.

    Raises:
        RuntimeError: propagated from download_url on a non-200 response.
    """
    # Requests data from URL
    response = download_url(url)

    # Set destination file
    dst = os.path.join(folder, os.path.basename(url)[:-3])

    # Create temporary file, inside tmp_path when it is usable
    tmp_dir = tmp_path if tmp_path and os.path.isdir(tmp_path) else None
    fd, downloaded = tempfile.mkstemp(suffix='.csv.gz', dir=tmp_dir)

    try:
        # Copy gz into tmp location, writing through the descriptor mkstemp
        # already opened so it is closed together with the file object
        with os.fdopen(fd, 'wb') as f:
            shutil.copyfileobj(response.raw, f)

        # Unzip into destination chunk by chunk instead of loading it whole
        with gzip.open(downloaded, 'rb') as f_in, open(dst, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    finally:
        # Release the connection and remove the temporary file even on failure
        response.close()
        os.remove(downloaded)
            
        
def download_file(folder, url):
    """ Downloads the url and writes its raw content, unchanged, into a file
    inside folder named after the last component of the url """
    reply = download_url(url)
    target = os.path.join(folder, os.path.basename(url))
    with open(target, 'wb') as out_file:
        shutil.copyfileobj(reply.raw, out_file)

        
def get_info_path(folder):
    """ Returns the path of the 'info.dat' file placed directly inside folder """
    info_file_name = 'info.dat'
    return os.path.join(folder, info_file_name)
    

def store_metadata(folder, city, state, country, time):
    """ Pickles a dict with the given time, city, state and country into the
    info file of the given folder, overwriting any previous content """
    info_path = get_info_path(folder)
    with open(info_path, 'wb') as info_file:
        pickle.dump(dict(time=time, city=city, state=state, country=country), info_file)
    

def read_timestamp(folder):
    """ Returns the 'time' entry of the metadata pickled in the info file of
    the given folder """
    info_path = get_info_path(folder)
    with open(info_path, 'rb') as info_file:
        metadata = pickle.load(info_file)
    return metadata['time']
    

# Create root folder
airbnb_tmp_root = get_tmp_data_location(Datasets.AIRBNB_PRICE)
create_dir(airbnb_tmp_root)

# Create temporary folder used while downloading the compressed files
tmp_folder = tempfile.mkdtemp()

try:
    # Create subfolder for each city
    for c in tqdm(datasets.keys(), desc='Downloading Airbnb city datasets ...'):

        # Directory for city
        city_folder = os.path.join(airbnb_tmp_root, c)

        # Download dataset if the one found is more recent or if not found
        if not os.path.exists(city_folder):
            create_dir(city_folder)
            download = True
        elif not os.path.exists(get_info_path(city_folder)):
            # Folder without metadata: there is no stored date to compare
            # against, so fetch the data again
            download = True
        else:
            current_timestamp = read_timestamp(city_folder)
            # Refresh only when the link found is newer than the stored copy
            download = datasets[c]['timestamp'] > current_timestamp

        if download:
            # Download and uncompress the data files of the city
            download_and_untar(city_folder, datasets[c]['listings'], tmp_folder)
            download_and_untar(city_folder, datasets[c]['calendar'], tmp_folder)
            download_and_untar(city_folder, datasets[c]['reviews'], tmp_folder)
            download_file(city_folder, datasets[c]['neighbourhoods'])

            # Store metadata last, so its presence marks a finished download
            state, country, time = datasets[c]['state'], datasets[c]['country'], datasets[c]['timestamp']
            store_metadata(city_folder, c, state, country, time)
finally:
    # Clean temp folder even when a download fails
    shutil.rmtree(tmp_folder)

print('Downloaded data for %s cities' % len(datasets.keys()))