In [3]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import matplotlib
import matplotlib.font_manager

import os
from requests import get
from urllib.parse import urlparse

# For debugging purposes (fonts can be hard)
print(matplotlib.get_cachedir())

# We change fonts the hard way in this notebook...
# but you can also do this to change the default 
# font everywhere in one go:
# matplotlib.rcParams['font.family'] = "Liberation Sans Narrow"
fontname = "DejaVu Sans"

matplotlib.font_manager.FontManager().findfont(fontname, fontext='ttf', rebuild_if_missing=True)

/home/jovyan/.cache/matplotlib


'/opt/conda/lib/python3.11/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'

In [4]:


def cache_data(src:str, dest:str) -> str:
    """
    Downloads a file from a given URL and caches it locally.

    Parameters:
    src (str): The source URL of the file to download.
    dest (str): The destination directory where the file will be saved.

    Returns:
    str: The path to the cached file.
    """    
    url = urlparse(src) # We assume that this is some kind of valid URL 
    fn  = os.path.split(url.path)[1] # Extract the filename
    dfn = os.path.join(dest,fn) # Destination filename
    
    if not os.path.isfile(dfn) or os.path.getsize(dfn) < 250:
        
        print(f"{dfn} not found, downloading!")

        path = os.path.split(dest)
        
        if len(path) >= 1 and path[0] != '':
            os.makedirs(os.path.join(*path), exist_ok=True)
            
        with open(dfn, "wb") as file:
            response = get(src)
            file.write(response.content)
            
        print("\tDone downloading...")

    else:
        print(f"Found {dfn} locally!")

    return dfn

help(cache_data) # <- This should show the docstring you've written

Help on function cache_data in module __main__:

cache_data(src: str, dest: str) -> str
    Downloads a file from a given URL and caches it locally.
    
    Parameters:
    src (str): The source URL of the file to download.
    dest (str): The destination directory where the file will be saved.
    
    Returns:
    str: The path to the cached file.



In [5]:
# Set download URL
ymd  = '20240614'
city = 'London'
host = 'https://orca.casa.ucl.ac.uk'
url  = f'{host}/~jreades/data/{ymd}-{city}-listings.csv.gz'


In [6]:
path = os.path.join('data','raw') # A default location to save raw data
fn   = url.split('/')[-1]         # What does this do?
print(f"Writing to: {fn}")

df = pd.read_csv(cache_data(url, os.path.join('data','raw')))
print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Writing to: 20240614-London-listings.csv.gz
Found data/raw/20240614-London-listings.csv.gz locally!
Data frame is 93,481 x 75


In [7]:
ddir  = os.path.join('data','geo') # destination directory
spath = 'https://github.com/jreades/i2p/blob/master/data/src/' # source path

boros = gpd.read_file( cache_data(spath+'Boroughs.gpkg?raw=true', ddir) )
water = gpd.read_file( cache_data(spath+'Water.gpkg?raw=true', ddir) )
green = gpd.read_file( cache_data(spath+'Greenspace.gpkg?raw=true', ddir) )
road =  gpd.read_file( cache_data(spath+'Roads.gpkg?raw=true', ddir) )
print('Done.')

Found data/geo/Boroughs.gpkg locally!
Found data/geo/Water.gpkg locally!
Found data/geo/Greenspace.gpkg locally!
Found data/geo/Roads.gpkg locally!
Done.


In [8]:
testing = False

if testing:
    df = pd.read_csv(os.path.join(path,fn), 
                low_memory=True, nrows = 10000)
else:
    df = pd.read_csv(os.path.join(path,fn), 
                low_memory=False)

print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 93,481 x 75


In [9]:
header = list(df)
header

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'source',
 'name',
 'description',
 'neighborhood_overview',
 'picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30

In [10]:
cols = ['id', 'listing_url', 'last_scraped', 'name', 'description',
        'host_id', 'host_name', 'host_since', 'host_location', 'host_about', 'host_acceptance_rate', 'host_is_superhost', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count','host_verifications',
 'latitude','longitude',
 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews','first_review', 'last_review', 'review_scores_rating', 'reviews_per_month']

In [11]:
df = pd.read_csv(os.path.join(path,fn), 
                low_memory=False, usecols = cols)

In [12]:
list(df)

['id',
 'listing_url',
 'last_scraped',
 'name',
 'description',
 'host_id',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'availability_365',
 'number_of_reviews',
 'first_review',
 'last_review',
 'review_scores_rating',
 'reviews_per_month']