# Download raw data files

In [1]:
import pathlib
import urllib.request
from tqdm import tqdm

## Data folders:

In [2]:
# Folder layout: <data>/INRIA/input receives the downloaded archive parts,
# and <data>/INRIA/source.txt lists the URLs to fetch.
data_path = pathlib.Path('../../data/')
inria_path = data_path.joinpath('INRIA')
input_path = inria_path.joinpath('input')

# Create each level in order, outermost first; existing folders are left alone.
for folder in (data_path, inria_path, input_path):
    folder.mkdir(parents=True, exist_ok=True)

source_file = inria_path.joinpath('source.txt')

In [3]:
# Read the list of download URLs, one per line.
# Surrounding whitespace is stripped and blank lines are dropped, so a stray
# empty line in the file does not end up as an empty URL in the download loop.
with open(source_file, encoding='utf-8') as file:
    lines = [line.strip() for line in file if line.strip()]

print(lines)

['https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.001', 'https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.002', 'https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.003', 'https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.004', 'https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.005']


## Download the data

This may take a while depending on your connection. Total size is ~19.5GB (four 4GB archive parts plus a final part of about 3.5GB).

In [4]:
# Progress-bar adapter usable as the `reporthook` of urllib's urlretrieve.
class TqdmUpTo(tqdm):
    """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
    def update_to(self, b=1, bsize=1, tsize=None):
        """
        Move the bar to the absolute position ``b * bsize``.

        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] or negative
            (size unknown), the current total remains unchanged.

        Returns whatever `tqdm.update` returns.
        """
        # urlretrieve reports -1 when the server does not announce a size;
        # storing that as the total would corrupt the bar, so ignore it.
        if tsize is not None and tsize >= 0:
            self.total = tsize

        done = b * bsize
        # The final block is usually only partly filled, so blocks * block size
        # can exceed the real size. Clamp to the total so the bar ends at
        # exactly 100% instead of overshooting and losing its percentage.
        if self.total is not None:
            done = min(done, self.total)

        return self.update(done - self.n)  # also sets self.n = done

# Fetch every URL into `input_path`, showing one progress bar per file.
# Files that are already present are skipped so that re-running the notebook
# does not download the archives again. Each file is first written under a
# temporary `.part` name and only renamed once the transfer has finished, so
# an interrupted download is never mistaken for a complete one.
for url in lines:
    filename = url.split('/')[-1]
    target = input_path / filename

    if target.exists():
        print('Skipping {0} (already downloaded).'.format(filename))
        continue

    partial = target.with_name(filename + '.part')
    print('Downloading {0}...'.format(filename))
    with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                  desc=filename) as t:  # all optional kwargs
        # urlretrieve raises if the connection fails; in that case the rename
        # below is never reached and only the `.part` file is left behind.
        urllib.request.urlretrieve(url, partial, reporthook=t.update_to, data=None)

    # Publish the finished file under its final name.
    partial.replace(target)

Downloading aerialimagelabeling.7z.001...


aerialimagelabeling.7z.001: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4.00G/4.00G [01:29<00:00, 47.9MB/s]


Downloading aerialimagelabeling.7z.002...


aerialimagelabeling.7z.002: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4.00G/4.00G [01:16<00:00, 55.8MB/s]


Downloading aerialimagelabeling.7z.003...


aerialimagelabeling.7z.003: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4.00G/4.00G [01:15<00:00, 56.6MB/s]


Downloading aerialimagelabeling.7z.004...


aerialimagelabeling.7z.004: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4.00G/4.00G [01:15<00:00, 56.8MB/s]


Downloading aerialimagelabeling.7z.005...


aerialimagelabeling.7z.005: 3.52GB [01:13, 51.3MB/s]                                                                                                                      
