# Extract downloaded tif files

In [1]:
import pathlib
import py7zr
from tqdm import tqdm
import os
import shutil
from pyunpack import Archive

## Data folders:

In [2]:
# Dataset layout, expressed relative to the notebook's working directory.
data_path = pathlib.Path('../../data/')
inria_path = data_path.joinpath('INRIA')
input_path = inria_path.joinpath('input')    # searched below for the *.7z.* volumes
output_path = inria_path.joinpath('output')  # target of the final .zip extraction
tmp_path = output_path.joinpath('tmp')       # scratch area for the merged/extracted archives

# Create the output tree up front; existing directories are left untouched.
for directory in (output_path, tmp_path):
    directory.mkdir(parents=True, exist_ok=True)

## Extract the 7z files
Concatenate the multi-volume parts (`*.7z.001`, `*.7z.002`, …) in order into a single archive before extracting.

In [3]:
# Collect the archive volumes. Sorting the path strings lexicographically puts
# the zero-padded suffixes (.001, .002, ...) in the order they must be joined.
filenames = sorted(str(path) for path in input_path.glob('*.7z.*'))
if not filenames:
    # Fail here with a clear message instead of writing an empty merge.7z
    # that the extraction step would reject with a less helpful error.
    raise FileNotFoundError(
        'No *.7z.* volumes found in {0}'.format(input_path)
    )

merge_7z = tmp_path / 'merge.7z'
# 'wb' (not 'ab'): truncate any merge.7z left over from a previous or
# interrupted run, so re-running this cell never appends a second copy of the
# volumes and corrupts the archive.
with open(merge_7z, 'wb') as outfile:
    print('Merging files...')
    pbar = tqdm(filenames)
    for filename in pbar:
        with open(filename, 'rb') as infile:
            # Stream in chunks rather than loading a whole volume into memory.
            shutil.copyfileobj(infile, outfile)

Merging files...


100%|█████████████████████████████████████████████████████████████████████████████████| 5/5 [00:25<00:00,  5.17s/it]


In [4]:
# Unpack the merged 7z archive into the tmp folder. The context manager closes
# the archive before the merged file is removed below.
with py7zr.SevenZipFile(merge_7z, mode='r') as archive:
    print('Extracting files...')
    archive.extractall(path=tmp_path)

# The merged archive is only an intermediate artefact; drop it once unpacked.
print('Deleting intermediate files...')
merge_7z.unlink()

print('Finished!')

Extracting files...
Deleting intermediate files...
Finished!


## Extract the final .zip file


In [5]:
# Materialise the glob so tqdm knows the total and can show real progress
# (a bare generator only yields an "Nit" counter).
zip_files = sorted(tmp_path.glob('*.zip'))
pbar = tqdm(zip_files)
for file_zip in pbar:
    filename = file_zip.name
    print('Extracting {0}...'.format(filename))
    pbar.set_description(filename)
    Archive(file_zip).extractall(output_path)

print('Cleaning tmp folder contents...')
for entry in tmp_path.glob('*'):
    # tmp may hold directories as well as files (e.g. the folders moved here
    # by a previous run of this cell). os.remove() raises on a directory, so
    # remove directory trees explicitly.
    if entry.is_dir() and not entry.is_symlink():
        shutil.rmtree(entry)
    else:
        os.remove(entry)

print('Moving extracted .tif files into tmp folder...')
dataset = (output_path / 'AerialImageDataset')
for folder in dataset.glob('*'):
    # tmp was emptied above, so the rename target does not exist yet.
    new_path = tmp_path / folder.name
    folder.rename(new_path)
# The dataset folder is empty after the moves; rmdir fails loudly otherwise.
os.rmdir(dataset)

print('Finished!')

NEW2-AerialImageDataset.zip: : 0it [00:00, ?it/s]

Extracting NEW2-AerialImageDataset.zip...


NEW2-AerialImageDataset.zip: : 1it [03:05, 185.47s/it]


Cleaning tmp folder contents...
Moving extracted .tif files into tmp folder...
Finished!


## Final structure

```
data
|__INRIA
   |__input
   |  |  aerialimagelabeling.7z.001
   |  |  ...
   |  |  aerialimagelabeling.7z.005
   |
   |__output
      |__tmp
         |__train
         |  |__images
         |  |  |  <FILENAME>.tif
         |  |  |  ...
         |  |
         |  |__gt
         |     |  <FILENAME>.tif
         |     |  ...
         |
         |__test
            |  <FILENAME>.tif
            |  ...
```