# Set up GCP vm

In [1]:
import os
import shutil
import json
# from google.colab import drive

root = "src"

def save_json(item, dest_file_name):
    with open(dest_file_name, "w") as json_file:
        json.dump(item, json_file)
    
def load_json(path):
    return json.load(open(path))

## Check Memory Usage

In [2]:
! free -m

              total        used        free      shared  buff/cache   available
Mem:          15033         877        7202           8        6953       13842
Swap:             0           0           0


In [3]:
!lsblk

NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda       8:0    0  100G  0 disk 
├─sda1    8:1    0 99.9G  0 part /
├─sda14   8:14   0    3M  0 part 
└─sda15   8:15   0  124M  0 part /boot/efi
sdb       8:16   0  400G  0 disk /home/jupyter


In [4]:
!pwd

/home/jupyter/src


## Download & Unpack ImageNet

In [5]:
# download ImageNet validation set
!wget -N https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar
!wget -N https://image-net.org/data/ILSVRC/2012/ILSVRC2012_devkit_t12.tar.gz
!wget -N http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz

--2021-12-29 11:48:43--  https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar
Resolving image-net.org (image-net.org)... 171.64.68.16
Connecting to image-net.org (image-net.org)|171.64.68.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6744924160 (6.3G) [application/x-tar]
Saving to: ‘ILSVRC2012_img_val.tar’


2021-12-29 11:56:37 (13.6 MB/s) - ‘ILSVRC2012_img_val.tar’ saved [6744924160/6744924160]

--2021-12-29 11:56:37--  https://image-net.org/data/ILSVRC/2012/ILSVRC2012_devkit_t12.tar.gz
Resolving image-net.org (image-net.org)... 171.64.68.16
Connecting to image-net.org (image-net.org)|171.64.68.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2568145 (2.4M) [application/x-gzip]
Saving to: ‘ILSVRC2012_devkit_t12.tar.gz’


2021-12-29 11:56:39 (2.26 MB/s) - ‘ILSVRC2012_devkit_t12.tar.gz’ saved [2568145/2568145]

--2021-12-29 11:56:39--  http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz
Resolving dl.caffe.berkeleyv

In [6]:
os.makedirs(os.path.join(root, "images/sources/imagenet/image_net_data"), exist_ok=True)
os.makedirs(os.path.join(root, "images/sources/imagenet/labels/devkit"), exist_ok=True)
os.makedirs(os.path.join(root, "images/sources/imagenet/labels/vgg_label"), exist_ok=True)

# unzip the files
!tar -xf ILSVRC2012_img_val.tar -C images/sources/imagenet/image_net_data
!tar -xf ILSVRC2012_devkit_t12.tar.gz -C images/sources/imagenet/labels/devkit
!tar -xf caffe_ilsvrc12.tar.gz -C images/sources/imagenet/labels/vgg_label

## Sort imagenet by class labels

In [9]:
# copy the 2 labeling files to current directory

shutil.copyfile(os.path.join(root, "images/sources/imagenet/labels/devkit/ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt"), os.path.join(root, "original_labels.txt"))
shutil.copyfile(os.path.join(root, "images/sources/imagenet/labels/vgg_label/val.txt"), os.path.join(root, "VGG_labels.txt"))

os.makedirs(os.path.join(root, "images/sources/imagenet/data_original_label"), exist_ok=True)
os.makedirs(os.path.join(root, "images/sources/imagenet/data_VGG_label"), exist_ok=True)

# there are 1000 labels in total (imagenet starts from 1, vgg starts from 0)
for i in range(1, 1001):
    os.makedirs(os.path.join(root, "images/sources/imagenet", "data_original_label", str(i)), exist_ok=True)
    os.makedirs(os.path.join(root, "images/sources/imagenet", "data_VGG_label", str(i-1)), exist_ok=True)

# record the image file names
img_paths = []
for file in os.listdir(os.path.join(root, "images/sources/imagenet/image_net_data")):
    if file.endswith(".JPEG"):
        img_paths.append(file)

img_paths = sorted(img_paths)

original_labels = open("original_labels.txt")
vgg_labels = open("VGG_labels.txt")
      
for img_path, label, label2 in zip(img_paths, original_labels, vgg_labels):
    src = os.path.join(root, "images/sources/imagenet/image_net_data", img_path)
    shutil.copyfile(src, os.path.join(root, "images/sources/imagenet/data_original_label", label.strip(), img_path))
    shutil.copyfile(src, os.path.join(root, "images/sources/imagenet/data_VGG_label", str(label2.split()[1]), img_path))
    os.remove(src)

shutil.rmtree(os.path.join(root, "images/sources/imagenet/image_net_data"))

## Separate imageNet images into train, val and test directories

Iterate on each class and randomly select N images to be [train, val, test] according to the proportion

image per class in ImageNet Validation dataset = 50

- Train ratio: 50 * (1 * 0.8 * 0.8) = 50 * 0.64 = 32
- Val ratio: 50 * (1 * 0.8 * 0.2) = 50 * 0.16 = 8
- Test ratio: 50 * (1 * 0.2) = 50 * 0.2 = 10

In [10]:
import numpy as np
import os

src_path = os.path.join(root, "images/sources/imagenet/data_VGG_label")

total_train_paths = []
total_val_paths = []
total_test_paths = []

data_ratio = np.array([0.64, 0.16, 0.2])

for class_label in os.listdir(src_path):
    class_full_path = os.path.join(src_path, class_label)
    total_paths = os.listdir(class_full_path)
    np.random.shuffle(total_paths)
    split_ratio = data_ratio * len(total_paths)


    train_paths = []
    val_paths = []
    test_paths = []

    for _ in range(int(split_ratio[0])):
        train_paths.append(os.path.join(class_full_path, total_paths.pop()))

    for _ in range(int(split_ratio[1])):
        val_paths.append(os.path.join(class_full_path, total_paths.pop()))

    for i in range(len(total_paths)):
        total_paths[i] = os.path.join(class_full_path, total_paths[i])
    
    total_train_paths.append(train_paths)
    total_val_paths.append(val_paths)
    total_test_paths.append(total_paths)


In [11]:
assert len(total_train_paths) == len(total_val_paths) == len(total_test_paths) == 1000
assert len(total_train_paths[0]) == 32
assert len(total_val_paths[0]) == 8
assert len(total_test_paths[0]) == 10

## Save as json files

In [12]:
import json

src_path = os.path.join(root, "images/sources/imagenet")

save_json(total_train_paths, os.path.join(src_path, "total_train_paths.json"))
save_json(total_val_paths, os.path.join(src_path, "total_val_paths.json"))
save_json(total_test_paths, os.path.join(src_path, "total_test_paths.json"))

## download from google drive using gdown

In [13]:
!pip install git+https://github.com/giuliano-oliveira/gdown_folder.git

Collecting git+https://github.com/giuliano-oliveira/gdown_folder.git
  Cloning https://github.com/giuliano-oliveira/gdown_folder.git to /tmp/pip-req-build-y13yea4i
  Running command git clone --filter=blob:none -q https://github.com/giuliano-oliveira/gdown_folder.git /tmp/pip-req-build-y13yea4i
  Resolved https://github.com/giuliano-oliveira/gdown_folder.git to commit a3fb6565e56b8294994f1faa2534a5a2a31a7b94
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing wheel metadata (pyproject.toml) ... [?25ldone
[?25hCollecting filelock
  Downloading filelock-3.4.2-py3-none-any.whl (9.9 kB)
Collecting pathlib2
  Downloading pathlib2-2.3.6-py2.py3-none-any.whl (17 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.10.0-py3-none-any.whl (97 kB)
     |████████████████████████████████| 97 kB 3.4 MB/s            
[?25hCollecting soupsieve>1.2
  Downloadin

In [None]:
import gdown
target_path = "images/sources/hedge_masks"
os.makedirs(target_path, exist_ok=True)

gdrive_link = "https://drive.google.com/uc?id=1jvArkVStVgPIp33JEtVI-fT03wpqgtYa"
output = "images/sources/hedge_masks.7z"

gdown.download(gdrive_link, output=output, quiet=True)

Access denied with the following error:



 	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1jvArkVStVgPIp33JEtVI-fT03wpqgtYa 



In [None]:
! p7zip -d images/sources/hedge_masks.7z

In [None]:
for i in range(1, 9):
    current_label = str(round(i/10, 1))
    shutil.move(current_label, os.path.join("images/sources/hedge_masks", current_label))

# Other Utils

## zip google drive folders

In [None]:
path = "/content/drive/MyDrive/Image_Dehedger_Project/images"

In [None]:
!zip -r images.zip /content/drive/MyDrive/Image_Dehedger_Project/images

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_original_label/684/ILSVRC2012_val_00034033.JPEG (deflated 0%)
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_original_label/684/ILSVRC2012_val_00035352.JPEG (deflated 0%)
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_original_label/684/ILSVRC2012_val_00035478.JPEG (deflated 0%)
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_original_label/684/ILSVRC2012_val_00036327.JPEG (deflated 0%)
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_original_label/684/ILSVRC2012_val_00037499.JPEG (deflated 0%)
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_original_label/684/ILSVRC2012_val_00041009.JPEG (deflated 0%)
  adding: content/drive/MyDrive/Image_Dehedger_Project/images/sources/imagenet/data_origi

## upload local files

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

## Sync to Google Drive

In [None]:
drive.flush_and_unmount()