# Mount Google Drive

Mount your Google Drive so it can be used to store the dataset (optionally, datasets hosted on MEGA can be saved to it) and to save the results.

In [None]:
from google.colab import drive
# Set the private debug flag on the mount helper to False before mounting.
drive.mount._DEBUG = False
# Mount the drive at /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is already mounted — confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

# Move Dataset from Mega to Drive

Move the file at the provided link (provide your username and password if it is not a public link) to the Google Drive that was mounted in the previous step.

### Install Mega

In [None]:
import sys, os, urllib.request
import time
import subprocess
import contextlib
from IPython.display import clear_output
from functools import wraps
import errno
import os
import signal
import subprocess
import shlex
import glob

# Download the OneClickRun helper module into ~/.ipython once; it provides the
# runSh and loadingAn helpers imported below.
# NOTE(review): this fetches code from the `master` branch of a third-party
# repository and then imports (executes) it — consider pinning to a commit.
HOME = os.path.expanduser("~")
if not os.path.exists(f"{HOME}/.ipython/ocr.py"):
    hCode = "https://raw.githubusercontent.com/biplobsd/" \
                "OneClickRun/master/res/ocr.py"
    urllib.request.urlretrieve(hCode, f"{HOME}/.ipython/ocr.py")

# NOTE(review): this import only resolves if ~/.ipython is on sys.path —
# confirm for the target runtime.
from ocr import (
    runSh,
    loadingAn,
)

# Install MEGAcmd only when /usr/bin/mega-cmd is not already present.
if not os.path.exists("/usr/bin/mega-cmd"):
    loadingAn()
    print("Installing MEGA ...")
    runSh('sudo apt-get -y update')
    # Packages installed ahead of the MEGAcmd .deb (presumably its runtime dependencies).
    runSh('sudo apt-get -y install libmms0 libc-ares2 libc6 libcrypto++6 libgcc1 libmediainfo0v5 libpcre3 libpcrecpp0v5 libssl1.1 libstdc++6 libzen0v5 zlib1g apt-transport-https')
    # Download the Debian 9 build of MEGAcmd into the apt cache and install it with dpkg.
    runSh('sudo curl -sL -o /var/cache/apt/archives/MEGAcmd.deb https://mega.nz/linux/MEGAsync/Debian_9.0/amd64/megacmd-Debian_9.0_amd64.deb', output=True)
    runSh('sudo dpkg -i /var/cache/apt/archives/MEGAcmd.deb', output=True)
    print("MEGA is installed.")
    # Clear the cell output once the installation has finished.
    clear_output()


### provide URL and Mega ID

Add the URLs to fetch from MEGA. Also provide the username and password if the files are not public, or if you want to use the download quota of a MEGA pro account.

In [None]:
# The MEGA username and password are optional: they are only used to log in,
# which gives more download quota if you have a MEGA pro account.
# NOTE(review): avoid leaving real credentials in a shared copy of this notebook.
MEGA_USERNAME = ""  # optional
MEGA_PASSWORD = ""  # optional

# MEGA links of the two datasets downloaded by this notebook.
TAGGED_DATASET_URL = "https://mega.nz/file/kc40hbgL#WH0rLmRwkJodzD4yazWWAlnmp_IJQiLG0jx2hhDJLhY"
OTHER_DATASET_URL  = "https://mega.nz/file/MNo0ABSB#8rDqevxdQmtaNFKjQ3RS9v2pF-jL8xzmM9LLxIfAhG0"
# Download directory; when left empty, a later cell falls back to "downloads".
OUTPUT_PATH = ""

In [None]:
# End-of-line markers used to split subprocess output into lines:
# Unix, Windows and old Macintosh end-of-line
newlines = ['\n', '\r\n', '\r']

def latest_file(folder):
    """Return the most recently changed entry directly inside ``folder``.

    :param folder: directory to look into (not searched recursively).
    :type folder: str
    :returns: path of the entry with the greatest ``os.path.getctime``.
    :rtype: str
    :raises ValueError: if ``folder`` contains no matching entries.
    """
    # Escape the folder so glob metacharacters in its name are taken literally,
    # and join with os.path so an empty folder means the current directory
    # rather than the filesystem root.
    pattern = os.path.join(glob.escape(folder), '*')  # * means all
    candidates = glob.glob(pattern)
    if not candidates:
        raise ValueError(f"No files found in folder: {folder!r}")
    return max(candidates, key=os.path.getctime)

def unbuffered(proc, stream='stdout'):
    """Yield the output of ``proc`` line by line as soon as each line ends.

    The stream is read one character at a time, so a line is yielded as soon
    as its terminator arrives. The stream is closed when the generator ends.

    :param proc: process object exposing the stream attribute and ``poll()``.
    :param stream: name of the attribute on ``proc`` to read from.
    :returns: generator of lines without their end-of-line marker.
    """
    pipe = getattr(proc, stream)
    with contextlib.closing(pipe):
        while True:
            char = pipe.read(1)
            # Nothing left to read and the process has exited: stop (don't loop forever).
            if char == '' and proc.poll() is not None:
                break
            chars = []
            while char not in newlines:
                # Same exit check while in the middle of a line.
                if char == '' and proc.poll() is not None:
                    break
                chars.append(char)
                char = pipe.read(1)
            yield ''.join(chars)


def transfer(url):
    """Download ``url`` from MEGA into ``OUTPUT_PATH`` and echo the progress.

    Runs ``mega-get`` as a subprocess with stderr merged into stdout and
    prints each output line as it arrives. A non-zero exit status is reported
    with a warning rather than ignored, because the following steps pick up
    whatever file is newest in the download folder.

    :param url: MEGA link to download.
    :type url: str
    """
    cmd = ["mega-get", url, OUTPUT_PATH]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        # Make all end-of-lines '\n'
        universal_newlines=True,
    )
    for line in unbuffered(proc):
        print(line)

    # Reap the process and surface a failed download.
    exit_code = proc.wait()
    if exit_code != 0:
        print(f"[WARNING] mega-get exited with code {exit_code} for {url}")
        
# Fall back to a local "downloads" folder when no output path was configured,
# and make sure the chosen download directory exists (a user-supplied path
# was previously never created).
if not OUTPUT_PATH:
    OUTPUT_PATH = "downloads"
os.makedirs(OUTPUT_PATH, exist_ok=True)


class TimeoutError(Exception):
    """Raised by the ``timeout`` decorator's alarm handler.

    Note: this name shadows Python's built-in ``TimeoutError`` in this notebook.
    """

def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
    """Build a decorator that aborts the wrapped call after ``seconds`` seconds.

    The limit is enforced with ``SIGALRM``. The SIGALRM handler that was
    installed before the call is put back afterwards, so using the decorator
    does not permanently replace it.

    :param seconds: time limit in whole seconds.
    :type seconds: int
    :param error_message: message carried by the raised ``TimeoutError``.
    :type error_message: str
    :returns: decorator to apply to the function that should be time-limited.
    :raises TimeoutError: from the wrapped call, when the alarm fires.
    """
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        @wraps(func)
        def wrapper(*args, **kwargs):
            previous_handler = signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                return func(*args, **kwargs)
            finally:
                # Cancel the pending alarm, then restore the earlier handler.
                signal.alarm(0)
                # signal.signal() returns None when the earlier handler was
                # not installed from Python; it cannot be re-installed then.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)

        return wrapper

    return decorator


@timeout(10)
def runShT(args):
    """Run ``args`` through ``runSh`` with output enabled, under a 10-second limit.

    :param args: command passed unchanged to ``runSh``.
    :returns: whatever ``runSh`` returns.
    """
    result = runSh(args, output=True)
    return result

def login():
    """Log in to MEGA with ``MEGA_USERNAME`` and ``MEGA_PASSWORD``.

    Both values are shell-quoted so that spaces or shell metacharacters in the
    credentials cannot split or alter the ``mega-login`` command line.
    """
    username = shlex.quote(MEGA_USERNAME)
    password = shlex.quote(MEGA_PASSWORD)
    runShT(f"mega-login {username} {password}")

# If both the username and the password are provided, log in to MEGA.
if MEGA_USERNAME and MEGA_PASSWORD:
    try:
        login()
    except TimeoutError:
        # The login call did not finish in time; print the current session instead.
        runSh('mega-whoami', output=True)
else:
    print("Please Input your Mega IDs.")

# Download each dataset and remember the newest entry of the download folder.
# Use OUTPUT_PATH (where transfer() writes) instead of a hard-coded folder.
transfer(TAGGED_DATASET_URL)
tagged_dataset_path = latest_file(OUTPUT_PATH)
transfer(OTHER_DATASET_URL)
other_dataset_path = latest_file(OUTPUT_PATH)

# Fetch & Clone Repo

Clone the `image-tagging-tools` repo as it has all the required utils and codes from preprocessing the image datasets to training the models and using them to classify the images. 

In [None]:
!git clone https://github.com/kk-digital/image-tagging-tools.git

# Preprocess the dataset images (Stage 1)

Use the `image-dataset-processor` utility from the previously cloned repo to process a dataset of images (either a path to a directory of images or to an archived dataset). It computes the metadata of the images along with their CLIP embeddings and writes the result as a JSON file into `output_folder`.

### Install requirements 

In [None]:
%%capture
%pip install ascii_graph open_clip_torch patool fire

### Import the module/Utility 

In [None]:
import sys
sys.path.insert(0, '../image-tagging-tools/stage1/')
sys.path.insert(0, '../image-tagging-tools/stage2/')
sys.path.insert(0, '../image-tagging-tools/stage3/')
sys.path.insert(0, '../image-tagging-tools/stage4/')
from ImageDatasetProcessor import ImageDatasetProcessor
from classify import main as classify_main
from classify_zip import main as classify_main_zip
from classify_zip import zip_gen as zip_generator
from train import main as train_main
from classify_helper_functions import *
import patoolib
import shutil


def is_archive(path: str) -> bool:
    """Tell whether ``path`` is recognised as an archive.

    :param path: The file path to check.
    :type path: str
    :returns: `True` if ``patoolib.get_archive_format`` accepts the path,
        `False` if that call raises any exception.
    :rtype: bool
    """
    try:
        patoolib.get_archive_format(path)
    except Exception:
        return False
    else:
        return True

def unzip_folder(folder_path: str):
    """Extract an archive next to itself and return the extracted path.

    :param folder_path: path to the archived file.
    :type folder_path: str
    :returns: the most recently changed entry in the archive's directory
        after extraction, which is expected to be the extracted folder.
    :rtype: str
    """
    # dirname() is '' for a bare file name; use the current directory then,
    # because os.makedirs('') raises.
    dir_path = os.path.dirname(folder_path) or '.'
    os.makedirs(dir_path, exist_ok=True)

    print("[INFO] Extracting the archived file...")
    patoolib.extract_archive(folder_path, outdir=dir_path)
    print("[INFO] Extraction completed.")

    return latest_file(dir_path)


def clean_directory(dir_path: str, only_sub_dir: bool = False):
    """Validate the layout of a dataset directory; nothing is deleted.

    :param dir_path: directory whose direct entries are checked.
    :type dir_path: str
    :param only_sub_dir: when `True`, ``dir_path`` may only contain
        sub-folders; a plain file raises.
    :type only_sub_dir: bool
    :raises Exception: for a file next to the tag folders (when
        ``only_sub_dir``), for an empty folder, or for a sub-folder met while
        ``only_sub_dir`` is `False`.

    NOTE(review): ``ImageDatasetLoader`` is not imported explicitly in this
    notebook — confirm it is available (e.g. through the star import).
    """
    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)

        # A plain file: an error outside a tag folder, otherwise checked individually.
        if os.path.isfile(entry_path):
            if only_sub_dir:
                # RV: deletion code was removed since nothing should be deleted; raise an error instead.
                raise Exception (f'[ERROR: Input dataset contains file outside of tag folder: {entry_path}]')
            ImageDatasetLoader.check_file(entry_path)
            continue

        # An empty folder is reported, not removed.
        if not os.listdir(entry_path):
            raise Exception (f'[ERROR]: Input dataset contains empty folder: {entry_path}]')

        # Anything other than a sub-folder in sub-folder mode is an invalid layout.
        if not (os.path.isdir(entry_path) and only_sub_dir):
            raise Exception ('[ERROR]: Dataset format is possible invalid...]')

        # Move to the sub-directory and clean it.
        ImageDatasetLoader.clean_directory(entry_path)


### Unzipping and cleaning for the tagged images

In [None]:
# Extract the download when it is an archive; otherwise use it as it is, so
# that `tagged_images_folder` is always defined for the following cells.
if is_archive(tagged_dataset_path):
    tagged_images_folder = unzip_folder(tagged_dataset_path)
else:
    tagged_images_folder = tagged_dataset_path

# Validate the dataset layout: only tag sub-folders are allowed at the top level.
clean_directory(tagged_images_folder, only_sub_dir=True)

### set required variables by the utility

Initialize the required parameters needed by the dataset preprocessor utility and they are described as follows: 

* `input_folder` _[str]_ -  path to the directory containing sub-folders of each tag.
* `output_folder` _[str]_ - path to the directory where to save the files into it.

* `clip_model` _[str]_ - CLIP model to be used

* `pretrained` _[str]_ - the pre-trained model to be used for CLIP
* `batch_size` _[int]_ -  number of images to process at a time
* `num_threads` _[int]_ - the number of threads to be used in this process

* `device` _[str]_ -  the device to be used in computing the CLIP embeddings, if `None` is provided then `cuda` will be used if available


In [None]:
# Parameters passed positionally to ImageDatasetProcessor.process_dataset below.
dataset_path = tagged_images_folder  # folder holding the tagged images
output_folder = "./output"  # where the generated files are written
tagged_dataset = True
clip_model = "ViT-B-32"  # CLIP model to be used
pretrained = 'openai'  # pre-trained weights to be used for CLIP
batch_size = 32  # number of images to process at a time
num_threads = 4
device = None  # None: `cuda` will be used if available

### Run the Preprocessor

In [None]:
# Run stage 1 on the configured dataset; all arguments are passed positionally.
ImageDatasetProcessor.process_dataset(
    dataset_path,
    output_folder,
    tagged_dataset,
    clip_model,
    pretrained,
    batch_size,
    num_threads,
    device,
)

### Train script variables


* `metadata_json` _[string]_ - _[required]_ - The path to the metadata json file. 
* `tag_to_hash_json` _[string]_ - _[required]_ - The path to tag-to-hash json file. 

* `output` _[string]_ - _[optional]_ - The path to the output directory.
* `test_per` _[float]_ - _[optional]_ - The percentage of the test images from the dataset, default = 0.1 


In [None]:
# The preprocessor writes into `output_folder` ("./output"), and the later
# cells read "./output/input-metadata.json" and "./output/models", so the
# training inputs and outputs must use that same folder. This also fixes the
# "input-metadta.json" typo.
metadata_json = './output/input-metadata.json'
tag_to_hash_json = './output/input-tag-to-image-hash-list.json'
output_dir = './output'
test_per = 0.1  # share of the dataset held out as test images

### Run training script

In [None]:
# Train using the paths and the test split configured in the previous cell.
train_main(
    metadata_json=metadata_json,
    tag_to_hash_json=tag_to_hash_json,
    output_dir=output_dir,
    test_per=test_per,
)

### Listing all the models

In [None]:
# List all the models available for classification under ./output/models.
list_models('./output/models') # listing all the models we have for classification. 

### Classification script variables

* `directory` _[string]_ - _[required]_ - The path to the images' folder or images' .zip file. 
* `metadata_json` _[string]_ - _[required]_ - The path to the metadata json file for CLIP embeddings. 
* `output` _[string]_ - _[optional]_ - The path to the output directory for the inference results. 
* `model` _[string]_ - _[optional]_ - The path to the models' .pkl files directory or single .pkl file model.
* `output_bins` _[int]_ - _[optional]_ -  The number of bins of the results for each model.


#### Classification for other-validation folder (pre-computed CLIP embeddings)

In [None]:
# Inputs for classifying the "other-validation" sub-folder of the tagged dataset.
folder_path    = os.path.join(tagged_images_folder,"other-validation")
output_dir     = "./classification_other_validation"
json_file_path =  "./output/input-metadata.json"  # metadata json holding the CLIP embeddings
bins_number    = 10
model_path     = "./output/models"  # a directory, so all the models in it are used

In [None]:
# Classify the configured folder with the configured models.
classify_main(
    folder_path=folder_path,
    output_dir=output_dir,
    json_file_path=json_file_path,
    bins_number=bins_number,
    model_path=model_path,
)

### Single Image -- Single Model Classification Example

In [None]:
# Image file to be classified
folder_path    = "/content/downloads/pixel-art-tagged-v2/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png"
# Both paths must live under "./output" (they previously read ".output/...",
# a different, hidden folder that does not hold the metadata file).
output_dir     = "./output/classification_single_image_single_model"
json_file_path = "./output/input-metadata.json"
bins_number    = 10
# A single .pkl file: only this model is used.
model_path     = "./output/models/model-ovr-logistic-regression-tag-not-pixel-art-digital.pkl"

In [None]:
# Classify the single image with the single configured model.
classify_main(
    folder_path=folder_path,
    output_dir=output_dir,
    json_file_path=json_file_path,
    bins_number=bins_number,
    model_path=model_path,
)

### Single image -- All the models classification example

In [None]:
# Image file to be classified 
folder_path    = "/content/downloads/pixel-art-tagged-v2/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png"
output_dir     = "./output/classification_single_image_all_models"
json_file_path = "./output/input-metadata.json"
bins_number    = 10
# A directory instead of a single .pkl file, so every model in it is applied.
model_path = "./output/models"

In [None]:
# Classify the single image with every model in the models directory.
classify_main(
    folder_path=folder_path,
    output_dir=output_dir,
    json_file_path=json_file_path,
    bins_number=bins_number,
    model_path=model_path,
)

### Model and Tag name Classification for Single Image

In [None]:
# Tag name and model type, used below to build the path of the model to load.
TAG_NAME   = 'not-pixel-art' # tag which you want to classify.
MODEL_TYPE = 'ovr-logistic-regression' # model type you want to use.

In [None]:
# Image file to be classified
folder_path    = "/content/downloads/pixel-art-tagged-v2/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png"
# Results belong under "./output" (this previously read ".output/...").
output_dir     = "./output/classification_single_image_custom_model"
json_file_path = "./output/input-metadata.json"
bins_number    = 10
# Build the model path from the chosen model type and tag name.
model_path = generate_model_path(
    './output/models',
    model_type=MODEL_TYPE,
    tag_name=TAG_NAME,
)

In [None]:
# Classify the single image with the model selected by tag name and model type.
classify_main(
    folder_path=folder_path,
    output_dir=output_dir,
    json_file_path=json_file_path,
    bins_number=bins_number,
    model_path=model_path,
)

#### Classification for .zip file full of images (takes more time)

In [None]:
# Inputs for classifying every image inside a .zip archive.
# NOTE(review): a later cell uses './dataset/subset_3.zip' — confirm which archive name is the intended one.
folder_path    = 'dataset/subset3_.zip'
output_dir     = './output/tagging_output_from_zip'
json_file_path =  "./output/input-metadata.json"
bins_number    = 5
model_path     = "./output/models" 

In [None]:
# Classify the images contained in the configured .zip archive.
classify_main_zip(
    folder_path=folder_path,
    output_dir=output_dir,
    json_file_path=json_file_path,
    bins_number=bins_number,
    model_path=model_path,
)

#### Classify Data in ZIP Archives for Single Tag Model and Run Function for Threshold Score

In [None]:
# Inputs for classifying a .zip archive with one single-tag model.
folder_path    = './dataset/subset_3.zip'
output_dir     = './output'
json_file_path = './output/input-metadata.json'
bins_number    = 5
model_path     = './output/models/model-ovr-svm-tag-pos-video-game-side-scrolling.pkl'
# Scores strictly above this threshold trigger `any_function_to_run` in the loop below.
th_score       = 0.2

In [None]:
sys.path.insert(0, '../image-tagging-tools/stage4/')
from classify_zip import zip_gen
from classify_zip_helper_functions import classify_to_bin, get_clip, load_json, create_models_dict, get_bins_array, create_out_folder

In [None]:
# Load the metadata json and the CLIP model.
metadata_json_obj = load_json(json_file_path)
clip_model, preprocess, device = get_clip(clip_model_type='ViT-B-32', pretrained='openai')

# Default to output/models when no model path was configured, then load the models.
if model_path is None:
    model_path = os.path.join('output', 'models')
models_dict = create_models_dict(model_path)
bins_array = get_bins_array(bins_number)

# Create the output folder; `output_dir` is replaced by the value returned here.
output_dir = create_out_folder(base_dir=output_dir)

In [None]:
def any_function_to_run(score):
    """Function to run when certain prob_score is met.

    Placeholder hook that does nothing and returns `None`; replace the body
    with the action to perform.

    :param score: the score that exceeded the threshold.
    """
    return None

In [None]:
# Go over each zip archive (currently only `folder_path`).
for file in [folder_path]:
    # zip_gen yields (image, file name) pairs from the archive.
    for img, img_file_name in zip_gen(file):
        img_out_dict = classify_to_bin(
            img,
            img_file_name,
            models_dict,
            metadata_json_obj,
            output_dir,
            bins_array,
            clip_model,
            preprocess,
            device,
        )
        # Only images that produced a result are scored.
        if img_out_dict is not None:
            # Score of the first classifier in the result.
            score = img_out_dict['classifiers_output'][0]['tag_prob']
            if th_score < score:
                any_function_to_run(score)

print("[INFO] Finished.")