<a href="https://colab.research.google.com/github/TheEssenceSentry/awesome-web-resources/blob/master/boilerplate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import re
import urllib
import hashlib as hl
from urllib.request import urlopen
from google.colab import drive


"""
file paths must start with "/content/",
or be relative to "/content/",
or be of the form f"{DRIVE_PATH}subfolder"
"""


# Locations inside the mounted Google Drive; each one ends with a slash so
# that sub-paths can be appended directly.
DRIVE_PATH = "/content/drive/My Drive/"
COLAB_PATH = DRIVE_PATH + "Colab Notebooks/"
DATA_PATH = COLAB_PATH + "data/"

# Binary size units: 2**10, 2**20 and 2**30 bytes.
KB = 1 << 10
MB = KB << 10
GB = MB << 10


def hash_file(file_location, hash_fns={"MD5": hl.md5, "SHA1": hl.sha1}, buffer_size=4*GB):
    """
    Hash a file by default with MD5 and SHA1 and 4GB chunks

    file_location: path of the file to hash; it is opened in binary mode.
    hash_fns: mapping of label -> constructor returning an object with
        ``update`` and ``hexdigest`` methods. The mapping is only read, never
        mutated, so sharing the default dict between calls is harmless.
    buffer_size: maximum number of bytes read (and held in memory) per chunk.

    Returns a dict mapping every label of ``hash_fns`` to the upper-case
    hexadecimal digest of the file contents.
    """
    hashes = [hash_fn() for hash_fn in hash_fns.values()]
    with open(file_location, "rb") as f:
        while True:
            data = f.read(buffer_size)
            if not data:
                break
            # An explicit loop is required here: map() is lazy in Python 3,
            # so an unconsumed map(...) would never call update() and every
            # digest would be the hash of empty input.
            for h in hashes:
                h.update(data)
    return {name: h.hexdigest().upper() for name, h in zip(hash_fns, hashes)}


def file_download_name(_url):
    """
    Find the download name of a file from its URL

    Opens ``_url`` and returns the last path component (``os.path.basename``)
    of the URL reported by the response, which may differ from ``_url`` when
    the request was redirected. Any query string is part of the returned name.

    Errors raised by ``urlopen`` propagate to the caller.
    """
    # Only the final URL is needed; the context manager closes the connection
    # instead of leaving the response open.
    with urlopen(_url) as response:
        final_url = response.url
    return os.path.basename(final_url)


def mount_drive(force=False):
    """
    Mount Google Drive at the relative mount point "drive".

    Prefer the Colab GUI when possible. ``force`` is forwarded to
    ``drive.mount`` as its ``force_remount`` argument.
    """
    mount_options = {"force_remount": force}
    drive.mount("drive", **mount_options)


def exists(file_location):
    """
    Check the existence of a file

    Returns None when ``file_location`` exists.
    Raises ValueError when nothing exists at ``file_location``.
    """
    # os.path.exists() reports a missing path by returning False, it does not
    # raise, so the result has to be tested explicitly.
    if not os.path.exists(file_location):
        raise ValueError("There is not a file in the specified location.")


def env_vars(vars=None, env_file_location=f"{DRIVE_PATH}.env"):
    """
    Load requested environment variables from a given .env file

    File format, as parsed below (surrounding whitespace is stripped):
      * ``NAME=VALUE`` defines a variable; the line is split on the first
        "=" so the value itself may contain "=".
      * ``[section]`` makes every following name ``section_NAME``.
      * a blank line ends the current section (clears the prefix).
      * lines starting with "#" and any other line are ignored.

    vars: names to export; None (or empty) exports every variable found.
    env_file_location: path of the .env file.

    Returns a dict with the variables that were copied into ``os.environ``.
    Raises ValueError when a line has an empty name or when the file defines
    no variables at all.
    """
    exists(env_file_location)
    prefix = ""
    all_vars = {}
    with open(env_file_location, "r") as lines:
        for raw_line in lines:
            # Lines read from a file keep their trailing newline; strip it so
            # values are clean and blank lines can be recognised.
            line = raw_line.strip()
            if not line:
                prefix = ""
            elif line.startswith("#"):
                continue
            elif "=" in line:
                name, _, val = line.partition("=")
                name = name.strip()
                if not name:
                    raise ValueError("Variables have to be of the form NAME=VALUE")
                if prefix:
                    name = f"{prefix}_{name}"
                all_vars[name] = val.strip()
            else:
                section = re.match(r"^\[(\w+)\]$", line)
                if section:
                    prefix = section.group(1)
    if not all_vars:
        raise ValueError(
            "The file does not containt valid variables of the form NAME=VALUE"
        )
    wanted = vars or all_vars
    updated_vars = {k: v for k, v in all_vars.items() if k in wanted}
    os.environ.update(updated_vars)
    return updated_vars


def clone_repo(repo_name, secret=False, user=None):
    """
    Clone a public or private GitHub repository

    repo_name: repository name under ``user``'s GitHub account.
    secret: when true, GITHUB_PASSWORD is loaded through ``env_vars`` and
        embedded in the clone URL.
    user: GitHub user name; when None, GITHUB_NAME is loaded through
        ``env_vars``.

    Runs ``git clone`` in the current working directory and returns
    ``/content/<repo_name>``. GITHUB_PASSWORD is always removed from
    ``os.environ`` afterwards.
    Raises AssertionError when ``/content/<repo_name>`` does not exist after
    the clone.
    """
    import subprocess
    import urllib.parse

    if user is None:
        env_vars(["GITHUB_NAME"])
        user = os.environ["GITHUB_NAME"]
    if secret:
        env_vars(["GITHUB_PASSWORD"])
        # safe="" also percent-encodes "/", which would otherwise end the
        # user-info part of the URL early.
        password = ":" + urllib.parse.quote(os.environ["GITHUB_PASSWORD"], safe="")
    else:
        password = ""
    clone_url = f"https://{user}{password}@github.com/{user}/{repo_name}.git"
    # NOTE(review): git presumably records this URL (including the password)
    # as the remote of the clone - confirm whether that is acceptable.
    try:
        # An argument list avoids shell interpretation of user/repo names.
        subprocess.run(["git", "clone", clone_url], check=False)
    finally:
        os.environ.pop("GITHUB_PASSWORD", None)
    target = f"/content/{repo_name}"
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if not os.path.exists(target):
        raise AssertionError("Error cloning the repository.")
    return target


def download(url, load_cookies=False, save_cookies=False):
    """
    Download a file from a URL
    Save or load cookies for more involved downloads

    url: address passed to ``wget -c``.
    load_cookies: read cookies from /tmp/cookies.txt.
    save_cookies: write cookies (session cookies included) to
        /tmp/cookies.txt; takes precedence over ``load_cookies``.

    Returns the name computed by ``file_download_name(url)``.
    """
    import subprocess

    file_name = file_download_name(url)
    command = ["wget", "-c"]
    if save_cookies:
        command += ["--save-cookies", "/tmp/cookies.txt", "--keep-session-cookies"]
    elif load_cookies:
        command += ["--load-cookies", "/tmp/cookies.txt"]
    command.append(url)
    # An argument list keeps quotes or other shell metacharacters in the URL
    # from being interpreted by a shell.
    subprocess.run(command, check=False)
    return file_name


def uncompress(file_location):
    """
    Extract compressed files (.zip, .tar, .tar.gz, .tar.bz2, .tar.lz, .tar.lzma,
    .tar.xz)

    file_location: path of the archive; a bare file name (no "/") is looked
        up in /content.

    The archive is extracted into ``/content/<base>``, where ``<base>`` is
    the file name without its archive extension(s), with any remaining dots
    replaced by spaces. The directory is created when missing.
    Raises ValueError when the extension is not one of the supported ones.
    """
    import subprocess

    if "/" in file_location:
        archive = file_location
        file_name = file_location.split("/")[-1]
    else:
        file_name = file_location
        archive = f"/content/{file_name}"
    filters = {
        "gz": "--gzip",
        "bz2": "--bzip2",
        "lz": "--lzip",
        "lzma": "--lzma",
        "xz": "--xz",
    }
    parts = file_name.split(".")
    ext = parts[-1]
    # A single if/elif chain: with two separate `if` statements a .zip file
    # would fall into the final `else` and be rejected.
    if ext == "zip":
        base = " ".join(parts[:-1])
        target = f"/content/{base}"
        command = ["unzip", archive, "-d", target]
    elif ext == "tar":
        base = " ".join(parts[:-1])
        target = f"/content/{base}"
        command = ["tar", "--get", f"--file={archive}", f"--directory={target}"]
    # The length guard keeps names without a dot from raising IndexError.
    elif len(parts) >= 2 and parts[-2] == "tar" and ext in filters:
        base = " ".join(parts[:-2])
        target = f"/content/{base}"
        command = [
            "tar",
            "--get",
            filters[ext],
            f"--file={archive}",
            f"--directory={target}",
        ]
    else:
        raise ValueError("Cannot uncompress the file.")
    os.makedirs(target, exist_ok=True)
    # An argument list avoids shell quoting problems with unusual file names.
    subprocess.run(command, check=False)


def github_id():
    """
    Identify yourself to GitHub

    Loads GITHUB_NAME and GITHUB_EMAIL through ``env_vars`` and stores them
    as the global git ``user.name`` and ``user.email`` settings.
    """
    import subprocess

    env_vars(["GITHUB_NAME", "GITHUB_EMAIL"])
    settings = (("user.email", "GITHUB_EMAIL"), ("user.name", "GITHUB_NAME"))
    for git_key, env_name in settings:
        # Passing the value as its own argument keeps names containing spaces
        # (e.g. "Jane Doe") intact; an unquoted shell string would split them.
        subprocess.run(
            ["git", "config", "--global", git_key, os.environ[env_name]],
            check=False,
        )


def drive_link_to_id(link):
    """
    Extract the link id string from different possible link formats

    Recognised inputs:
      * drive.google.com / docs.google.com links containing ``/file/d/<id>``
        or an ``id=<id>`` query parameter;
      * any other text containing a standalone 33-character id (this also
        covers a bare id).

    Ids are matched as word characters plus "-", since ids can contain
    hyphens that ``\\w`` alone would cut off.

    Returns the id string.
    Raises ValueError when no id can be found.
    """
    if "drive.google.com" in link or "docs.google.com" in link:
        for pattern in (r"/file/d/([\w-]+)", r"\Wid=([\w-]+)"):
            found = re.search(pattern, link)
            if found:
                return found.group(1)
    # Fallback: a token of exactly 33 id characters, not part of a longer run.
    bare = re.search(r"(?<![\w-])[\w-]{33}(?![\w-])", link)
    if bare:
        return bare.group(0)
    raise ValueError(f"Could not extract a Google Drive file id from {link!r}")


def big_drive_file(link, calculate_hash=False):
    """
    download Google Drive files (publicly shared with a link) that cannot be
    downloaded directly without confirmation because of their size (Google does
    not perform antivirus analysis on them)

    link: any form accepted by ``drive_link_to_id``.
    calculate_hash: when true, print the result of ``hash_file`` for the
        downloaded file.

    Steps: download the confirmation page (saving cookies), read the
    confirmation code and the real file name from it, download again with the
    code and the saved cookies, rename the result to the real file name, and
    delete the cookie file and the confirmation page.

    Returns the file name taken from the confirmation page.
    Raises IndexError when the page does not contain the expected
    confirmation code or file-name link.
    """
    file_id = drive_link_to_id(link)
    confirm_page = download(
        f"https://docs.google.com/uc?export=download&id={file_id}", save_cookies=True
    )
    with open(confirm_page, "r") as t:
        content = t.read()
    code = re.findall(r"confirm=(\w+)", content)[0]
    file_name = re.findall(r"docs.google.com\/open\?id\=\w+\">([^<]+)</a>", content)[0]
    downloaded = download(
        f"https://docs.google.com/uc?export=download&confirm={code}&id={file_id}",
        load_cookies=True,
    )
    # Rename/delete through the os module so that quotes in the names taken
    # from the page cannot break a shell command.
    os.replace(downloaded, file_name)
    for leftover in ("/tmp/cookies.txt", confirm_page):
        try:
            os.remove(leftover)
        except FileNotFoundError:
            # Already gone (e.g. it was the file that was just renamed).
            pass
    # Hashing reads the whole file again, so only do it when requested.
    if calculate_hash:
        print(hash_file(file_name))
    return file_name


def to_drive(file_location, path=""):
    """
    Copy a file from the local Colaboratory hard disk to your Google Drive
    storage

    file_location: source file; either starts with "/content/" or is taken
        as relative to "/content/".
    path: destination folder, appended to DRIVE_PATH; a trailing slash is
        optional.

    The copy keeps the source file name. The source file is left in place.
    """
    import subprocess

    file_name = file_location.split("/")[-1]
    # os.path.join inserts the separator only when `path` lacks a trailing
    # slash, so "sub" and "sub/" both end up as ".../sub/<file_name>".
    destination = os.path.join(f"{DRIVE_PATH}{path}", file_name)
    if file_location.startswith("/content/"):
        source = file_location
    else:
        source = f"/content/{file_location}"
    # An argument list keeps spaces and quotes in the paths intact.
    subprocess.run(["cp", source, destination], check=False)


In [None]:
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)