## Recursive GoIndex Downloader by atlonxp

This code was created and improved by adapting the code from pankaj260 https://colab.research.google.com/drive/1tmsLGuswIZIZ_oM35EMW8TbJ6pQPt1rY#scrollTo=3bCnUMUg_SoT&forceEdit=true&sandboxMode=true

**Features**
*   Recursive crawler (atlonxp)
*   Download all folders and files in a given url (atlonxp)
*   Download all folders and files in in sub-folders (atlonxp)
*   Adaptive delay in fetching url (atlonxp)
*   Store folders/files directly to your Google Drive (pankaj260)
*   Folders and files exclusion filters
*   Download queue supported
*   Auto-domain URL detection
*   API-based GoIndex crawler
*   Parallel/Multiple files downloader

**Version 2**:

	16 April 2020

	+ crawler_v2:
		* API-based GoIndex crawler
		* Collecting all urls to be downloaded
	+ parallel downloader
		* TQDM progress bar

**Version 1**:

	15 April 2020
	-   Added auto-domain URL detection
	-   Added simple download queue

	14 April 2020
		-   initial

In [0]:
# Mounting Google Drive, ignore this section if you don't want to 
# save on your Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install dependencies
!pip install requests tqdm

In [1]:
# Import dependencies

import json
import multiprocessing
import os
from random import randint
from time import sleep
from urllib import parse

import requests
import tqdm

In [2]:
OVERWRITE = True
MIN_DELAY = 2
MAX_DELAY = 5
def check_exclusion(name, exclusions):
    for exc in exclusions:
        if exc in name:
            return True
    return False


def crawler_v2(url, downloading_dict, path, level, exclusions, verbose=False):
    if not os.path.exists(path):
        os.mkdir(path)

    url = parse.urlparse(url)
    print(('  ' * level) + url.geturl())

    response = requests.post(url.geturl(), data={})
    files_dict = json.loads(response.text)['files']

    for file in files_dict:
        name = file['name']

        # if @name contains exclusion word, we ignore
        if check_exclusion(name, exclusions):
            continue

        if 'folder' in file['mimeType']:
            next_url = url.geturl() + parse.quote(name) + "/"
            next_path = os.path.join(path, name)
            crawler_v2(next_url, downloading_dict, next_path, level + 1, exclusions, verbose)
        else:
            name = file['name']
            if verbose:
                print('  ' * (level + 1) + name)

            downloading_dict.append({
                'folder': path,
                'filename': name,
                'filename_abs': os.path.join(path, name),
                'size': file['size'],
                'url': url.geturl() + parse.quote(name),
            })

    return downloading_dict


def download_agent(task, OVERWRITE=OVERWRITE):
    # Making multiple requests too quick can cause yourself banned, so let set random delay (1, 10)
    sleep(randint(MIN_DELAY, MAX_DELAY))

    folder = task['folder']
    filename_abs = task['filename_abs']
    url = task['url']

    if not os.path.exists(folder):
        os.mkdir(folder)
    if os.path.exists(filename_abs) or OVERWRITE:
        r = requests.get(url, stream=True)
        if r.status_code is not 200:
            return 1
        with open(filename_abs, 'ab+') as f:
            f.write(r.content)
    return 0

In [None]:
MAX_DOWNLOAD_TASKS = 8
exclusions = ['__MACOSX/']

destination = "/content/drive/My Drive/"
download_tasks = [
    {
        'folder': 'AI Product Manager Nanodegree v1.0.0',
        'url': 'https://lol.freecoronavirus.workers.dev/Udacity%20-%20Collections%20[300%20GB]/Nanodegrees/AI%20Product%20Manager%20ND%20v1.0.0/'
    },
    {
        'folder': 'Data Science Nanodegree v1.0.0',
        'url': 'https://lol.freecoronavirus.workers.dev/Udacity%20-%20Collections%20[300%20GB]/Nanodegrees/D.S.ND%20v1.0.0/'
    },
]

print('##################################')
print('# Crawling all downloadable urls #')
print('##################################', end='\n\n')
tasks = []
for task in download_tasks:
	tasks += crawler_v2(task['url'], tasks, os.path.join(destination, task['folder']), 0, exclusions)

# print(json.dumps(tasks, indent=2))

print('##################################')
print('# Downloading files and folders  #')
print('##################################', end='\n\n')
pool = multiprocessing.Pool(processes=MAX_DOWNLOAD_TASKS)  # Num of CPUs

failures = sum(tqdm.tqdm(pool.imap_unordered(download_agent, tasks), total=len(tasks)))
print('Total number of download failures:', failures)
pool.close()
pool.terminate()

print('\nAll done, Voila!')

##################################
# Crawling all downloadable urls #
##################################

https://lol.freecoronavirus.workers.dev/Udacity%20-%20Collections%20[300%20GB]/Nanodegrees/AI%20Product%20Manager%20ND%20v1.0.0/assets/css/
  https://lol.freecoronavirus.workers.dev/Udacity%20-%20Collections%20[300%20GB]/Nanodegrees/AI%20Product%20Manager%20ND%20v1.0.0/assets/css/fonts/
https://lol.freecoronavirus.workers.dev/Udacity%20-%20Collections%20[300%20GB]/Nanodegrees/AI%20Product%20Manager%20ND%20v1.0.0/assets/img/
[
  {
    "folder": "/Users/atlonxp/Desktop/recursive-goIndex-downloader/download/ABC/fonts",
    "filename": "KaTeX_AMS-Regular.ttf",
    "filename_abs": "/Users/atlonxp/Desktop/recursive-goIndex-downloader/download/ABC/fonts/KaTeX_AMS-Regular.ttf",
    "size": "71428",
    "url": "https://lol.freecoronavirus.workers.dev/Udacity%20-%20Collections%20[300%20GB]/Nanodegrees/AI%20Product%20Manager%20ND%20v1.0.0/assets/css/fonts/KaTeX_AMS-Regular.ttf"
  },
  {
    "

  5%|▍         | 12/262 [00:18<05:36,  1.35s/it]