# --

In [1]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """https://habr.com/ru/post/276725/
    способ создания красивых прогресс-баров
    см. также pypi.python.org/pypi/tqdm"""
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [2]:
# прогресс-бар числом - для случаев while и т.п.
from ipywidgets import HTML
from IPython.display import display
from time import sleep
label = HTML()
display(label)
for x in range(10):
    label.value = str(x)
    sleep(0.1)

HTML(value='')

In [3]:
# вычисляет полный размер объекта в памяти

import sys
from types import ModuleType, FunctionType
from gc import get_referents

# Custom objects know their class.
# Function objects seem to know way too much, including modules.
# Exclude modules as well.
BLACKLIST = type, ModuleType, FunctionType


def getmemsize(obj):
    """sum size of object & members."""
    if isinstance(obj, BLACKLIST):
        raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))
    seen_ids = set()
    size = 0
    objects = [obj]
    while objects:
        need_referents = []
        for obj in objects:
            if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids:
                seen_ids.add(id(obj))
                size += sys.getsizeof(obj)
                need_referents.append(obj)
        objects = get_referents(*need_referents)
    return size

In [4]:
# проверка на симлинки
# стандартная проверка в windows junction-ы не считает симлинками

from ctypes import *
from ctypes.wintypes import *

FILE_ATTRIBUTE_REPARSE_POINT = 0x00400
INVALID_FILE_ATTRIBUTES = 0xFFFFFFFF

kernel32 = WinDLL('kernel32')
GetFileAttributesW = kernel32.GetFileAttributesW
GetFileAttributesW.restype = DWORD
GetFileAttributesW.argtypes = (LPCWSTR,) #lpFileName In

def islink(path):
    result = GetFileAttributesW(path)
    if result == INVALID_FILE_ATTRIBUTES:
        raise WinError()
    return bool(result & FILE_ATTRIBUTE_REPARSE_POINT)

islink(r'D:\Users\feelus\Local Settings')

True

In [5]:
import os
os.path.split(r'C:\a\b\c')

('C:\\a\\b', 'c')

In [6]:
def slash_replacer(s):
    while s[0]=='\\':
        s = s[1:]
    while s[-1]=='\\':
        s = s[:-1]
    return s
def my_path_join_a(*ll):
    return '\\'.join([slash_replacer(s) for s in ll])
def my_path_join_l(ll):
    return '\\'.join([slash_replacer(s) for s in ll])
my_path_join_a('a\\c','b')

'a\\c\\b'

In [7]:
import os
from stat import *

def scan(rootpath):
    """сканирует заданную папку и возвращает дерево:
    имя_папки : {...}
    имя_файла : [размер]"""
    root = {}
    #rootpath = 'D:\\'
    total_size = 0
    ts_printed = 0

    label = HTML()
    display(label)
    
    for curdir, indirs, infiles in os.walk(rootpath):
        path = curdir
        folders = []
        while path != rootpath:
            #print(path)
            path, folder = os.path.split(path)
            folders.append(folder)

        #print(curdir)
        #print(folders)
        curroot = root
        try:
            for i in reversed(folders):
                curroot = curroot[i]
        except:
            #print('path is apsent in tree:',curdir)
            continue

        for d in indirs:
            cur_d = my_path_join_a(curdir,d)
            try:
                mode = os.stat(cur_d).st_mode
                islnk = islink(cur_d)
            except:
                print("can't check stat of dir:",cur_d)
                continue
            if not S_ISDIR(mode) or islnk:
                print("this dir is not dir:",cur_d,mode)
                continue
            curroot[d]={}
        for f in infiles:
            cur_f = my_path_join_a(curdir,f)
            try:
                st = os.stat(cur_f)
                mode = st.st_mode
                size = st.st_size
                islnk = islink(cur_d)
            except:
                print("can't check stat of file:",cur_f)
                continue
            if not S_ISREG(mode):
                print("this file is not file:",cur_f,mode)
                continue
            if islnk:
                print("this file is not usual:",cur_f,mode)
            curroot[f]=[size]
            
            total_size+=size
            if ts_printed<int(total_size/1024/1024/1024):
                ts_printed = int(total_size/1024/1024/1024)
                label.value = str(ts_printed)+' GB scanned'
            
    label.value = str(ts_printed)+' GB scanned - completed'
    return root


In [8]:
def tree_iterator(tree):
    """проходится по всему дереву
    на каждом узле(листе) возвращает пару (путь, значение)
    где путь - список имен, по которым надо добираться по дереву до значения"""
    for k,v in tree.items():
        if type(v)==dict:
            for path,v2 in tree_iterator(v):
                path.insert(0,k)
                yield path,v2
        else:
            yield [k],v

In [9]:
r = {'a':1,'b':{'c':2,'d':3}}
for x in tree_iterator(r):
    print(x)

(['a'], 1)
(['b', 'c'], 2)
(['b', 'd'], 3)


In [10]:
def get_subtree(root,path):
    """берет корень и путь, проходит по пути, и возвращает то, где оказался"""
    if len(path)==0: return root
    else: return get_subtree(root[path[0]],path[1:])
def make_subdir(root,path):
    """берет корень и путь, проходит по пути, и возвращает то, где оказался"""
    if len(path)==0: return root
    else: 
        if path[0] not in root: root[path[0]]={}
        return make_subdir(root[path[0]],path[1:])

In [11]:
import hashlib
def md5(fname):
    """вычисляет хеш файла по его пути"""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

In [12]:
def get_diplics_by_size(root):
    root_by_size={} # map: размер -> множество файлов
    # файл - tuple из пути к файлу
    for x,v in tree_iterator(root):
        size = v[0]
        file = tuple(x)
        #print(x,v,size)
        if size in root_by_size:
            root_by_size[size].add(file)
        else:
            root_by_size[size] = {file}

    # map размер -> кол-во повторений (если их >1)
    dups_by_size = [(i,len(root_by_size[i])) for i in reversed(sorted(root_by_size.keys())) \
                   if len(root_by_size[i])!=1]
    tot_s=0
    tot_ss = 0
    tot_n = 0
    for s,n in dups_by_size:
        tot_s+=s*(n-1)
        tot_ss+=s*(n)
        tot_n+=n
    print('by size:')
    print('размер повторений', '%.3f'%(tot_ss/1024/1024),'MB')
    print('размер лишнего   ', '%.3f'%(tot_s/1024/1024),'MB')
    print('кол-во повторений', tot_n)
    print('кол-во лишнего   ', tot_n-len(dups_by_size))
    return root_by_size,dups_by_size


In [13]:
def list_duplics_by_size(root_by_size,dups_by_size):
    """перечисляет все повторы по размеру"""
    for s,n in dups_by_size:
        for p in sorted(root_by_size[s]):
            yield s,p

In [17]:
def is_subpath(subpath,path):
    """сначала длинный, потом короткий"""
    if len(subpath)<len(path):
        return False
    for i in range(len(path)):
        if subpath[i]!=path[i]:
            return False
    return True

In [79]:
# загружаем хэши
import json, codecs

def load_hashes(hash_path,prefix):
    """загружаем хэши, вычисляем хэши, сохраняем хэши"""
    try:
        with codecs.open(hash_path,'r', encoding='utf-8') as file:
            old_root = file.read()
            old_root = json.loads(old_root)
            print('readed',hash_path)
            print(old_root.keys())
    except BaseException as e:
        print(e)
        old_root = {}
    return old_root

def calc_hashes(root,old_root,prefix,root_by_size,dups_by_size):
    #assert len(prefix)>=1
    if type(prefix)==set:
        if len(prefix)==1:
            prefix = next(iter(prefix))
        else:
            prefix_list = prefix
            prefix = []
    #print('prefix=',prefix)
    def del_old_hashes(root,old_root):
        for_del = set()
        for k in old_root:
            if k not in root:
                #for_del.add(k)#del old_root[k]
                continue
            if type(root[k])==dict and type(old_root[k])==dict:
                del_old_hashes(root[k],old_root[k])
            elif type(root[k])==list and type(old_root[k])==list:
                if root[k][0]!=old_root[k][0]:
                    for_del.add(k)#del old_root[k]
                elif len(old_root[k])==2 and len(root[k])==1:
                    root[k].append(old_root[k][1])
            else:
                for_del.add(k)#del old_root[k]
        for k in for_del:
            del old_root[k]
    del_old_hashes(root,old_root)

    # вычисляем хэши
    tot_ss = 0
    tot_n = 0
    counter2 = 0

    label = HTML()
    display(label)

    calculated = set()
    for s,p in list_duplics_by_size(root_by_size,dups_by_size):
        tot_ss += s
        tot_n+=1
        counter2+=1
        p = list(prefix)+list(p)
        try:
            old_fs = get_subtree(old_root,p)
            if len(old_fs)>=2:
                hashh = old_fs[1]
            else:
                fs = get_subtree(root,p)
                if len(fs)>=2:
                    hashh = fs[1]
                else: raise

        except:
            #hashh = md5(os.path.join(*p))
            try:
                tmp_p = my_path_join_l(p)
                calculated.add(tmp_p)
                hashh = md5(tmp_p)
            except BaseException as e:
                print(e)
                continue
            if counter2%10==0 or tot_n<10:
                label.value = str(tot_n)+' files,  ' +'%.3f'%(tot_ss/1024/1024)+' MB'
        fs = get_subtree(root,p)
        if len(fs) == 1: fs.append(hashh)
        else:            fs[1] = hashh
        #old_fs = get_subtree(old_root,p)
        #if len(old_fs) == 1: old_fs.append(hashh)
        #else:            old_fs[1] = hashh
        #print(hashh+'\t'+my_path_join_a(rootpath_YD,*p))
    label.value = str(tot_n)+' files,  ' +'%.3f'%(tot_ss/1024/1024)+' MB - completed'
    return calculated

def unload_hashes(root,hash_path,prefix):
    #assert len(prefix)>=1
    if type(prefix)==set:
        if len(prefix)==1:
            prefix = next(iter(prefix))
        else:
            prefix_list = prefix
            prefix = []
    # сохраняем хэши
    if len(prefix)>0:
        tmp = make_subdir(old_root,prefix[:-1])
        subroot = get_subtree(root,prefix)
        tmp[prefix[-1]] = subroot
    else:
        for prefix in prefix_list:
            tmp = make_subdir(old_root,prefix[:-1])
            subroot = get_subtree(root,prefix)
            tmp[prefix[-1]] = subroot
    try:
        print('start writing')
        with codecs.open(hash_path,'w', encoding='utf-8') as file:
            s = json.dumps(old_root,indent='\t',ensure_ascii=False)
            file.write(s)
            print('writed',hash_path)
    except BasicException as e:
        print('start writing with exception',e)
        with codecs.open(hash_path,'w', encoding='utf-8') as file:
            s = json.dumps(old_root,indent='\t',ensure_ascii=False)
            file.write(s)
            print('writed',hash_path)


In [15]:
def get_duplics_by_hash(root,root_by_size,dups_by_size):
    """повторы по хэшам (и размеру)"""

    dups_by_hash = {}
    for s,n in dups_by_size:
        for p in sorted(root_by_size[s]):
            fdata = get_subtree(root,p)
            if len(fdata)<2: continue
            if len(fdata)>2: del fd[2:]
            hashh = fdata[1]
            if (s,hashh) in dups_by_hash:
                dups_by_hash[(s,hashh)].add(p)
            else:
                dups_by_hash[(s,hashh)] = {p}
    dups_by_hash = {s_h:pp for (s_h,pp) in dups_by_hash.items() if len(pp)>1}
    tot_s=0
    tot_ss = 0
    tot_n = 0
    for s_h,pp in dups_by_hash.items():
        n = len(pp)
        s = s_h[0]
        tot_s+=s*(n-1)
        tot_ss+=s*(n)
        tot_n+=n
    print('by hash:')
    print('размер всего  ', '%.3f'%(tot_ss/1024/1024),'MB')
    print('размер лишнего', '%.3f'%(tot_s/1024/1024),'MB')
    print('кол-во повторений', tot_n)
    print('кол-во лишнего   ', tot_n-len(dups_by_hash))
    return dups_by_hash

In [16]:
def ignore_files(dups_by_hash):
    for k in ignored_files:
        if k in dups_by_hash: del dups_by_hash[k]

    tot_s=0
    tot_ss=0
    tot_n=0
    for s_h,pp in dups_by_hash.items():
        n = len(pp)
        s = s_h[0]
        tot_s+=s*(n-1)
        tot_ss+=s*(n)
        tot_n+=n
    print('by hash, ignore files:')
    print('размер всего  ', '%.3f'%(tot_ss/1024/1024),'MB')
    print('размер лишнего', '%.3f'%(tot_s/1024/1024),'MB')
    print('кол-во повторений', tot_n)
    print('кол-во лишнего   ', tot_n-len(dups_by_hash))
    

In [18]:
def ignore_dirs(root,prefix,dups_by_hash):
    if type(prefix)==set:
        if len(prefix)==1:
            prefix = next(iter(prefix))
        else:
            prefix_list = prefix
            prefix = []
    print('prefix=',prefix)
    def ignore_subdirs(subroot,cp): # common path
        #print('start:',cp)
        for k,f in subroot.items():
            if type(f)==dict:
                ignore_subdirs(f,cp)
            elif type(f)==list and len(f)>=2:
                sh = tuple(f[:2]) # size hash
                same = True
                if sh in dups_by_hash:
                    #print('try to del',sh)
                    #print('\t',cp)
                    for path in dups_by_hash[sh]:
                        path = list(prefix)+list(path)
                        #print('\t',path)
                        if not is_subpath(path,cp):
                            same = False
                            break
                    if same:
                        #print('del')
                        del dups_by_hash[sh]
    for d in ignored_dirs:
        try:
            d = d.split('\\')
            #try:
            subroot = get_subtree(root,d)
            #except KeyError:
            #    print('key error')
            #except BaseException as e:
            #    print('111',type(e),e)
            ignore_subdirs(subroot,d)
        except KeyError:
            pass
            #print('key error')
        except BaseException as e:
            print(type(e),e)
        
    tot_s=0
    tot_ss=0
    tot_n=0
    for s_h,pp in dups_by_hash.items():
        n = len(pp)
        s = s_h[0]
        tot_s+=s*(n-1)
        tot_ss+=s*(n)
        tot_n+=n
    print('by hash, ignore dirs:')
    print('размер всего  ', '%.3f'%(tot_ss/1024/1024),'MB')
    print('размер лишнего', '%.3f'%(tot_s/1024/1024),'MB')
    print('кол-во повторений', tot_n)
    print('кол-во лишнего   ', tot_n-len(dups_by_hash))
    

In [19]:
def del_void_dirs(root,path):
    #print(path)
    fordel = set()
    for k,f in root.items():
        if type(f)==dict:
            del_void_dirs(f,path+[k])
            #print(k,len(f))
            if len(f)==0:
                fordel.add(k)
    for k in fordel:
        p = my_path_join_l(path+[k])
        try: 
            os.rmdir(p)
            print('del dir',p)
            del root[k]
        except OSError as e:
            print(e,p)

def del_dir(root,prefix,dups_by_hash,base_dir,del_dir, stop_size=0):
    if type(prefix)==set:
        if len(prefix)==1:
            prefix = next(iter(prefix))
        else:
            prefix_list = prefix
            prefix = []
    print('prefix=',prefix)
    base_dir = base_dir.split('\\')
    del_dir  =  del_dir.split('\\')
    try:
        with codecs.open('deleted.json','r', encoding='utf-8') as file:
            del_root = file.read()
            del_root = json.loads(del_root)
            print('readed',hash_path)
            print(del_root.keys())
    except BaseException as e:
        print(e)
        del_root = {}

    def save_deleted(del_root,path,sh):
        if len(path)==1:
            del_root[path[0]]=sh
        else:
            if path[0] not in del_root:
                del_root[path[0]]={}
            save_deleted(del_root[path[0]],path[1:],sh)
    def del_subdir(base_root,base_path): # рекурсивно проходимся по базовой директории
        fordel_gr = set()
        for k,f in base_root.items():
            if type(f)==dict:
                del_subdir(f,base_path+[k])
            elif type(f)==list and len(f)>=2: # если указан хеш
                sh = tuple(f[:2]) # size hash
                if sh[0]>=stop_size and sh in dups_by_hash:        # и он содержится в дубликатах
                    #print('try to del',sh)
                    #print('\t',cp)
                    fordel = set()
                    for path in dups_by_hash[sh]: # по всем файлам этого дубликата
                        path = list(prefix)+list(path)
                        #print('check')
                        #print('\t',path)
                        #print('\t',is_subpath(path,base_dir),base_dir)
                        #print('\t',is_subpath(path,del_dir),del_dir)
                        # проверить, что путь не содержится в base_dir и содержится в del_dir
                        if not is_subpath(path,base_dir) and is_subpath(path,del_dir):
                            fordel.add(tuple(path[len(prefix):]))
                            p = my_path_join_l(path)
                            # удаляем с диска
                            try: 
                                os.remove(p)
                                print('del',sh,p)
                                save_deleted(del_root,path,sh)
                                # удаляем из root
                                try:
                                    tmp = get_subtree(root,path[:-1])
                                    del tmp[path[-1]]
                                except KeyError:
                                    print('уже удален из root',p)
                            except OSError as e:
                                print(e,p)
                    # удаляем из dups_by_hash
                    #print('from',dups_by_hash[sh])
                    #print('del',fordel)
                    dups_by_hash[sh] = dups_by_hash[sh] - fordel
                    #print(sh,len(dups_by_hash[sh]))
                    if len(dups_by_hash[sh])<=1:
                        fordel_gr.add(sh)
        for sh in fordel_gr:
            del dups_by_hash[sh]
            
    del_subdir(get_subtree(root,base_dir),base_dir)

    try:
        print('start writing')
        with codecs.open('deleted.json','w', encoding='utf-8') as file:
            s = json.dumps(del_root,indent='\t',ensure_ascii=False)
            file.write(s)
            print('writed','deleted.json')
    except BasicException as e:
        print('start writing with exception',e)
        with codecs.open('deleted.json','w', encoding='utf-8') as file:
            s = json.dumps(del_root,indent='\t',ensure_ascii=False)
            file.write(s)
            print('writed','deleted.json')

    del_void_dirs(get_subtree(root,del_dir),del_dir)
    if len(del_dir)>1 and len(get_subtree(root,del_dir))==0:
        p = my_path_join_l(del_dir)
        try: 
            os.rmdir(p)
            print('del dir',p)
            del get_subtree(root,del_dir[:-1])[del_dir[-1]]
        except OSError as e:
            print(e,p)
    
    tot_s=0
    tot_ss=0
    tot_n=0
    for s_h,pp in dups_by_hash.items():
        n = len(pp)
        s = s_h[0]
        tot_s+=s*(n-1)
        tot_ss+=s*(n)
        tot_n+=n
    print('by hash after deletion:')
    print('размер всего  ', '%.3f'%(tot_ss/1024/1024),'MB')
    print('размер лишнего', '%.3f'%(tot_s/1024/1024),'MB')
    print('кол-во повторений', tot_n)
    print('кол-во лишнего   ', tot_n-len(dups_by_hash))
    

In [88]:
ignored_files = {
    (4_021_049, "323c0fd51071400b51eedb1be90a8188" ), # программы-установщики\_крякнутые\activators\222-3927\
    (7_163_744, 'db7796e33b0a6925fa0d2f5a14c3b0c0'),
    (7_161_696, "0676e5f63b467b2520b5b794ee9cac1e" ),
    (7_161_696, "c6171d6673c80f2d9f53229fc8e74e93" ),
    (3_141_496, "7efe66e76728eb555eab03f2c94779f7" ),
    (2_911_096, "957dad99c1df5728fd0fda142c2ae976" ),
    (1_290_752, "c9ae332b021335f84b117e0f3e0dc0c4" ), # _programming_arxiv\src_2013\work\q_client   2
    (2_847_232, "cbbd8381d595a5f5b15f4e9745a048f0" ), # _programming_arxiv\src_2013\work/q_client(2)
    (10_137_600, "d3c70c355594167db4ebc96fe7eaf36c" ),
    (3_548_698, "cd2f57342d07298cb7fa4d33e8e79f44" ), # _programming_arxiv\_networks\Томсон Л.Разработка WEB-приложений на PHP и MySQL.Диасофт.[RUS,672с.,2003]\Томсон Л.Разработка WEB-приложений на PHP и MySQL.CD
    (3_160_035, "407859fb8a10cc96871b896a3c977135" ),
    (2_998_523, "fb29dc1e6cbb7dfaa631a4ec737c1619" ),
    (1_184_499, "9f934a8a7b9172bcb0eff8cd38662e29" ), # _programming_arxiv\lang_delphy_pascal\research\консоль\Windows.pas
    (765_794, "865282cd4a95ff4bbfa5b91bf9a1a148" ),   # _programming_arxiv\lang_delphy_pascal\lang_pascal\compilators\BPdos\intrfcBIN\TURBO.TPH
    (614_785, "faed282f3f00a30a3c00697d77487a9a" ),   # _programming_arxiv\lang_delphy_pascal\lang_pascal\compilators\BPdos\intrfcBIN\TVISION.TPH
    (2_867_749, "f0c353b8ae07d40e1738c4ac6f608cd0" ), # _programming_arxiv\lang_delphy_pascal
    (3_147_048, "a7e8772270e9d0e5d577da40e6d87679" ), # _физика\3_квантовые вычисления\QC\bloch3dapp
    (2_957_997, "3c7698a8f64a654710ac8d1953e4c3c0" ),
    (658_776, "48d07aaf49aa92813ceb7bbd55cb4f0b" ),   # fonts_cnt.css
    (1_401_573, "251435ee12b6394452cd9126314323d6" ), # _programming\talbot
    (1_301_549, "d3ffe8ea7d60db1d6eeb9643bfd7c5c5" ),
    (1_704_635, "aec595bb8401ffce2bba070fb4ae7eea" ), # _физика\3_квантовые вычисления\QC\bloch3dapp
    (436_089, "c64bcbea7a619d50f3647156c7b98809" ),   # jquery-ui-1.10.2.custom.js
    (92_629, "397754ba49e9e0cf4e7c190da78dda05" ),    # jquery.min.js
    (5_814, "e95a70ca6171d9b12a6188c4549d6e30" ),     # .css
    (2_344, "c505889ab1685a942d82a7905f01ba02" ),     # html_files
    (1_744, "038c464e3f3f2994ce3beda0ae0428bf" ),
    (1_684, "dfa71e97d8fe66456b4ad518c897c024" ),
    (1_669, "6f996d51f9194252b646beea573aa3ae" ),
    (1_665, "8b6c9cccfc650ab202aeef7807df7c21" ),
    (1_613, "0f3cfa31850c1d36a42b46aa47a9c659" ),
    (1_608, "fd392ce9e6a30c3edb86b22e297be001" ),
    (1_600, "bdcc1e25cc8eb91bf2f01b268878c888" ),
    (950, "78ecf9f0b85511d82f0b0d2d28b45cb1" ),
    (796, "7c1c5b35726e05fff4a9d0d96314eb93" ),
    (429, "e67c90a18c89f8d05125c045b2978dcf" ),
    (241, "e73df154bb8d5f13aa4b1a6fbd1057eb" ),
    (43, "fc94fb0c3ed8a8f909dbc7630a0987ff" ),
    (16, "c630b0d2ea3415b07989b13bf3ae7e5a" ),
    (24_446, "07fbdbd97b38c3a1dc0d6315e616459c" ),
    (7_935, "b8173660d6918dcae3757f87b01ba37a" ),
    (2_747, "a7056b3db0858592f1afc18198aa6792" ),
    (2_720, "9b767e01abde4f96805a6b79d85bdb11" ),
    (2_484, "cc53cf96c0614d2f455124fb6e47f564" ),
    (1_260, "d835db4ea8ec417b9ee9a4b1011dda32" ),
    (772, "814865989fbf9cd423aacfca087bc985" ),
    (327, "0813bb506f592f948ac7fca2aebd8375" ),
    (250_378, "649d385019e958f2dc449d760eae1433" ),
    (44_402, "b8e32154eb85161d798a482698a3a15a" ),
    (27_504, "3a972b87d3e06de52394236dc40dac17" ),
    (24_546, "5ce44fe0141243fe8b8954d313487d6d" ),
    (19_569, "4a30bda8c53d7ad79cc1c9fdc20853ac" ),
    (6_857, "38bdf450941e648c78f5c44180d2c6b5" ),
    (5_443, "cc58acc7fee92cf8ad71bcfc865929f1" ),
    (2_055, "981964fc97de33907aa8a792e0ef8a58" ),
    (1_499, "f284a5859416c774e292e322eca10342" ),
    (1_360, "49a185c3d50e6344ed830c35362daf19" ),
    (857, "0adde11c2b3ab22644513a861673a175" ),
    (698, "484f6cd068e3aa8b917b9d3ae94287f7" ),
    (615, "df5b7b300059ac33fd4eac219bf7d7e8" ),
    (441, "c7041b531508730ad1ec6f79af96e016" ),
    (108, "67c58a38087e1a243fd14984f663b520" ),
    (43, "9bb191c6827273aa978cab39a3587950" ),
    (11_314, "06cec7c2dcebe067cc9c48459a44c3b3" ),
    (1_731, "1d64c32dda48159518461bbfd44b890c" ),
    (9_068, "7db84acc1f8d8b7b8f113329b2fcf7a2" ),
    (986, "845f45f83bea424bdd568b572d41bdff" ),
    (371, "35f5dabb3310f74a016e9e742af42784" ),
    (157, "2f644fe56705ccebca516a1d0c3524a8" ),
    (140, "813a7f41542bfe7da4ff1aab9518c0a0" ),
    (138, "2099cba4cfa03a795700942562325baa" ),
    (138, "2b37d2b8ee9c5334b10d43117de6d030" ),
    (136, "722089ca67a897ba0967f4c49b730846" ),
    (136, "72793f10acd9594091f44ed1e66ae4c9" ),
    (6_159, "f98fb2daa8efdabe2dbb4d4b847f6b93" ),
    (931, "1b60df911d0b2709a27f3298db7643e5" ),
    (860, "a0cfa2acd631063a5ce7b8cb6788e8c9" ),
    (43, "ad4b0f606e0f8465bc4c4c170b37e1a3" ),
    (43, "b4491705564909da7f9eaf749dbbfbb1" ),
    (35, "d57de1bd97baaffb63fe2a0602c03cbf" ),
    (475_729, "fb0a6fac20c9ac1f5a53dfe98f8cc03b" ), # html_files 3ds-max
    (352_022, "85e6575536361ab55add63590e7b1b23" ),
    (328_466, "0b9b2c3a50c8d6d7f62c4e7e4c9e624f" ),
    (305_231, "6dc137a38f5f7e99491f97aefdd1408b" ),
    (91_357, "964f6c5c67eecee8b89e87d204746a36" ),
    (91_243, "52ccb9dbfbb5acd479c689bcb0bb821d" ),
    (75_451, "43dc8d745cb88d102d27133d35a28216" ),
    (65_281, "a6bdab3aa4e5b9d6d4d5413e50aa896d" ),
    (36_788, "8c3e276d43f9d154193d5b009944046b" ),
    (27_665, "54b6fef5e581b03d08f91e30c9cc9720" ),
    (26_339, "d96d87c99db49fa4e65afbff685f2b79" ),
    (24_473, "35e5d2aff2f32c533cd834fc483a44d2" ),
    (14_694, "ef6bc825fb7d1a523994895c5ca5c1d8" ),
    (14_428, "cc9baf55a5ae241e0c31f3f48173c4e2" ),
    (12_783, "8e6eba6b41e432bfe59a9f19f0856c8e" ),
    (10_306, "81a7b377bfc310fc3b7999d0132f66e6" ),
    (10_001, "73668d4d0b9150a500bdb461c7e83542" ),
    (8_431, "29c66274d96710aaa1f537f313f208f3" ),
    (7_355, "c103190fc64ae07467f9473d49ca522f" ),
    (5_854, "ad44007be5e39529077868bf0d1e1fa8" ),
    (4_311, "2489f2e86c13a322f93a8dbc92aa6818" ),
    (3_588, "cc87e45f6a4cffe09bf42a633b7d1975" ),
    (3_153, "15fd5d53d04ee079fd64d76f0d8fe6ff" ),
    (3_070, "981cd5c338a169682bf9e7e075a3c4bd" ),
    (2_153, "056a277b9cf88c08ede52224afa2e243" ),
    (2_062, "69b9645b92ebf920e8c8cb1fb667497d" ),
    (1_009, "ba89910181ce7b387e172784ce2f0bad" ),
    (750, "71be19e25403369a5a3d1420c918c3ab" ),
    (307, "d8e2d40de34f54b587630b4bb4f716cf" ),
    (171, "0242a8d9c4f049cd30755a0f92702ed6" ),
    (28, "603dfdbdae47add9d387f3e68ba95db9" ),
    (50_600, "3ab543d2ce4b78e185d96ddca865d6b4" ), # html_files msdn
    (49_210, "ef0721a630043c0915eae6d7c3d42bf8" ),
    (27_035, "c764fa3d6b8a84bd40aa131355215c98" ),
    (14_512, "7dd5010625d10aecaab21b9f1162cbae" ),
    (12_301, "f47b71b086bba5822af542fa7101aa50" ),
    (4_203, "8e010b67ceb176b577f5afa1ceb6f5ad" ),
    (1_399, "b97c6c870866d885285a9916fb11ca37" ),
    (493, "0d0542f5f45cf9fc7273abd11cf5c0ee" ),
    (186, "9166785a2d662ca4c0379dcbb5813e69" ),
    (85, "030c41d9079671d09a62d8e2c1db6973" ),
    (255, "5ea4487e38d947beabed2fabac01985d" ), # html_files ядерка в картинках
    (70, "fa74e5c9eef0c6bcc042b3566e6f8a71" ),
    (70, "9814720c2787382e0f6a5323ee2dd04d" ),
    (70, "0b1d98eea461d796574f291c97874a58" ),
    (67, "3539ce8e0ab777f345ebf7f4ceebcdbc" ),
    (66, "55f94d685494301179aac9ebeda1903d" ),
    (65, "eeddd315c785c1da6c843a8857d877f4" ),
    (64, "0500eb7c6f993f64bb3e99b30859d26f" ),
    (63, "186af931b79af9927ff562bb0064a1d0" ),
    (62, "a0aa32fab42e0d271bdec9f450f36a80" ),
    (61, "6a1fe643d61cca0347ebc1021e295861" ),
    (61, "24183c0285561fffd74ee694ea9c8024" ),
    (61, "3e1c99a277fb93295d749b6b347e3877" ),
    (60, "dec5318b5b5f857a9851c897ba6ce037" ),
    (59, "2757ff4b1f96badb43b79ee4ad10e6e5" ),
    (58, "c3826e3595941d0f2eb7605a2efdb4b0" ),
    (55, "8fc4994fe73c12335f988c6d45963627" ),
    (54, "e5376996a7458b2b53f2301651fad64d" ),
    (53, "21d0dcc329e631f70451355ba787a330" ),
    (896, "9ad22f411108f2aa673ffc7e8bf658a7" ),
    (116, "a12b6e532e4d856d8495f0fa3ed809aa" ),
    (116, "e453dec490462ec6799516646515ef3d" ),
    (76, "bf941b9277599664124688dd04690c30" ),
    (73, "616363f7cf2230203bf297d9b54646fa" ),
    (73, "b4e00279ee5f87d6904063c003f60aa0" ),
    (72, "44cda4626cf69c3b55ccd756fafb53cb" ),
    (69, "08ce127d77bfa71ee2b1bb2bf51144dd" ),
    (63, "aa0f1b642fc3e8830e0fce3c6a8f112d" ),
    (61, "849f4554c3e7d997211d357435d5530f" ),
    (59, "4878626af6b7f618487a73e5ba1d52b0" ),
    (58, "c1abd383cd5df3f72640d9fd5f1422ed" ),
    (58, "af171591a40928b3e7c854578ab21a1c" ),
    (35_147, "d32239bcb673463ab874e80d47fae504" ), # COPYING.GPL
    (0, "d41d8cd98f00b204e9800998ecf8427e" ),      # size==0
    (66_501, "414faddd4014819a28da4e05e96c1dbc" ), # borland
    (18_063, "9ec015b7f53ef492357b02583a4c7061" ),
    (17_355, "0be1b2d5b4c2ad31ed678694a949773f" ),
    (16_677, "a6d500698ba9b68e904392c3b3aa9b8f" ),
    (14_670, "a49bca341f3cb3baab6c8b132aefed6a" ),
    (13_596, "04aa2c7082c5bf66da8d60e1fc214443" ),
    (10_987, "a5340e563dbf2429d6017076b86ceadc" ),
    (12_083, "4b001ca5a391481deebfdb19cf365880" ),
    (8_439, "2f3c50c65dff32d69e2f60c352a670c8" ),
    (8_437, "d09b538e0278d8fd934cfa25b6ea2b7b" ),
    (5_554, "738cda3ae2949ee902f8d55b6021d868" ),
    (5_131, "f53a2b6f38f1098fefe4397b3bd31759" ),
    (2_477, "c85d88caf0a8e6c9fed060fd079604f2" ),
    (2_474, "fd209bbeb1ec7d7a13f26a375dc47bfc" ),
    (2_309, "c2a8df8aff9076fd317796c08090a256" ),
    (2_253, "6550325d56f536ff49db63f15dc2d944" ),
    (878, "fde39708905c7b6ffe7cc137714ead0f" ),   # msvc manifest
    (1_425, "8a24adb3dc806e3f026fc641d43c5b72" ),
    (1_417, "a65e8754601de76b02a3cc2819aab3ef" ),
    (728, "e70025a7266b35ba4b48407fc688fd2c" ),
    (663, "9f97434014566363118e883726f603ff" ),
    (621, "8da3fa81efa85adad78efc1d31537d2a" ),
    (1_429, "fee0e8889be26bc72132a97a64454b29" ),
    (1_413, "dfdbe8f9224050b2637342425f9f1d2a" ),
    (2_048, "e1bf2cb8445d93d66209b6a06a2dc941" ),

    (3_962, "7fa92bb6f9a1d90d96b03937c01d49f9" ), # msvc грамматика с++
    (899, "825bee862b469247c8afc1c0b0adb624" ),   # msvc грамматика с++

    (13_998, "9d5ae975ae461be51dfd41d904c52af2" ),# gnu build scripts
    (10_346, "52cc47fe5f31e22f14f911caaf821429" ),# gnu build scripts
    
    (753, "87490c36908663abfee7896b356fda81" ), # puasson & прак полупроводниковый детектор
    
    (318_956, "e5d2b7f46c4800a32f62ce75676a5710" ), # QC .jar
    
    (4_951, "81005745454846bb79cc3c7c0c57658d" ), # git sample
    (3_610, "517f14b9239689dff8bda3022ebd9004" ),
    (1_642, "01b1688f97f94776baae85d77b06048b" ),
    (1_348, "3c5989301dd4b949dfa1f43738a22819" ),
    (1_239, "7dfe15854212a30f346da5255c1d794b" ),
    (896, "579a3c1e12a1e74a98169175fb913012" ),
    (478, "ce562e08d8098926a3862fc6e7905199" ),
    (424, "054f9ffb8bfe04a599751cc757226dda" ),
    (240, "036208b4a1ab4a235d75c181e685e5a3" ),
    (189, "2b7ea5cee3c49ff53d41e00785eb974c" ),
    (73, "a0a7c3fff21f2aea3cfa1d0316dd816c" ),
    (32, "73a00957034783b7b5c8294c54cd3e12" ),
    (23, "4cf2d64e44205fe628ddd534e1151b58" ),
    (25, "5ab7a4355e4c959b0c5c008f202f51ec" ),
    (34, "b501512a260537c5e52df65d2a034251" ),
    (4_898, "56e45f2bcbc8226d2b4200f7c46371bf" ),
    (3_327, "ecbb0cb5ffb7d773cd5b2407b210cc3b" ),
    (1_638, "e4db8c12ee125a8a085907b757359ef0" ),
    (1_492, "2b5c047bdb474555e1787db32b2d2fc5" ),
    (544, "2ad18ec82c20af7b5926ed9cea6aeedd" ),
    
    (4_996, "54b3964dbdd5e0595cf51a0d837a13d2" ), # mocha.css
    (258_388, "56f1d01ee4bb68d1572cfd60755cf67a" ), # jquery-2.2.0.js
}

In [90]:
ignored_dirs = {
    r'D:\Users\feelus\YandexDisk\_programming\_сети\_networks\Томсон Л.Разработка WEB-приложений на PHP и MySQL.Диасофт.[RUS,672с.,2003]\Томсон Л.Разработка WEB-приложений на PHP и MySQL.CD',
    r'D:\Users\feelus\YandexDisk\_programming_arxiv\src_sample\corba\mico',
    r'D:\Users\feelus\YandexDisk\_programming_arxiv\lang_java\jdk1.7.0_02',
    r'D:\Users\feelus\YandexDisk\_физика\3_квантовые вычисления\QC\_QuantumCirquits\qcs-code',
    r'D:\Users\feelus\YandexDisk\_физика\3_квантовые вычисления\QC\bloch3dapp\osx-x86\jogl\lib',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова\v10_1200a (VIA HD Audio UAA Driver)\VIAHDAud\Present',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова\motherboard_GIGABYTE\drivers\VGA',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова\motherboard_GIGABYTE\drivers\Raid\ITE_RAID',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова\дрова для планшета\wifi\Atheros_Bluetooth_Drivers_8.0.0.214',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова\L210_SCNDRV_3.7.9.1_Win_Home7_Nordic4_East9_TR_EL_RU_UK_AR_CA',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова\motherboard_GIGABYTE\drivers\LAN\Marvell\Other Driver',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_крякнутые\MatLab 701-2004',
    r'D:\Users\feelus\YandexDisk\_programming\_web\javascript_goo',
    r'D:\Users\feelus\YandexDisk\_programming_arxiv\lang_bash\bash_src',
    r'D:\Users\feelus\YandexDisk\_programming\__криптография',
    r'D:\Users\feelus\YandexDisk\_programming\_lang_cpp\lib_glib-2.32.3',
    r'D:\Users\feelus\YandexDisk\_programming\_lang_cpp\lib_QT\Шлее М.Qt4 профессиональное программирование на  C++.БХВ.[RUS,880p.,2007]',
    r'D:\Users\feelus\YandexDisk\_programming\_lang_cpp\lib_QT\Марк Саммерфилд - Qt Профессиональное программирование (High tech) - 2011',
    r'D:\Users\feelus\YandexDisk\_programming\_lang_c\cpp',
    r'D:\Users\feelus\YandexDisk\_programming\form_course',
    r'D:\Users\feelus\YandexDisk\crossingover\crossingover\фото\фото-нумерованное',
    r'D:\Users\feelus\YandexDisk\crossingover\crossingover\фото\poses',
    r'D:\Users\feelus\YandexDisk\программы-установщики\_дрова',
    r'D:\Users\feelus\Repos\muzon\.git',
    r'D:\Users\feelus\Repos\__my_repos\parser\.git',
    r'D:\Users\feelus\Repos\__my_forked\Instantfox\.git',
}

# On One Disk

In [22]:
if 1:
    root = {}
    rootpath_YD_list_list = set()

if 0: # 35s +  7s :   682.337 MB 579.163 MB 31367    24864
    rootpath_YD = r'D:\Users\feelus\YandexDisk'
    root_YD = scan(rootpath_YD)
    root['D:'] = {}
    root['D:']['Users'] = {}
    root['D:']['Users']['feelus'] = {}
    root['D:']['Users']['feelus']['YandexDisk'] = root_YD
    rootpath_YD_list = ['D:','Users','feelus','YandexDisk']
    rootpath_YD_list_list.add(tuple(rootpath_YD_list))
    hash_YD = 'hash_cash_YD.json'

if 1: # 5m 21s + 2m 37s :   11638.450 MB 6810.797 MB 174977    124404
    # 4m 21s + 1m 54s:   10973.443 MB 6669.114 MB 175843    128178
    rootpath_YD = r'D:\\'
    root_YD = scan(rootpath_YD)
    if '$RECYCLE.BIN' in root_YD:
        del root_YD['$RECYCLE.BIN']
    root['D:'] = root_YD
    rootpath_YD_list = ['D:']
    rootpath_YD_list_list.add(tuple(rootpath_YD_list))
    hash_YD = 'hash_cash_D.json'

if 0:# 54s + 47s : 
    rootpath_YD = r'H:\yadisks'
    root_YD = scan(rootpath_YD)
    root['H:'] = {}
    root['H:']['yadisks'] = root_YD
    #root['H:']['yadisks']['2019'] = {}
    #root['H:']['yadisks']['2019']['_programming_arxiv'] = root_YD
    rootpath_YD_list = ['H:','yadisks']#,'2019','_programming_arxiv']
    rootpath_YD_list_list.add(tuple(rootpath_YD_list))
    hash_YD = 'hash_cash.json'
if 1:
    rootpath_YD = r'H:'
    root_YD = scan(rootpath_YD)
    if '$RECYCLE.BIN' in root_YD:
        del root_YD['$RECYCLE.BIN']
    root['H:'] = root_YD
    #root['H:']['yadisks']['2019'] = {}
    #root['H:']['yadisks']['2019']['_programming_arxiv'] = root_YD
    rootpath_YD_list = ['H:']#,'2019','_programming_arxiv']
    rootpath_YD_list_list.add(tuple(rootpath_YD_list))
    hash_YD = 'hash_cash.json'

if 1:
    rootpath_YD = r'I:'
    root_YD = scan(rootpath_YD)
    if '$RECYCLE.BIN' in root_YD:
        del root_YD['$RECYCLE.BIN']
    root['I:'] = root_YD
    rootpath_YD_list = ['I:']
    rootpath_YD_list_list.add(tuple(rootpath_YD_list))
    hash_YD = 'hash_cash.json'

if 1:
    # YD+HYD:  51.3s + 
    
    # H+I:    26m 28s +     3m 50s :  63955.552 MB  40_354.847 MB  406134    254424
    # D+H+I:  23m 46s + 4h 13m 47s : 272892.752 MB 152_887.327 MB 1225489    778638
    root_YD = root
    rootpath_YD_list = []
    hash_YD = 'hash_cash.json'
else:
    if len(rootpath_YD_list_list)==1:
        rootpath_YD_list_list = next(iter(rootpath_YD_list_list))
    else:
        raise BasicException('qwe')

len(root_YD)

HTML(value='')

can't check stat of file: D:\\$RECYCLE.BIN\S-1-5-21-281476240-1351446808-3602091694-1000\$R80B473
can't check stat of file: D:\\$RECYCLE.BIN\S-1-5-21-281476240-1351446808-3602091694-1000\$RHEH575
can't check stat of file: D:\\$RECYCLE.BIN\S-1-5-21-281476240-1351446808-3602091694-1000\$RSMWGH3
this dir is not dir: D:\\Users\feelus\Application Data 16895
this dir is not dir: D:\\Users\feelus\Cookies 16895
this dir is not dir: D:\\Users\feelus\Local Settings 16895
this dir is not dir: D:\\Users\feelus\My Documents 16749
this dir is not dir: D:\\Users\feelus\NetHood 16895
this dir is not dir: D:\\Users\feelus\PrintHood 16895
this dir is not dir: D:\\Users\feelus\Recent 16749
this dir is not dir: D:\\Users\feelus\SendTo 16749
this dir is not dir: D:\\Users\feelus\Start Menu 16749
this dir is not dir: D:\\Users\feelus\Templates 16895
this dir is not dir: D:\\Users\feelus\Главное меню 16749
this dir is not dir: D:\\Users\feelus\Мои документы 16749
this dir is not dir: D:\\Users\feelus\Шаблоны

can't check stat of file: D:\\Users\feelus\cyg-home\.trash\2017-07-05 15-23-04%-cygdrive%-c%-Users%-feelus%-cy%-src%-linux\drivers\gpu\drm\nouveau\nvkm\subdev\i2c\aux.h
can't check stat of file: D:\\Users\feelus\cyg-home\src\linux-cur\drivers\gpu\drm\nouveau\nvkm\subdev\i2c\aux.c
can't check stat of file: D:\\Users\feelus\cyg-home\src\linux-cur\drivers\gpu\drm\nouveau\nvkm\subdev\i2c\aux.h
can't check stat of file: D:\\Users\feelus\cyg-home\src\linux-cur\include\soc\arc\aux.h
can't check stat of file: D:\\Users\feelus\cyg-home\wget-downloads\www.maiclub.ru\cgi-bin\picturmai.pl@1_PARAPLAN_GAGRA_DRUZH007.jpg%3A2003_09_gagra%3A%20%D0%94%D1%80%D1%83%D0%B6%D0%B8%D0%BD%D0%B8%D0%BD%20%D0%90%D0%BB%D0%B5%D0%BA%D1%81%D0%B5%D0%B9,%20%D0%BF%D0%B0%D1%80%D0%B0%D0%BF%D0%BB%D0%B0%D0%BD%20%D1%80%D0%BE%D0%B7%D0%BE
can't check stat of file: D:\\Users\feelus\cyg-home\wget-downloads\www.maiclub.ru\cgi-bin\picturmai.pl@1_PARAPLAN_GAGRA_DRUZH009.jpg%3A2003_09_gagra%3A%20%D0%A1%D1%82%D0%B0%D1%80%D1%82%20%D1%8

this dir is not dir: D:\\Users\feelus\Documents\My Music 16749
this dir is not dir: D:\\Users\feelus\Documents\My Pictures 16749
this dir is not dir: D:\\Users\feelus\Documents\My Videos 16749
this dir is not dir: D:\\Users\feelus\Documents\Мои видеозаписи 16749
this dir is not dir: D:\\Users\feelus\Documents\Мои рисунки 16749
this dir is not dir: D:\\Users\feelus\Documents\Моя музыка 16749
this file is not usual: D:\\Users\feelus\Documents\.RData 33206
this file is not usual: D:\\Users\feelus\Documents\.Rhistory 33206
this file is not usual: D:\\Users\feelus\Documents\desktop.ini 33206
this file is not usual: D:\\Users\feelus\Documents\Doc1.htm 33206
this file is not usual: D:\\Users\feelus\Documents\Track01.mp3 33206
this file is not usual: D:\\Users\feelus\Documents\Track01_1.mp3 33206
this file is not usual: D:\\Users\feelus\Documents\Track02.mp3 33206
this file is not usual: D:\\Users\feelus\Documents\Track02_1.mp3 33206
this file is not usual: D:\\Users\feelus\Documents\Track03.m

can't check stat of file: D:\\Users\feelus\YandexDisk\_programming\_lang_cpp\поиск файлов и msdn _html\_msdn_html\z_windows development\z_getting started\z_learn to program  for windows in c++\z_introduction to windows programming\Windows Coding Conventions (Windows)_files\avatar(1).jpg
can't check stat of file: D:\\Users\feelus\YandexDisk\_programming\_lang_cpp\поиск файлов и msdn _html\_msdn_html\z_windows development\z_getting started\z_learn to program  for windows in c++\z_introduction to windows programming\Windows Coding Conventions (Windows)_files\avatar(2).jpg
can't check stat of file: D:\\Users\feelus\YandexDisk\_programming\_lang_cpp\поиск файлов и msdn _html\_msdn_html\z_windows development\z_getting started\z_learn to program  for windows in c++\z_introduction to windows programming\Windows Coding Conventions (Windows)_files\avatar(3).jpg
can't check stat of file: D:\\Users\feelus\YandexDisk\_programming\_lang_cpp\поиск файлов и msdn _html\_msdn_html\z_windows development\

HTML(value='')

can't check stat of file: H:yadisks\_common\_programming\_lang_cpp\programming\стандарты и справочные руководства\Мюссер Д., Дердж Ж., Сейни А. C++ и STL. Справочное руководство (2-е издание, 2010)\Мюссер Д., Дердж Ж., Сейни А. C++ и STL. Справочное руководство (2-е издание, 2010).djvu
can't check stat of file: H:yadisks\_common\Документы\programming\литература\интернет оффлайн\msdn_html\z_windows development\z_data acces and storage\z_Local File Systems\z_Directory Management\z_About Directory Management\Creating and Deleting Directories (Windows)_files\Combined.css
can't check stat of file: H:yadisks\_common\Документы\programming\литература\интернет оффлайн\msdn_html\z_windows development\z_data acces and storage\z_Local File Systems\z_Directory Management\z_Using Directory Management\Listing the Files in a Directory (Windows)_files\avatar(1).jpg
can't check stat of file: H:yadisks\_common\Документы\programming\литература\интернет оффлайн\msdn_html\z_windows development\z_data acces 

HTML(value='')

can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\29352GalleryImages.SexyAnimeCosplayGirlsDaily_8cearpz8v2nym\LocalState\MarkedUp\Network\Requests\SessionStart\MarkedUp.Infrastructure.Networking.NetworkServiceRequest\635681119430186469_1_41cf8637e369493ab32f87e5c86b3b44
can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\29352GalleryImages.SexyAnimeCosplayGirlsDaily_8cearpz8v2nym\LocalState\MarkedUp\Network\Requests\SessionStart\MarkedUp.Infrastructure.Networking.NetworkServiceRequest\635681143364703554_3_4da8796b58c742bca8683412d2d47aa5
can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\winstore_cw5n1h2txyewy\LocalState\Cache\0\0-DevApps-https∺∯∯next-services.apps.microsoft.com∯search∯6.3.9600-0∯776∯en-US_en-US.en.ru∯c∯RU∯cp∯10012737∯DevApps∯pc∯0∯pt∯x64∯af∯0∯lf∯1∯pn∯1∿developerName=Droid-Veda%20LLP.dat
can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\winstore_cw5n1h2txyewy\LocalState\Cache\0\0-DevApps-https∺∯∯next-services.

can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\winstore_cw5n1h2txyewy\LocalState\Cache\0\0-ProductTileExtendedByProductGuid-https∺∯∯next-services.apps.microsoft.com∯browse∯6.3.9600-0∯788∯en-US_en-US.en.ru∯c∯RU∯cp∯10012737∯Apps∯8ce95333-7678-4d22-a039-90045f9a73c2.dat
can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\winstore_cw5n1h2txyewy\LocalState\Cache\0\0-ProductTileExtendedByProductGuid-https∺∯∯next-services.apps.microsoft.com∯browse∯6.3.9600-0∯788∯en-US_en-US.en.ru∯c∯RU∯cp∯10012737∯Apps∯8d5ae319-09b9-490b-8d00-615aae7735dd.dat
can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\winstore_cw5n1h2txyewy\LocalState\Cache\0\0-ProductTileExtendedByProductGuid-https∺∯∯next-services.apps.microsoft.com∯browse∯6.3.9600-0∯788∯en-US_en-US.en.ru∯c∯RU∯cp∯10012737∯Apps∯8e4e8056-0f2a-42e9-bfdd-677292bf9e4a.dat
can't check stat of file: I:C\Users\feelus\AppData\Local\Packages\winstore_cw5n1h2txyewy\LocalState\Cache\0\0-ProductTileExtendedByProductG

can't check stat of file: I:C\Users\feelus\cy\src\linux\drivers\gpu\drm\nouveau\nvkm\subdev\i2c\aux.c
can't check stat of dir: I:C\Users\feelus\cy\src\minix\external\mit\xorg\lib\xcb-util\aux
can't check stat of file: I:C\Users\feelus\Desktop\данные\Programming stuff_ C++ 11 FAQ от Бьярна Страуструпа_files\-dBKKQzGADDqMBCEf0MyZ0PjnXm2d5RGOSkmpoc_HuH_vRO7iCpZAueee9Ft5Wr9Ks3ZAkEA8YRNh09du0IhmrHp-eBbx0PtKHyXVUj64_x39E0o_IjWdBkKe69jWOCjeZKKIj8ssj91FHjwKw17xCopVz3_8dGh2QOuQQBJLNcjx2Frc
can't check stat of file: I:C\Users\feelus\Desktop\данные\Programming stuff_ C++ 11 FAQ от Бьярна Страуструпа_files\-kqmWgACtFXqMBCEf0MyZ0PjnXm2d5RGOSkmpoc_HuH_vRO7iCpZAueee9Ft5Wr9Ks3ZAkEA8YRNh09du0IhmrHp-eBbx0PtNKA2geoQZ75pr3lTAqSxzBkKe69jWOCjeZKKIj8ssj-FlAqKt84tia2qyF8Umtvfs6t2TP44usDo0E6TS
can't check stat of file: I:C\Users\feelus\Desktop\данные\Programming stuff_ C++ 11 FAQ от Бьярна Страуструпа_files\6BaLfFXAAuPqMBCEf0MyZ0PjnXm2d5RGOSkmpoc_HuH_vRO7iCpZAueee9Ft5Wr9Ks3ZAkEA8YRNh09du0IhmrHp-eBbx0PtWXGqEC1i

3

In [25]:
print('size in memory:',getmemsize(root_YD)/1024/1024,'MB')

root_YD_by_size,dups_YD_by_size = get_diplics_by_size(root_YD)

size in memory: 241.65509796142578 MB
by size:
размер повторений 308846.308 MB
размер лишнего    173053.359 MB
кол-во повторений 1634950
кол-во лишнего    1555823


In [27]:
old_root = load_hashes(hash_YD,rootpath_YD_list_list)

readed hash_cash.json
dict_keys(['qwer', 'H:', 'D:', 'I:'])


In [28]:
calc_hashes(root,old_root,rootpath_YD_list_list,root_YD_by_size,dups_YD_by_size)

HTML(value='')

[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Application Cache\\Cache\\data_3'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\GPUCache\\data_3'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\ShaderCache\\GPUCache\\data_3'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Application Cache\\Cache\\data_2'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\GPUCache\\data_2'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Cache\\data_0'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Application Cache\\Cache\\data_1'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Applicati

In [29]:
unload_hashes(root,hash_YD,rootpath_YD_list_list)

start writing
writed hash_cash.json


In [30]:
dups_YD_by_hash = get_duplics_by_hash(root_YD,root_YD_by_size,dups_YD_by_size)

by hash:
размер всего   291016.797 MB
размер лишнего 154999.755 MB
кол-во повторений 869773
кол-во лишнего    563216


In [61]:
from copy import copy,deepcopy
dups_YD_by_hash_cp = deepcopy(dups_YD_by_hash)
root_cp = deepcopy(root)

ignore_files(dups_YD_by_hash_cp)
ignore_dirs(root_cp,rootpath_YD_list_list,dups_YD_by_hash_cp)

by hash, ignore files:
размер всего   290557.299 MB
размер лишнего 154690.154 MB
кол-во повторений 847057
кол-во лишнего    541447
prefix= []
by hash, ignore dirs:
размер всего   290382.758 MB
размер лишнего 154598.440 MB
кол-во повторений 842561
кол-во лишнего    538662


In [33]:
if type(rootpath_YD_list_list)==set:
    rootpath_YD = ''
count = 0
for s_h,pp in dups_YD_by_hash_cp.items():
    s,h = s_h
    for p in pp:
        if count<100 and is_subpath(p,r'D:\Users\feelus\Repos\muzon'.split('\\')):
          #p[0]=='Users':# and p[1]=='Users':# and p[2]=='Repos':
            count+=1
            ss = str(s)
            ss = ss if len(ss)<=3 else\
                ss[:-3]+'_'+ss[-3:] if len(ss)<=6 else\
                ss[:-6]+'_'+ss[-6:-3]+'_'+ss[-3:]
            print('\t('+ss+', "'+h+'" ),')
            for q in pp:
                if rootpath_YD=='':
                    print((my_path_join_l(list(q))))
                else:
                    #print([rootpath_YD]+list(q))
                    print((my_path_join_l([rootpath_YD]+list(q))))
            break
if count==100: print('...')
else:          print('--- the end ---')
    

	(40_300_358, "87c5392fa98a9f89116323eb8f232b1e" ),
H:\_музыка\_не умею - учить\Леонид Федоров и Крузенштерн Летел и Таял - YouTube.mp4
D:\Users\feelus\Repos\muzon\аккорды\_не умею - учить\Леонид Федоров и Крузенштерн Летел и Таял - YouTube.mp4
	(25_957_787, "d6af162c1579b1e26d2354702204848b" ),
D:\Users\feelus\Repos\muzon\аккорды\_не умею - учить\Сплин - Линия жизни - аккорды и текст аккорды песни для гитары   MuzLandru.mp4
H:\_музыка\2015\Сплин - Линия жизни - аккорды и текст аккорды песни для гитары   MuzLandru.mp4
	(943_482, "016ce56851dbf2b019c6c08e756b122e" ),
H:\_музыка\__ноты\пираты карибского моря\ver.1.pdf
D:\Users\feelus\Repos\muzon\ноты\пираты карибского моря\ver.1.pdf
	(939_735, "21a215c5c592ec3ca808a4e575aca8ec" ),
H:\_музыка\__ноты\пираты карибского моря\ver.2.pdf
D:\Users\feelus\Repos\muzon\ноты\пираты карибского моря\ver.2.pdf
	(259_354, "4f0879627de6f0603fa0e5e0248e37da" ),
H:\_музыка\__ноты\wind if chenge\48-scorpions_wind-of-change-1.jpg
D:\Users\feelus\Repos\muzon\

	(28_578, "7d83cd0c54f3fe43b52f7689bacc54a6" ),
H:\_музыка\__ноты\пираты карибского моря\piraty_karibskogo_morya_-_hes_a_pirate_5.gp5
D:\Users\feelus\Repos\muzon\ноты\пираты карибского моря\piraty_karibskogo_morya_-_hes_a_pirate_5.gp5
	(27_554, "dee0c580d5ff7cca3270a4a0b6b69ce6" ),
D:\Users\feelus\Repos\muzon\ноты\привет с большого бодуна.tif
H:\_музыка\__ноты\привет с большого бодуна.tif
	(27_211, "6c16653831941255222472ec02d2d029" ),
H:\_музыка\__ноты\пираты карибского моря\piraty_karibskogo_morya_-_hes_a_pirate_4.gpx
D:\Users\feelus\Repos\muzon\ноты\пираты карибского моря\piraty_karibskogo_morya_-_hes_a_pirate_4.gpx
	(25_974, "4284e69e1ba09c368b725c4aaacafc6d" ),
D:\Users\feelus\Repos\muzon\ноты\надежда\nadezda.gif
H:\_музыка\__ноты\nadezda.gif
	(24_987, "ee23d65b3b615b916f0317c749151ddf" ),
D:\Users\feelus\Repos\muzon\ноты\Ievan polkka\199.png
H:\_музыка\__ноты\Ievan polkka\199.png
	(22_043, "769b281ca336c03a11659ccd9849d0e4" ),
D:\Users\feelus\Repos\muzon\ноты\Ievan polkka\198.png

# Повторное

In [34]:
raise BaseException(1)

BaseException: 1

In [99]:
def upd_dir(root,dir_upd):
    root_upd = scan(dir_upd)
    dir_upd = dir_upd.split('\\')
    tmp_root = get_subtree(root,dir_upd[:-1])
    tmp_root[dir_upd[-1]] = root_upd

#upd_dir(root,r'H:\_музыка')
upd_dir(root,r'D:\Users\feelus\Repos')

#print('size in memory:',getmemsize(root_YD)/1024/1024,'MB')

root_YD_by_size,dups_YD_by_size = get_diplics_by_size(root_YD)

calculated = calc_hashes(root,old_root,rootpath_YD_list_list,root_YD_by_size,dups_YD_by_size)

dups_YD_by_hash = get_duplics_by_hash(root_YD,root_YD_by_size,dups_YD_by_size)

HTML(value='')

by size:
размер повторений 308629.807 MB
размер лишнего    172938.368 MB
кол-во повторений 1633733
кол-во лишнего    1554643


HTML(value='')

[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Application Cache\\Cache\\data_3'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\GPUCache\\data_3'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\ShaderCache\\GPUCache\\data_3'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Application Cache\\Cache\\data_2'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\GPUCache\\data_2'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Cache\\data_0'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Application Cache\\Cache\\data_1'
[Errno 13] Permission denied: 'D:\\Users\\feelus\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Applicati

In [100]:
if 0:
    del_dir(root,rootpath_YD_list_list,dups_YD_by_hash,
    r'D:\Users\feelus\Repos\__my_repos\Instantfox', 
    r'D:\Users\feelus\Repos\__foreign\Instantfox',
)

In [101]:
len(calculated)

23626

In [96]:
if type(rootpath_YD_list_list)==set:
    rootpath_YD = ''
count = 0
for s_h,pp in dups_YD_by_hash.items():
    s,h = s_h
    for p in pp:
        if count<100 and s_h not in ignored_files and\
          is_subpath(p,r'D:\Users\feelus\Repos\__my_forked'.split('\\')):
          #p[0]=='Users':# and p[1]=='Users':# and p[2]=='Repos':
            lc = 0
            for q in pp:
                for ip in ignored_dirs:
                    #print(q,ip.split('\\'))
                    if is_subpath(q,ip.split('\\')):
                        lc+=1
                        break
            #print('lc=',lc)
            if len(pp)-lc < 2:
                break
            count+=1
            ss = str(s)
            ss = ss if len(ss)<=3 else\
                ss[:-3]+'_'+ss[-3:] if len(ss)<=6 else\
                ss[:-6]+'_'+ss[-6:-3]+'_'+ss[-3:]
            print('\t('+ss+', "'+h+'" ),')
            for q in pp:
                cont = False
                for ip in ignored_dirs:
                    if is_subpath(q,ip):
                        cont = True
                        break
                if cont:
                    continue
                if rootpath_YD=='':
                    print((my_path_join_l(list(q))))
                else:
                    #print([rootpath_YD]+list(q))
                    print((my_path_join_l([rootpath_YD]+list(q))))
            break
if count==100: print('...')
else:          print('--- the end ---')
    

	(308_949, "76bbdad0ad2612c17edb6a7f4c0764ee" ),
D:\Users\feelus\Repos\__foreign\X86-64-Disassembler-JS\DisassembleX86-64.js
D:\Users\feelus\Repos\__my_forked\X86-64-Disassembler-JS\DisassembleX86-64.js
	(23_946, "b53bb23e1db881ae9a7da9b52b1838fe" ),
D:\Users\feelus\cyg-home\src\jjv\lib\jjv.js
D:\Users\feelus\Repos\__my_forked\jjv\lib\jjv.js
I:\homePChome\cy\src\jjv\lib\jjv.js
	(21_643, "0aff83f07c0d3ee98e7b1f5038c5c7bd" ),
D:\Users\feelus\Repos\__foreign\X86-64-Disassembler-JS\Basic Live View.html
D:\Users\feelus\Repos\__my_forked\X86-64-Disassembler-JS\Basic Live View.html
	(12_541, "77a8c99fd11d2bc40edcd4d3011d0a32" ),
I:\homePChome\cy\src\jjv-instance\test\test-mini.js
D:\Users\feelus\cyg-home\src\jjv\test\test-mini.js
I:\homePChome\cy\src\jjv\test\test-mini.js
D:\Users\feelus\cyg-home\src\jjv-instance\test\test-mini.js
D:\Users\feelus\Repos\__my_forked\jjv\test\test-mini.js
	(9_306, "0a51f0f2205a828d75460e590d894f4f" ),
D:\Users\feelus\Repos\__my_forked\jjv\test\fixtures\type.json

I:\homePChome\Downloads\_src\node-v4.2.4\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\properties.json
D:\Users\feelus\AppData\Local\heroku\cli\lib\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\properties.json
I:\homePChome\cy\src\jjv\test\fixtures\properties.json
I:\homePChome\cy\src\node\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\properties.json
	(2_630, "29fe620ec1c63b8b284caa9d2bd1e64e" ),
D:\Users\feelus\Repos\__my_forked\X86-64-Disassembler-JS\README.md
D:\Users\feelus\Repos\__foreign\X86-64-Disassembler-JS\README.md
	(2_613, "4587d491f53de101c3b679990610392d" ),
D:\Users\feelus\Repos\__my_forked\jjv\test\fixtures\uniqueItems.json
D:\Users\feelus\cyg-home\src\node\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\uniqueItems.json


	(1_351, "231f1f5ff03c46084d478ab16a971886" ),
D:\Users\feelus\cyg-home\src\jjv\test\fixtures\bignum.json
D:\Users\feelus\Repos\__my_forked\jjv\test\fixtures\bignum.json
I:\homePChome\cy\src\jjv-instance\test\fixtures\bignum.json
I:\homePChome\cy\src\jjv\test\fixtures\bignum.json
D:\Users\feelus\cyg-home\src\jjv-instance\test\fixtures\bignum.json
	(1_136, "789b756d656ccb37e5308d481b8ee937" ),
D:\Users\feelus\Downloads\_src\node-v4.2.4\tools\eslint\node_modules\is-my-json-valid\test\json-schema-draft4\items.json
D:\Users\feelus\cyg-home\src\node\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\items.json
I:\homePChome\Downloads\_src\node-v4.2.4\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\items.json
D:\Users\feelus\cyg-home\src\jjv\test\fixtures\items.json
D:\Users\feelus\Repos\__my_forked\jjv\test\fixtures\items.json
I:\homePChome\cy\src\node\deps\npm\node_mod

	(582, "1378b9c358e6cce14284d68d3530a1eb" ),
I:\homePChome\cy\src\node\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\pattern.json
D:\Users\feelus\cyg-home\src\jjv-instance\test\fixtures\pattern.json
D:\Users\feelus\cyg-home\src\node\deps\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\pattern.json
I:\homePChome\Downloads\_src\node-v4.2.4\tools\eslint\node_modules\is-my-json-valid\test\json-schema-draft4\pattern.json
D:\Users\feelus\AppData\Roaming\npm\node_modules\node-inspector\node_modules\node-pre-gyp\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\pattern.json
D:\Users\feelus\AppData\Local\heroku\tmp\798193499\heroku\lib\npm\node_modules\request\node_modules\har-validator\node_modules\is-my-json-valid\test\json-schema-draft4\pattern.json
I:\homePChome\Downloads\_src\node-v4.2.4\deps\npm\node_modules\request\

In [138]:
#get_subtree(root,
#            r'D:\Users\feelus\YandexDisk\_programming_arxiv\_разное\_javascript\backup\start_goo_files'\
#            .split('\\'))

In [None]:
#get_subtree(root,
#            r'D:\Users\feelus\YandexDisk\_programming_arxiv\_разное\_javascript\javascript_goo\backup\start_goo_files'\
#            .split('\\'))