# Finding duplicate files and removing useless files

Strongly based on [this code](https://www.pythoncentral.io/finding-duplicate-files-with-python/).

Be really careful with this script, in particular with the `clean()` function, which should be modified by the user to remove files with specific extensions.

In [6]:
#%tb

import os
import sys
import hashlib
from pathlib import Path

def hashfile(path,blocksize=65536):
    """Return the MD5 hex digest of the file at ``path``.

    The file is read in binary mode in chunks of ``blocksize`` bytes, so
    large files are never loaded into memory at once. The digest is used
    only to compare file contents, not for anything security related.

    Args:
        path: Path of the file to hash.
        blocksize: Number of bytes requested per read.

    Returns:
        The hexadecimal MD5 digest of the file content, as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    # The context manager guarantees the handle is closed even when a read
    # fails part-way through the file.
    with open(path,'rb') as afile:
        # read() returns b'' at end of file, which stops the iteration.
        for buf in iter(lambda: afile.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()

def clean(filename):
    """Delete ``filename`` if its extension is on the disposable list.

    The extension comparison is case-sensitive. Edit the list below to
    control which kinds of files get removed.

    Returns:
        1 when the file was removed, None when it was left untouched.
    """
    disposable_exts = ['.bak','.aux','.pyc','.old','.OLD']
    _, extension = os.path.splitext(filename)
    if extension not in disposable_exts:
        # Nothing to do: keep the file and signal it with None.
        return
    print('!!!!!!  removing %s !!!!!' % filename)
    os.remove(filename)
    return 1

def findDup(parentFolder):
    """Scan a directory tree and group files by the MD5 hash of their content.

    Every file found is first passed to ``clean()``, which may delete it;
    deleted files are not hashed. Files of 10 bytes or fewer are ignored.
    Sub-directories whose name is listed in ``excludeDirs`` are not entered
    at all, so nothing inside them is cleaned or hashed.

    Args:
        parentFolder: Root directory of the scan.

    Returns:
        A dict mapping each MD5 hex digest to the list of paths whose
        content has that digest. Entries with more than one path are
        duplicates.
    """
    dups = {}
    # Must be a real tuple: ('.git') is just the string '.git', which turns
    # the membership test into a substring check.
    excludeDirs = ('.git',)
    for dirName, subdirs, fileList in os.walk(parentFolder):
        # Prune excluded directories in place so os.walk never descends
        # into them (this works because the walk is top-down).
        subdirs[:] = [d for d in subdirs if d not in excludeDirs]
        print ("Scanning %s..." % dirName)
        for filename in fileList:
            # First, get the path to the file
            path = os.path.join(dirName,filename)
            if clean(path) == 1:
                # The file was just deleted, so do not count it below.
                continue
            if Path(path).stat().st_size <= 10:
                # Tiny files are skipped.
                continue
            # Group the path under the MD5 hash of its content.
            dups.setdefault(hashfile(path), []).append(path)
    print('search done')
    return dups

def printResults(dict1):
    """Print the groups of duplicated files as a commented removal script.

    Args:
        dict1: Mapping whose values are lists of paths; only lists with
            more than one entry are reported.

    For each group, the first path is printed as a ``#`` comment (assumed
    to be the copy to keep -- check it!) and every other path is printed
    as an ``rm`` command. Nothing is deleted by this function.
    """
    groups = [paths for paths in dict1.values() if len(paths) > 1]
    if not groups:
        print('No duplicates found')
        return
    print('# Duplicates Found:')
    print('# The content of these files is duplicated')
    print('# ----------------------------------------')
    for paths in groups:
        for position, path in enumerate(paths):
            # The first entry is the one kept; the rest become rm lines.
            template = '# \'%s\'' if position == 0 else 'rm \'%s\''
            print(template % path)
        print('# ------------')

# Scan the bibliography folder under the user's home directory and print
# the duplicate report.
biblio_root = os.path.expanduser('~')+r'/OneDrive/biblio'
dictionary = findDup(biblio_root)
printResults(dictionary)

Scanning /home/jordivilla/OneDrive/biblio...
Scanning /home/jordivilla/OneDrive/biblio/Molecular Neurodegeneration...
Scanning /home/jordivilla/OneDrive/biblio/Molecular Neurodegeneration/2022...
Scanning /home/jordivilla/OneDrive/biblio/Genome Medicine...
Scanning /home/jordivilla/OneDrive/biblio/Genome Medicine/2020...
Scanning /home/jordivilla/OneDrive/biblio/Current Medicinal Chemistry...
Scanning /home/jordivilla/OneDrive/biblio/Current Medicinal Chemistry/2020...
Scanning /home/jordivilla/OneDrive/biblio/Current Medicinal Chemistry/2004...
Scanning /home/jordivilla/OneDrive/biblio/npj Genomic Medicine...
Scanning /home/jordivilla/OneDrive/biblio/npj Genomic Medicine/2022...
Scanning /home/jordivilla/OneDrive/biblio/Cancers...
Scanning /home/jordivilla/OneDrive/biblio/Cancers/2022...
Scanning /home/jordivilla/OneDrive/biblio/Chemical Society Reviews...
Scanning /home/jordivilla/OneDrive/biblio/Chemical Society Reviews/2024...
Scanning /home/jordivilla/OneDrive/biblio/Biotechnology