-
Notifications
You must be signed in to change notification settings - Fork 100
/
Copy pathmain.py
90 lines (71 loc) · 2.54 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import sys
import os
import hashlib
def chunk_reader(fobj, chunk_size=1024):
while True:
chunk = fobj.read(chunk_size)
if not chunk:
return
yield chunk
def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
hashobj = hash()
file_object = open(filename, 'rb')
if first_chunk_only:
hashobj.update(file_object.read(1024))
else:
for chunk in chunk_reader(file_object):
hashobj.update(chunk)
hashed = hashobj.digest()
file_object.close()
return hashed
def check_for_duplicates(paths, hash=hashlib.sha1):
hashes_by_size = {}
hashes_on_1k = {}
hashes_full = {}
for path in paths:
for dirpath, dirnames, filenames in os.walk(path):
for filename in filenames:
full_path = os.path.join(dirpath, filename)
try:
full_path = os.path.realpath(full_path)
file_size = os.path.getsize(full_path)
except (OSError,):
# not accessible (permissions, etc) - pass on
continue
duplicate = hashes_by_size.get(file_size)
if duplicate:
hashes_by_size[file_size].append(full_path)
else:
hashes_by_size[file_size] = []
hashes_by_size[file_size].append(full_path)
for __, files in hashes_by_size.items():
if len(files) < 2:
continue
for filename in files:
try:
small_hash = get_hash(filename, first_chunk_only=True)
except (OSError,):
continue
duplicate = hashes_on_1k.get(small_hash)
if duplicate:
hashes_on_1k[small_hash].append(filename)
else:
hashes_on_1k[small_hash] = []
hashes_on_1k[small_hash].append(filename)
for __, files in hashes_on_1k.items():
if len(files) < 2:
continue
for filename in files:
try:
full_hash = get_hash(filename, first_chunk_only=False)
except (OSError,):
continue
duplicate = hashes_full.get(full_hash)
if duplicate:
print ("Duplicates are: %s and %s" % (filename, duplicate))
else:
hashes_full[full_hash] = filename
if sys.argv[1:]:
check_for_duplicates(sys.argv[1:])
else:
print("Please pass the paths to check as parameters to the script")