-
Notifications
You must be signed in to change notification settings - Fork 32
/
prime.py
141 lines (120 loc) · 3.92 KB
/
prime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE-CODE in the project root for license information.
"""
~snider
mppopulate: 2m17
cat: 3m30 376226 73.13 GB
grep: 3m44 376226 73.13 GB
python: 20 376226 73.13 GB
"""
import os
import sys
import time
import multiprocessing
from collections import defaultdict
import optparse
import traceback
import subprocess
MEGABYTE = 2**20  # chunk size used by readfile() when priming file data

# NOTE: optparse is deprecated in favor of argparse, but is kept to preserve
# the existing (options, args) CLI contract.
parser = optparse.OptionParser(
    option_list=[
        optparse.Option('--directory', '-d', help='The directory to walk (default=%default)', default="."),
        optparse.Option('--workers', type='int', help="The number of workers (default=%default)", default=10),
        optparse.Option('--interval', type='int', help='If specified, how often (in seconds) to update with progress', default=0),
    ])
(options, args) = parser.parse_args()
directory = options.directory

# Python-2 print statements are syntax errors under Python 3; the
# single-argument print(...) form below behaves identically on both.
print("Walking %s (using %s workers)" % (directory, options.workers))

# Global accumulators, updated by process_result() in the main process.
Data = {'files': 0,   # number of regular files seen
        'size': 0,    # sum of lstat() st_size values
        'fsize': 0,   # bytes actually read back (the "priming")
        }
# Snapshot directories: never descend into these.
SkipDirectories = ['.zfs', '.snapshot']
workers = options.workers
LastCheck = time.time()  # timestamp of the last progress report
def process_result(result):
    """Fold one worker's directory scan into the global totals.

    Queues each discovered subdirectory as a new async job on the pool
    (recording it in ``Results``), accumulates the file/size counters into
    ``Data``, and prints a throttled progress line when ``--interval`` is set.

    :param result: dict produced by process_directory(), or a falsy value
                   (which is ignored).
    """
    global LastCheck
    if not result:
        return
    # 'subdir' (not 'directory') to avoid shadowing the module-level root.
    for subdir in result.get('dirs', []):
        r = Pool.apply_async(process_directory, args=[subdir])
        Results[r] = subdir
    Data['files'] += result['files']
    Data['size'] += result['size']
    if 'fsize' in result:
        Data['fsize'] += result['fsize']
    # Throttled progress report; bytes() here is the module's formatter.
    if options.interval and (time.time() - LastCheck) > options.interval:
        print("%s primed %s files (%s)" % (time.ctime(), Data['files'], bytes(Data['size'])))
        LastCheck = time.time()
def count(c):
    """Format a count with a metric suffix (k/M/G/T), two decimals.

    Divides by 1000 while the value exceeds 1000, at most once per suffix
    (so a value beyond the 'T' range is divided one final time and still
    reported with the 'T' suffix, matching the original loop's behavior).
    """
    suffixes = ["", "k", "M", "G", "T"]
    divisions = 0
    while divisions < len(suffixes) and c > 1000:
        c /= 1000.0
        divisions += 1
    return "%.2f%s" % (c, suffixes[min(divisions, len(suffixes) - 1)])
def bytes(size, convert=True):
    """Render a byte count with a binary-unit suffix (KB/MB/.../PB).

    NOTE(review): this deliberately shadows the builtin ``bytes`` -- renaming
    would break callers, so the name is kept.
    NOTE(review): with convert=False the value is still divided by 1024 per
    step, just not pre-converted to float or rounded on output; presumably
    intentional, but worth confirming against callers.
    """
    if convert:
        size = float(size)
    unit = ""
    for unit in ["", "KB", "MB", "GB", "TB", "PB"]:
        if size < 1024:
            break
        size /= 1024
    if convert:
        return "%.2f %s" % (size, unit)
    return "%s %s" % (size, unit)
def readfile(incfilename, chunk_size=2**20):
    """Read a file to EOF in fixed-size chunks; return the number of bytes read.

    Opens in binary mode: the original text-mode open counted decoded
    characters, and any non-UTF-8 file raised a decode error that the bare
    except silently swallowed, undercounting the primed data.

    :param incfilename: path of the file to read.
    :param chunk_size: read size per iteration (default 1 MiB, matching the
                       module-level MEGABYTE constant).
    :returns: bytes read before EOF or error; 0 if the file can't be opened.
    """
    total = 0
    try:
        with open(incfilename, 'rb') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total += len(chunk)
    except (IOError, OSError):
        # Best-effort by design: unreadable files count as zero bytes.
        pass
    return total
def process_directory(directory):
    """Scan one directory: stat and read its files, collect subdirectories.

    Runs inside a worker process.  Returns a dict:
      'dirs'  -- subdirectories to queue next (symlinks and entries listed in
                 SkipDirectories are excluded)
      'files' -- number of regular (non-symlink) files seen
      'size'  -- sum of their lstat() st_size values
      'fsize' -- bytes actually read back via readfile() (the priming)
    """
    data = dict(dirs=[], files=0, size=0, fsize=0)
    try:
        for entry in os.listdir(directory):
            try:
                fentry = os.path.join(directory, entry)
                st = os.lstat(fentry)
                if (not os.path.islink(fentry) and os.path.isdir(fentry)
                        and entry not in SkipDirectories):
                    data['dirs'].append(fentry)
                elif (os.path.isfile(fentry) and not os.path.islink(fentry)
                        and os.access(fentry, os.F_OK)):
                    data['files'] += 1
                    data['size'] += st.st_size
                    # Prime the page cache by reading the whole file.
                    data['fsize'] += readfile(fentry)
            except (OSError, IOError):
                # Per-entry best effort: races with deletions/permission
                # changes while walking a live tree are expected.
                pass
    except Exception:
        # Fixed Python-2 "except Exception, e" syntax; 'e' was unused.
        print("fail")
        traceback.print_exc()
    return data
# --- driver: seed the pool with the root directory, then drain results ---
Pool = multiprocessing.Pool(workers)
Results = {}  # AsyncResult -> directory it was dispatched for
r = Pool.apply_async(process_directory, args=[directory])
Results[r] = directory
# Poll until every outstanding directory job has been folded in; new jobs
# are queued by process_result() as subdirectories are discovered.
while Results:
    # Snapshot the items: entries are deleted below (and added inside
    # process_result), and mutating a dict while iterating its view raises
    # RuntimeError under Python 3 (Python 2's .items() copied implicitly).
    for r, d in list(Results.items()):
        if not r.ready():
            continue
        process_result(r.get(timeout=1))
        del Results[r]
    time.sleep(.1)
Pool.close()
Pool.join()
print("Total files primed: %s" % count(Data['files']))
print("Total data primed: %s" % bytes(Data['size']))