public
Description: Speed testing for a data munging task
Homepage: http://anyall.org/blog/?p=652
Clone URL: git://github.com/brendano/awkspeed.git
awkspeed / 2num.py
100644 21 lines (20 sloc) 0.469 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import sys
from collections import defaultdict
def renumber(paths, vocab_path="vocab"):
    """Rewrite item and feature names in each input file as integer ids.

    Every line of an input file must hold exactly three
    whitespace-separated fields: ``item feat val``.  For each ``path`` in
    ``paths`` a file named ``path + "n"`` is written containing
    ``item_id feat_id val`` per input line, where

    * item ids count up from 1 in order of first appearance, separately
      for each input path;
    * feature ids count up from 1 in order of first appearance, shared
      across all input paths.

    Each newly seen feature name is written to ``vocab_path`` on its own
    line, so line N of that file names feature id N.

    Raises:
        ValueError: if a line does not split into exactly three fields.
    """
    # path -> {item name -> id}; kept for the whole run so a path that is
    # listed twice reuses the ids it was given the first time.
    item_ids = defaultdict(dict)
    feat_ids = {}

    # `with` guarantees every handle is flushed and closed, including when
    # a malformed line raises part-way through a file.
    with open(vocab_path, 'w') as vocab:
        for path in paths:
            ids = item_ids[path]
            # Open the input first so that a missing input file does not
            # leave an empty output file behind.
            with open(path) as src:
                with open(path + "n", 'w') as out:
                    for line in src:
                        item, feat, val = line.split()
                        if item not in ids:
                            # Next id is always one past the count so far.
                            ids[item] = len(ids) + 1
                        if feat not in feat_ids:
                            feat_ids[feat] = len(feat_ids) + 1
                            vocab.write(feat + "\n")
                        out.write("%d %d %s\n"
                                  % (ids[item], feat_ids[feat], val))


if __name__ == "__main__":
    renumber(sys.argv[1:])