-
Notifications
You must be signed in to change notification settings - Fork 0
/
tagMeCabParseJSONforCount.py
62 lines (56 loc) · 2.14 KB
/
tagMeCabParseJSONforCount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
#encoding:utf-8
#
__Author__ = "Yoshihiro Tanaka"
__date__ = "2014-11-21"
import json, sys, os, commands, unicodedata
from parseWord import *
# Collect the "tag_*" sub-directories of the directory passed as argv[1].
# Result: `dirs`, a list of paths of the form "<argv[1] without trailing />/<entry>".
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <directory containing tag_* directories>\n" % sys.argv[0])
    sys.exit(1)

# os.listdir replaces the former shell-built `ls` command: it is immune to
# spaces / shell metacharacters in the path and raises OSError on a bad path
# instead of silently parsing the error text of `ls` as file names.
baseDir = sys.argv[1].rstrip("/")
dirs = []
for entry in sorted(os.listdir(sys.argv[1])):
    # `ls` never listed hidden entries, so keep skipping them.
    if entry.startswith(".") or "tag_" not in entry:
        continue
    dirname = baseDir + "/" + entry
    if os.path.isdir(dirname):
        dirs.append(dirname)
# For every directory in `dirs`, read each file as JSON lines and count the
# word parts (as returned by parseWord) of the tags of every entry whose tag
# string contains u"アニメ".
#   tarDict  : text following "tag_" in the directory name
#              -> {part: occurrences / total occurrences in that directory}
#   tagnames : every distinct part seen (may repeat across directories);
#              consumed afterwards to build the output columns.
tarDict = {}
tagnames = []
for dirname in dirs:
    # os.listdir instead of a shell-built `ls` command (safe with spaces and
    # shell metacharacters); hidden entries are skipped because `ls` never
    # listed them.
    files = [dirname + "/" + r
             for r in sorted(os.listdir(dirname))
             if not r.startswith(".")]
    tagDict = {}
    for filename in files:
        with open(filename) as f:
            for line in f:
                try:
                    data = json.loads(line)
                except ValueError as e:
                    # Report the malformed line and go on; the exception must
                    # be converted to text before concatenation.
                    sys.stderr.write(str(e) + "\n")
                    continue
                if not isinstance(data, dict):
                    continue
                entries = data.get("values")
                # As before, a record is only used when its first entry
                # carries "tags"; an empty "values" list is now skipped
                # instead of raising IndexError.
                # NOTE(review): assumes "values" is a list of dicts - confirm
                # against the producer of these files.
                if not entries or "tags" not in entries[0]:
                    continue
                for values in entries:
                    tagText = values.get("tags")
                    if not tagText or u"アニメ" not in tagText:
                        continue
                    for tag in tagText.split():
                        for part in parseWord(tag):
                            if part not in tagDict:
                                # Record each part once per directory rather
                                # than once per occurrence.
                                tagnames.append(part)
                            # The first occurrence counts as 1 (it used to be
                            # stored as 0, making every count one short).
                            tagDict[part] = tagDict.get(part, 0) + 1
    # SUM is zero only when tagDict is empty, in which case the comprehension
    # below performs no division.
    SUM = sum(tagDict.values())
    total = float(SUM)
    # Take the label from the directory name itself so that a "tag_" occurring
    # earlier in the base path cannot be picked up by mistake.
    label = os.path.basename(dirname).split("tag_", 1)[1]
    tarDict[label] = {key: count / total for key, count in tagDict.items()}
# Print the result as a tab-separated table on stdout:
#   header row : "word" followed by one column per NFKC-normalized part
#   data rows  : directory label followed by the relative frequency of each
#                part in that directory ("0" when the part does not occur).
# Parts are de-duplicated AFTER normalization so that variants which
# normalize to the same text share a single column, and both columns and
# rows are sorted so the output is reproducible from run to run.
columnNames = sorted(set(unicodedata.normalize('NFKC', r) for r in tagnames))
# Encoded once here; the table is written as UTF-8 byte strings.
tagnames = [name.encode('utf-8') for name in columnNames]
output = ["word"] + tagnames
print("\t".join(output))
for k in sorted(tarDict):
    # Re-key this row by normalized part, adding up the frequencies of
    # variants that collapse to the same column; looking the normalized name
    # up in the un-normalized dictionary would report such parts as 0.
    merged = {}
    for part, freq in tarDict[k].items():
        name = unicodedata.normalize('NFKC', part)
        merged[name] = merged.get(name, 0) + freq
    output = [k] + [str(merged.get(name, 0)) for name in columnNames]
    print("\t".join(output))