forked from timwoj/ctrprogress
-
Notifications
You must be signed in to change notification settings - Fork 0
/
buildjson.py
56 lines (45 loc) · 1.7 KB
/
buildjson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/local/bin/python
# This script takes exported HTML from the CTR master raid group list maintained
# by Knate, parses all of the groups, and outputs a block of JSON suitable for
# loading into the datastore on the CTRRanks site.
from lxml import html
import json
import os
# Values of column 4 that mean the toon name is really in an earlier column:
# 'Aerie Peak' -> the name is in column 3; a role label -> it is in column 2.
# NOTE(review): presumably these are the realm and role columns of rows that
# are laid out differently from the rest -- confirm against a current export.
_NAME_IN_COLUMN_3 = 'Aerie Peak'
_NAME_IN_COLUMN_2 = ('Tank', 'Heals', 'Heals/DPS', 'DPS', 'DPS/Tank',
                     'Bench/Alt')


def _toon_from_cells(cells):
    """Return the toon name found in one row's cell texts, or None.

    cells is the list of text values of a <tr>'s children.  None is returned
    for rows with fewer than five cells and for rows whose name cell is empty.
    """
    if len(cells) < 5:
        # Short row: there is no column 4 to inspect, so there is no toon.
        return None
    toon = cells[4]
    if toon == _NAME_IN_COLUMN_3:
        toon = cells[3]
    elif toon in _NAME_IN_COLUMN_2:
        toon = cells[2]
    return toon or None


def _build_group(rows):
    """Build one group's dict from the <tr> elements of its page.

    The result always has a 'toons' list.  'name' is taken from column 2 of
    the row at index 1 and is absent when that row is missing or too short.
    The row at index 0 is never read.
    """
    group = {'toons': []}
    for index, tr in enumerate(rows):
        if index == 0:
            continue
        # break down each <tr> row into the individual <td> children and
        # collect the text of each one of them.
        cells = [cell.text_content() for cell in tr.getchildren()]
        if index == 1:
            # there are a few groups that are poorly formed in the HTML data
            # and should just be skipped for simplicity
            if len(cells) < 3:
                break
            group['name'] = cells[2]
        toon = _toon_from_cells(cells)
        if toon:
            group['toons'].append(toon)
    return group


# One dict per group that has at least one toon; printed as JSON at the end.
jsontext = []
for filename in os.listdir('.'):
    if not filename.endswith('html'):
        continue
    # Read raw bytes so the parser receives the same data on Python 2 and 3,
    # and close the file promptly even when the page is skipped below.
    with open(filename, 'rb') as handle:
        text = handle.read()
    # Skip disbanded groups and pages that are not group sheets at all.
    if b'Disbanded' in text or b'Team Information' not in text:
        continue
    group = _build_group(html.fromstring(text).xpath("//tbody/tr"))
    if group['toons']:
        jsontext.append(group)

# Names are kept as text: json.dumps escapes non-ASCII characters itself, and
# byte strings cannot be serialized on Python 3.
print(json.dumps(jsontext))