-
Notifications
You must be signed in to change notification settings - Fork 0
/
getdataFromWikipedia.py
48 lines (41 loc) · 1.13 KB
/
getdataFromWikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/env python
# encoding:utf-8
#
# Author: CORDEA
# Created: 2014-10-19
#
import os, sys, commands, urllib2
from HTMLParser import HTMLParser, HTMLParseError
import smtplib
from email.mime.text import MIMEText
class ChParser(HTMLParser):
    """Print the text of titled links found after a row's second ``<td>``.

    The parser tracks whether it is inside a ``<tr>`` and how many ``<td>``
    start tags that row has produced so far.  While that count is exactly 2,
    every ``<a>`` start tag carrying a ``title`` attribute arms the parser,
    and the next text node it receives is printed UTF-8 encoded.

    Attributes:
        flag:     True while inside a ``<tr>`` element.
        count:    number of ``<td>`` start tags seen in the current row.
        sflag:    True when the next text node should be printed.
        br, sentense: initialised but not used by this class; kept so the
                  instance layout stays unchanged.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.br = 0
        self.sentense = ""
        self.flag = False
        self.sflag = False
        self.count = 0

    def handle_starttag(self, tag, attrs):
        """Update row/cell tracking and arm printing on a titled link."""
        attrs = dict(attrs)
        if 'tr' == tag:
            self.flag = True
        # Cells are only counted while inside a row.
        if 'td' == tag and self.flag:
            self.count += 1
        # count stays at 2 from the second <td> until a third one opens.
        if self.count == 2 and 'a' == tag and 'title' in attrs:
            self.sflag = True

    def handle_endtag(self, tag):
        """Reset per-row state on ``</tr>`` and disarm printing on ``</a>``."""
        if 'tr' == tag:
            self.count = 0
            self.flag = False
        elif 'a' == tag:
            # A link without any text (e.g. one wrapping only an image) must
            # not leave the parser armed; otherwise the next unrelated text
            # node would be printed as if it were the link text.
            self.sflag = False

    def handle_data(self, data):
        """Print *data* if a titled link armed the parser, then disarm."""
        if self.sflag:
            # Single-argument parenthesised form: same output as the print
            # statement on Python 2.
            print(data.encode('utf-8'))
            # Only the first text node after the link start tag is emitted.
            self.sflag = False
if __name__ == '__main__':
    # Script entry point: fetch the URL given as the first command-line
    # argument and run its HTML through ChParser, which prints its matches.

    # Imported locally so this block does not depend on a change to the
    # file-level imports.
    from contextlib import closing

    # Fail with a usage message instead of an IndexError traceback when the
    # URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s URL' % sys.argv[0])
    url = sys.argv[1]

    # closing() releases the HTTP response even if read() raises.
    with closing(urllib2.urlopen(url)) as response:
        html = response.read()

    parser = ChParser()
    # The page body is decoded as UTF-8 before parsing; a page in another
    # encoding will raise UnicodeDecodeError here.
    parser.feed(html.decode('utf-8'))
    parser.close()