Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite of TXLeg.py along with .gitignoring config, csv files #1

Open
wants to merge 9 commits into
base: master
from
@@ -1,2 +1,4 @@
*.pyc
*.swp
*.csv
attended/SL/config.py
@@ -1,64 +1,89 @@
#!/usr/bin/env python

from bs4 import BeautifulSoup
from csv import DictWriter
from config import writePath
import urllib2
from urllib2 import urlopen
from unidecode import unidecode
import os.path
import re
import sys

sys.path.append(
os.path.join(sys.path[0], '../../lib')
)
from govbot.util import multiline_strip

# Scrape one AK legislator's page; returns a (district, party, email) tuple.
# Retries forever on any failure -- no back-off and no bail-out, so a
# permanently broken page loops indefinitely.
def getAKrep(url):
    while True:
        print url  # Python 2 print statement; progress/debug output
        try:
            response = urllib2.urlopen(url, timeout=10)
            # Member details live inside the div with id="fullpage".
            soup = BeautifulSoup(response.read()).find('div', {'id': 'fullpage'})
            # Pull "District: <token>" and "Party: <token>" out of the page
            # text (newlines squashed to spaces so ^.*...$ spans the page).
            district = re.sub(r'^.*District: ([0-9A-Za-z]*).*$', r'\1', soup.get_text().replace('\n', ' '))
            party = re.sub(r'^.*Party: ([0-9A-Za-z]*).*$', r'\1', soup.get_text().replace('\n', ' '))
            email = ''
            # The first mailto: link, if any, carries the member's address.
            tempEmail = soup.find('a', {'href': re.compile('mailto')})
            if tempEmail is not None:
                email = re.sub('[Mm][Aa][Ii][Ll][Tt][Oo]:', '', tempEmail.get('href'))
            return district, party, email
        except Exception:
            # NOTE(review): swallows every exception and retries; a parse bug
            # is indistinguishable from a transient network error here.
            pass
# NOTE(review): this span is a rendered pull-request diff -- two revisions of
# getAKLeg() are interleaved line-by-line, so it is NOT runnable Python as-is.
# Lines are annotated "(new)" for the PR revision and "(old)" for the base
# revision; indentation is reconstructed per revision.
def getAKLeg():  # (new) rewritten entry point; partyDict parameter dropped
    # (new) Fetch both chambers' rosters in one pass.
    # NOTE(review): the lambda ignores its `body` argument and always fetches
    # the House URL, so `senate` would duplicate the House roster -- confirm.
    house, senate = map(
        lambda body: BeautifulSoup(
            urlopen('http://house.legis.state.ak.us/').read()
        ).find(
            'div', {'id': 'tab1-2'}
        ).find(
            'ul', {'class': 'people-holder'}
        ).find(
            'ul', {'class': 'item'}
        ).find_all('li'),
        ('house', 'senate')
    )

    dictList = []  # (new) accumulates one dict per legislator

def getAKLeg(partyDict):  # (old) base-revision entry point
    houseSoup = BeautifulSoup(urllib2.urlopen('http://house.legis.state.ak.us/').read())  # (old)
    senateSoup = BeautifulSoup(urllib2.urlopen('http://senate.legis.state.ak.us/').read())  # (old)
    for body, table in zip(('House', 'Senate'), (house, senate)):  # (new)
        for item in table:  # (new) one <li> per member
            repInfo = {}  # (new)
            repInfo['Name'] = unidecode(  # (new) ASCII-fold the name
                item.find('strong', {'class': 'name'}).string
            ).strip()

    houseTable = houseSoup.find('div', {'id': 'legislators'}).find_all('div', {'class': 'leg_float'})  # (old)
    senateTable = senateSoup.find('div', {'id': 'legislators'}).find_all('div', {'class': 'leg_float'})  # (old)
            link = item.find('a')  # (new)
            repInfo['Website'] = link.get('href')  # (new)

    dictList = []  # (old)
            dl = item.find('dl')  # (new) definition list with member details
            district = re.search(
                r'District:\s*(\w+)', dl.get_text(), re.DOTALL
            ).group(1)
            repInfo['District'] = 'AK State {0} District {1}'.format(
                body, district
            )

            repInfo['Party'] = re.search(
                r'Party:\s*(\w+)', dl.get_text(), re.DOTALL
            ).group(1)

            repInfo['Phone'] = re.search(
                r'Phone:\s*([0-9-]+)', dl.get_text(), re.DOTALL
            ).group(1)

    for item in houseTable:  # (old) House roster loop
        repInfo = {}
        link = item.find('a')
        # (old) hand-rolled Unicode-punctuation/accent normalization chain:
        repInfo['Name'] = link.string.strip().replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(u'\u00f1', 'n').replace(u'\u2018', "'").replace(u'\u2019', "'").replace(u'\u201A', "'").replace(u'\u201B', "'").replace(u'\u2039', "'").replace(u'\u203A', "'").replace(u'\u201C', '"').replace(u'\u201D', '"').replace(u'\u201E', '"').replace(u'\u201F', '"').replace(u'\u00AB', '"').replace(u'\u00BB', '"').replace(u'\u00e0', 'a').replace(u'\u00e1', 'a').replace(u'\u00e8', 'e').replace(u'\u00e9', 'e').replace(u'\u00ec', 'i').replace(u'\u00ed', 'i').replace(u'\u00f2', 'o').replace(u'\u00f3', 'o').replace(u'\u00f9', 'u').replace(u'\u00fa', 'u')
        repInfo['Website'] = link.get('href')
        tempdist, tempparty, repInfo['Email'] = getAKrep(repInfo['Website'])
        repInfo['District'] = 'AK State House District ' + tempdist
        repInfo['Party'] = partyDict[str(tempparty)]
        dictList.append(repInfo)
            repInfo['Email'] = dl.find('a').get('href').replace('mailto:', '')  # (new)

    for item in senateTable:  # (old) Senate roster loop, mirrors House loop
        repInfo = {}
        link = item.find('a')
        repInfo['Name'] = link.string.strip().replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(u'\u00f1', 'n').replace(u'\u2018', "'").replace(u'\u2019', "'").replace(u'\u201A', "'").replace(u'\u201B', "'").replace(u'\u2039', "'").replace(u'\u203A', "'").replace(u'\u201C', '"').replace(u'\u201D', '"').replace(u'\u201E', '"').replace(u'\u201F', '"').replace(u'\u00AB', '"').replace(u'\u00BB', '"').replace(u'\u00e0', 'a').replace(u'\u00e1', 'a').replace(u'\u00e8', 'e').replace(u'\u00e9', 'e').replace(u'\u00ec', 'i').replace(u'\u00ed', 'i').replace(u'\u00f2', 'o').replace(u'\u00f3', 'o').replace(u'\u00f9', 'u').replace(u'\u00fa', 'u')
        repInfo['Website'] = link.get('href')
        tempdist, tempparty, repInfo['Email'] = getAKrep(repInfo['Website'])
        repInfo['District'] = 'AK State Senate District ' + tempdist
        repInfo['Party'] = partyDict[str(tempparty)]
        dictList.append(repInfo)
            # (new) fetch the member's own page for the session address
            member_soup = BeautifulSoup(urlopen(repInfo['Website']).read())
            # (new) address runs from "Session Contact" to the Juneau ZIP
            repInfo['Address'] = multiline_strip(
                re.search(
                    r'Session Contact(.+99801)',
                    member_soup.find_all('div', {'class': 'bioleft'})[1].get_text(),
                    re.DOTALL
                ).group(1)
            )
            print str(repInfo) + '\n'
            dictList.append(repInfo)

    return dictList  # shared closing line of both revisions


# NOTE(review): rendered diff -- the old and new AKLeg __main__ blocks are
# interleaved here; not runnable Python as-is.  "(old)" = base revision,
# "(new)" = PR revision.
if __name__ == '__main__':
    # (old) party-code lookup passed into the old getAKLeg(partyDict):
    partyDict = {'(R)': 'Republican', '(D)': 'Democratic', '(I)': 'Independent', 'R': 'Republican', 'D': 'Democratic', '': 'Unknown', 'I': 'Independent', 'Democrat': 'Democratic', 'Republican': 'Republican', 'Democratic': 'Democratic', 'Independent': 'Independent'}
    dictList = getAKLeg(partyDict)  # (old)
    with open(writePath + 'AKLeg.csv', 'w') as csvFile:  # (old)
        dwObject = DictWriter(csvFile, ['District', 'Name', 'Party', 'Website', 'Email', 'Phone', 'Address'], restval='')  # (old)
    dictList = getAKLeg()  # (new)
    with open(os.path.join(writePath, 'AKLeg.csv'), 'w') as csvFile:  # (new)
        dwObject = DictWriter(
            csvFile,
            [
                'District', 'Name', 'Party', 'Website', 'Email', 'Phone',
                'Address'
            ],
            restval='',
            lineterminator='\n'
        )
        dwObject.writeheader()
        for row in dictList:
            dwObject.writerow(row)
@@ -1,62 +1,185 @@
#!/usr/bin/env python

from bs4 import BeautifulSoup
from csv import DictWriter
from config import writePath
import urllib2
from unidecode import unidecode
from urllib2 import urlopen
import csv
import os.path
import re
import sys

sys.path.append(
os.path.join(sys.path[0], '../../lib')
)
from govbot.util import multiline_strip

# NOTE(review): rendered diff -- the new get_tx_rep() header is interleaved
# with the old getTXRep(); get_tx_rep's body continues at the
# `response = urlopen(url)` fragment further down the page.
def get_tx_rep(url, body):  # (new) body is 'house' or 'senate'

    print 'Fetching ' + url + ' ...'  # (new) Python 2 progress output

def getTXRep(url, partyDict, body):  # (old) NOTE(review): partyDict unused
    # In case of connection failure:
    while True:
        print url
        try:
            response = urllib2.urlopen(url)
            soup = BeautifulSoup(response.read(), 'lxml')
            distSpan = soup.find('span', {'id': 'lblDistrict'})
            # Default every field so pages without a district span still
            # return a 4-tuple (of empty strings).
            district = ''
            name = ''
            phone = ''
            address = ''
            if distSpan is not None:
                district = 'TX State {0} District {1}'.format(body, distSpan.get_text().strip())
                # Strip the "Rep."/"Sen." title prefix from the page <title>,
                # then hand-normalize Unicode punctuation/accents to ASCII:
                name = re.sub(r'^.*(Rep\.|Sen\.)', '', soup.find('title').string.strip()).strip().replace(u'\u00A0', ' ').replace(' ', ' ').replace(' ', ' ').replace(u'\u0144', 'n').replace(u'\u00f1', 'n').replace(u'\u2018', "'").replace(u'\u2019', "'").replace(u'\u201A', "'").replace(u'\u201B', "'").replace(u'\u2039', "'").replace(u'\u203A', "'").replace(u'\u201C', '"').replace(u'\u201D', '"').replace(u'\u201E', '"').replace(u'\u201F', '"').replace(u'\u00AB', '"').replace(u'\u00BB', '"').replace(u'\u00e0', 'a').replace(u'\u00e1', 'a').replace(u'\u00e8', 'e').replace(u'\u00e9', 'e').replace(u'\u00ec', 'i').replace(u'\u00ed', 'i').replace(u'\u00f2', 'o').replace(u'\u00f3', 'o').replace(u'\u00f9', 'u').replace(u'\u00fa', 'u')
                phone = soup.find('span', {'id': 'lblCapitolPhone'}).get_text().strip()
                address = '{0} {1}'.format(soup.find('span', {'id': 'lblCapitolAddress1'}).get_text().strip(), soup.find('span', {'id': 'lblCapitolAddress2'}).get_text().strip())
            return district, name, phone, address
        except Exception:
            # NOTE(review): swallows all errors and retries forever.
            pass


# Scrape both TX chambers' member directories (base revision); returns a
# list of per-member dicts.  partyDict is forwarded to getTXRep, which
# ignores it.
def getTXLeg(partyDict):
    houseSoup = BeautifulSoup(urllib2.urlopen('http://www.capitol.state.tx.us/Members/Members.aspx?Chamber=H').read(), 'lxml')
    senateSoup = BeautifulSoup(urllib2.urlopen('http://www.capitol.state.tx.us/Members/Members.aspx?Chamber=S').read(), 'lxml')
    houseTable = houseSoup.find('table', {'id': 'dataListMembers'}).find_all('td')
    senateTable = senateSoup.find('table', {'id': 'dataListMembers'}).find_all('td')
    dictList = []

    # Table cells without an <a> are layout filler; skip them.
    for item in houseTable:
        repInfo = {}
        link = item.find('a')
        if link is not None:
            repInfo['Website'] = 'http://www.capitol.state.tx.us/Members/' + link.get('href')
            repInfo['District'], repInfo['Name'], repInfo['Phone'], repInfo['Address'] = getTXRep(repInfo['Website'], partyDict, 'House')
            dictList.append(repInfo)

    # Same walk for the Senate directory.
    for item in senateTable:
        repInfo = {}
        link = item.find('a')
        if link is not None:
            repInfo['Website'] = 'http://www.capitol.state.tx.us/Members/' + link.get('href')
            repInfo['District'], repInfo['Name'], repInfo['Phone'], repInfo['Address'] = getTXRep(repInfo['Website'], partyDict, 'Senate')
            dictList.append(repInfo)

    return dictList


# NOTE(review): rendered diff -- the old TXLeg __main__ block is interleaved
# with the tail of the new get_tx_rep(); not runnable Python as-is.
if __name__ == "__main__":  # (old)
    partyDict = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}
    dictList = getTXLeg(partyDict)
    with open(writePath + 'TXLeg.csv', 'w') as csvFile:
        dwObject = DictWriter(csvFile, ['District', 'Name', 'Party', 'Website', 'Phone', 'Address', 'Email', 'Facebook', 'Twitter'], restval='')
        dwObject.writeheader()
        for row in dictList:
            dwObject.writerow(row)
    # (new) remainder of get_tx_rep's retry loop, opened further up:
            response = urlopen(url)
            break
        except:
            # NOTE(review): bare except -- retries forever on any failure.
            continue

    soup = BeautifulSoup(response.read(), 'lxml')

    # (new) dispatch to the chamber-specific parser by body name.
    return {
        'house': get_house_rep,
        'senate': get_senator
    }[body](soup)


# Strip HTML tags, leading and trailing spaces on each line, redundant spacing:
# Strip HTML tags, leading and trailing spaces on each line, redundant spacing:
def multiline_strip(string):
    """Return *string* with HTML tags removed, space/tab runs collapsed to a
    single space, each line trimmed, and newline runs squeezed to one."""
    without_tags = re.sub(r'\<.+?>', '', string)
    single_spaced = re.sub(r'[ \t]+', ' ', without_tags)
    trimmed = re.sub(r'^\s+|\s+$', '', single_spaced, flags=re.MULTILINE)
    return re.sub('[\n\r]+', '\n', trimmed)


def get_house_rep(soup):
    """Parse a TX House member page soup into a dict with District, Name,
    Phone and Address keys (any of which may be None if parsing fails)."""
    member_info = soup.find('div', {'class': 'member-info'})
    info_html = str(member_info)

    district = 'TX State House District %s' % re.search(
        r'District (\d+)', info_html
    ).group(1)

    # TX House member names are in "Last, First" format:
    def rewrite_name(string):
        match = re.search('Rep. (.+?)(?:, (?!Jr.))(.+)', string)
        if match is None:
            return None

        last = match.group(1).strip()
        first = match.group(2).strip()
        return unidecode(first + ' ' + last).strip()

    return {
        'District': district,
        'Name': rewrite_name(member_info.find('h2').get_text()),
        'Phone': re.search(
            r'\([0-9]{3}\)\s[0-9]{3}-[0-9]{4}',
            info_html
        ).group(),
        'Address': multiline_strip(
            re.search(
                r'Capitol Address:(.+?787\d{2})',
                info_html,
                re.DOTALL
            ).group(1)
        )
    }


def get_senator(soup):
    """Parse a TX Senate member page soup into a dict with District, Name,
    Phone and Address keys."""
    memtitle = soup.find('div', {'class': 'memtitle'})

    # Title block reads like "Senator <Name>: District <N>".
    number = re.search(r'District (\d+)', memtitle.string).group(1)
    district = 'TX State Senate District %s' % number

    name = unidecode(
        re.search(r'Senator (.+):', memtitle.string).group(1).strip()
    )

    # Flatten the office-contact cell: replace every HTML tag with a newline.
    memoffice = re.sub(
        r'<.+?>',
        '\n',
        str(soup.find('td', {'class': 'memoffice'}))
    ).strip()

    # Address runs from "The Honorable" to an Austin ZIP (787xx); the phone
    # is the trailing (nnn) nnn-nnnn.
    search = re.search(
        r'(The Honorable.+787\d{2}).*(\(\d{3}\).+\d{3}-\d{4})',
        memoffice,
        re.DOTALL
    )

    # FIX: was `thorough_strip`, which is defined nowhere and raised
    # NameError at runtime; the House parser cleans its address with
    # multiline_strip, so use the same helper here.
    address = multiline_strip(search.group(1))

    phone = search.group(2).strip()

    return {
        'District': district,
        'Name': name,
        'Phone': phone,
        'Address': address
    }


# Start with the state-provided directories of members and then go to each
# member's page:
def get_tx_leg():

base_urls = {
'house': 'http://www.house.state.tx.us',
'senate': 'http://www.senate.state.tx.us/75r/Senate/'
}
tables = {
'house': BeautifulSoup(
urlopen('http://www.house.state.tx.us/members').read(),
'lxml'
).find(
'table', {'cellspacing': '10'}
).find_all('td'),

'senate': BeautifulSoup(
urlopen(
'http://www.senate.state.tx.us/75r/Senate/Members.htm'
).read(),
'lxml'
).find(
'table', {'summary': '3 column layout of List of senators by name'}
).find_all('li')
}

dict_list = []

for body in ('house', 'senate'):
for item in tables[body]:
rep_info = {}
link = item.find('a')

if link is None:
continue

url = base_urls[body] + link.get('href')
rep_info = {'Website': url}
rep_info.update(get_tx_rep(url, body))

# Skip entries with None values:
if len(filter(lambda val: val is None, rep_info.values())) > 0:
continue

print str(rep_info) + '\n'

dict_list.append(rep_info)

return dict_list


if __name__ == '__main__':
    # Scrape both chambers, then dump the rows to a CSV under writePath.
    dict_list = get_tx_leg()
    with open(os.path.join(writePath, 'TXLeg.csv'), 'w') as csv_file:
        # FIX: was `csv = csv.DictWriter(...)`, which shadowed the imported
        # csv module with the writer instance; renamed to `writer`.
        writer = csv.DictWriter(
            csv_file,
            [
                'District', 'Name', 'Party', 'Website', 'Phone', 'Address',
                'Email', 'Facebook', 'Twitter'
            ],
            restval='',
            lineterminator='\n'
        )
        writer.writeheader()
        for row in dict_list:
            writer.writerow(row)
@@ -1,3 +1,5 @@
import os

# Copy this file to config.py and edit the line below.

# Output directory for the generated CSV files (note the trailing slash --
# some callers concatenate filenames directly onto this string).
# NOTE(review): os.getenv('DROPBOX') returns None when the variable is unset,
# so this line raises TypeError in that case -- presumably an intentional
# fail-fast; confirm.
writePath = os.getenv('DROPBOX') + '/noBIP/gbOutput/'
No changes.
@@ -0,0 +1,8 @@
import re

# Strip leading and trailing spaces on each line, redundant spacing:
def multiline_strip(string):
    """Return *string* with space/tab runs collapsed to one space, every
    line trimmed at both ends, and runs of newlines squeezed to one."""
    collapsed = re.sub(r'[ \t]+', ' ', string)
    trimmed = re.sub(r'^\s+|\s+$', '', collapsed, flags=re.MULTILINE)
    return re.sub('[\n\r]+', '\n', trimmed)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.