# Scrape the following into CSV files. Each one is broken up into multiple tiers – the more you scrape the tougher it is!

https://www.congress.gov/members 

Tier 1: Scrape their name and full profile URL, and additional

Tier 2: Separate their state/party/etc into separate columns

Advanced: Scrape each person's actual data from their personal project

In [11]:
#import what's needed
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [12]:
response = requests.get('https://www.congress.gov/members')
doc = BeautifulSoup(response.text)
print(doc.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <title>
   Members of the U.S. Congress | Congress.gov | Library of Congress
  </title>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="http://www.congress.gov/" name="canonical"/>
  <meta content="http://www.congress.gov/" name="dc.identifier"/>
  <meta content="eng" name="dc.language"/>
  <meta content="Text is government work" name="dc.rights"/>
  <meta content="Members of Congress" name="dc.subject"/>
  <meta content="Legislative Data" name="dc.subject"/>
  <meta content="Congress" name="dc.subject"/>
  <meta content="Members of the U.S. Congress" name="dc.title"/>
  <meta content="legislation" name="dc.type"/>
  <meta content="webpage" name="dc.type"/>
  <meta content="Profiles of U.S. Representatives and Senators that include their legislative activity." name="description"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta conten

In [30]:
#find the highest category that contains all the info I want
doc.find_all("li", class_="expanded")

[<li class="expanded"> <div><span class="visualIndicator">MEMBER</span></div>
     1.
     <span class="result-heading"><a href="/member/james-abdnor/A000009">Senator Abdnor, James</a></span>
 <div class="quick-search-member">
 <div class="member-image"><img alt="Abdnor, James" src="/img/member/a000009_200.jpg"/></div>
 <div class="member-profile member-image-exists">
 <span class="result-item">
 <strong>State:</strong>
 <span>South Dakota</span>
 </span>
 <span class="result-item">
 <strong>Party:</strong>
 <span>Republican</span>
 </span>
 <span class="result-item">
 <strong>Served:</strong>
 <span>
 <ul class="member-served">
 <li>Senate: 1981-1987</li> <li>House: 1973-1981</li> </ul>
 </span>
 </span>
 </div>
 <div class="clear"></div>
 </div>
 </li>,
 <li class="expanded"> <div><span class="visualIndicator">MEMBER</span></div>
     2.
     <span class="result-heading"><a href="/member/neil-abercrombie/A000014">Representative Abercrombie, Neil</a></span>
 <div class="quick-search-m

In [39]:
#because it's a list, do the for loop
for headline in doc.find_all("li", class_="expanded"):
    #name
    print (headline.find("span").text)
    #url
    print (headline.find("a")["href"])
    
    #another for loop inside the span to print each strong and span which are the details
    for span in headline.find_all("span", class_="result-item"):
        print (span.find("strong").text) 
        print (span.find("span").text) 
    print ("----")

MEMBER
/member/james-abdnor/A000009
State:
South Dakota
Party:
Republican
Served:


Senate: 1981-1987 House: 1973-1981 

----
MEMBER
/member/neil-abercrombie/A000014
State:
Hawaii
District:
1
Party:
Democratic
Served:


House: 1985-1987, 1991-2011 

----
MEMBER
/member/james-abourezk/A000017
State:
South Dakota
Party:
Democratic
Served:


Senate: 1973-1979 House: 1971-1973 

----
MEMBER
/member/ralph-abraham/A000374
State:
Louisiana
District:
5
Party:
Republican
Served:


House: 2015-2021 

----
MEMBER
/member/spencer-abraham/A000355
State:
Michigan
Party:
Republican
Served:


Senate: 1995-2001 

----
MEMBER
/member/bella-abzug/A000018
State:
New York
District:
20
Party:
Democratic
Served:


House: 1971-1977 

----
MEMBER
/member/anibal-acevedo-vila/A000359
State:
Puerto Rico
District:
At Large
Party:
Democratic
Served:


House: 2001-2005 

----
MEMBER
/member/gary-ackerman/A000022
State:
New York
District:
5
Party:
Democratic
Served:


House: 1983-2013 

----
MEMBER
/member/alma-adams

In [50]:
#save that info  - the dataframe as a list of dictionary
#complete the url
members = []

for headline in doc.find_all("li", class_="expanded"):
    name = (headline.find("span", class_="result-heading").text)
    url = (headline.find("a")["href"])

    if url.startswith('/'):
        url = "https://www.congress.gov" + url
    
    member = {
            'name' : name,
            'url': url,
        }
        
    for span in headline.find_all("span", class_="result-item"):
        x = (span.find("strong").text) 
        y = (span.find("span").text) 
        
        member[x]=y
        
    
    members.append(member)
    
df = pd.DataFrame(members)
df

Unnamed: 0,name,url,State:,Party:,Served:,District:
0,"Senator Abdnor, James",https://www.congress.gov/member/james-abdnor/A...,South Dakota,Republican,\n\nSenate: 1981-1987 House: 1973-1981 \n,
1,"Representative Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Hawaii,Democratic,"\n\nHouse: 1985-1987, 1991-2011 \n",1
2,"Senator Abourezk, James",https://www.congress.gov/member/james-abourezk...,South Dakota,Democratic,\n\nSenate: 1973-1979 House: 1971-1973 \n,
3,"Representative Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Louisiana,Republican,\n\nHouse: 2015-2021 \n,5
4,"Senator Abraham, Spencer",https://www.congress.gov/member/spencer-abraha...,Michigan,Republican,\n\nSenate: 1995-2001 \n,
...,...,...,...,...,...,...
95,"Senator Barkley, Dean M.",https://www.congress.gov/member/dean-barkley/B...,Minnesota,Independent,\n\nSenate: 2002-2003 \n,
96,"Representative Barletta, Lou",https://www.congress.gov/member/lou-barletta/B...,Pennsylvania,Republican,\n\nHouse: 2011-2019 \n,11
97,"Representative Barlow, Tom",https://www.congress.gov/member/tom-barlow/B00...,Kentucky,Democratic,\n\nHouse: 1993-1995 \n,1
98,"Representative Barnard, Doug, Jr.",https://www.congress.gov/member/doug-barnard/B...,Georgia,Democratic,\n\nHouse: 1977-1993 \n,10


In [52]:
#check the first of the list in the column title
df.name[0]

'Senator Abdnor, James'

In [54]:
#get rid of the spaces in front of and the end of the title
df.name = df.name.str.strip()

In [55]:
df.to_csv("congress_members.csv" , index=False)