### Pull debate data from The American Presidency Project
http://www.presidency.ucsb.edu/index.php

In [2]:
from bs4 import BeautifulSoup
from bs4 import NavigableString
import urllib
import pandas
import re
import os
# Raw string so the backslashes of the Windows path are always taken literally
# (a plain literal breaks on sequences such as \U, \n or \t).
LOCAL_DATA_PATH = r'C:\Users\JoAnna\political_history\data' #path to your local repository

In [2]:
# Transcript pages whose speaker names are marked up with <b> tags.
# Only the pid differs between pages, so the urls are built from the pids.
url_list_b = list(
    'http://www.presidency.ucsb.edu/ws/index.php?pid=' + pid
    for pid in (
        '118971',
        '119038',
        '119012',
        '102322',
        '84526',
        '84482',
        '78691',
        '84382',
        '119039',
    )
)

In [3]:
#define parsing function
def parse_website_b(url):
    """
    Grabs data from website (url), parses it and saves it as a json file.
        Data format: in class_='displaytext', every <p> is one paragraph of
        the debate and the debate speaker is separated from the text with a
        <b/> tag.

    A paragraph without a <b/> tag is assigned the most recent speaker seen
    above it. Paragraphs that come before the first tagged speaker are
    assigned that first speaker. If the page has no <b/> tag in any paragraph
    the speaker of every paragraph is left empty (None).

    Args:
        url: address of the debate page. The digits found in the url are used
            to build the name of the output file.

    Returns:
        None. A json file is saved to LOCAL_DATA_PATH with debate info parsed
        by paragraph:
        title, date, speaker, text

    """
    # close the connection even when reading the page fails
    response = urllib.urlopen(url)
    try:
        fetched = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(fetched, "lxml")

    #parsing the data
    titles = unicode(soup.title.string)
    dates = unicode(soup.find("span", class_="docdate").string)
    paragraphs = soup.find("span", class_="displaytext").findChildren("p")

    #pull text and speaker from html
    text_list = []
    speaker_list = []
    for paragraph in paragraphs:
        # NOTE(review): only the first text node sitting directly inside the
        # <p> is kept; a paragraph without one is stored as the string 'None'.
        text_list.append(unicode(paragraph.find(text=True, recursive=False)))

        # None marks "same speaker as before"; it is filled in below.
        speaker_tag = paragraph.find('b')
        if speaker_tag is None:
            speaker_list.append(None)
        else:
            speaker_list.append(unicode(speaker_tag.get_text()))

    #replace 'None' in speaker list
    # the default keeps next() from raising StopIteration on a page without speakers
    start = next((name for name in speaker_list if name is not None), None)
    for i, element in enumerate(speaker_list):
        if element is None:
            speaker_list[i] = start
        else:
            start = element

    # pandas dataframe (title and date are repeated on every row)
    columns = {'text': text_list, 'speaker': speaker_list, 'title': titles, 'date': dates}
    debates = pandas.DataFrame(columns)

    #export to json
    # NOTE(review): str() of the findall list gives names like "['118971'].json".
    # Kept as is so files written earlier keep matching; confirm before changing.
    base_filename = str(re.findall(r'\d+', url))
    suffix = '.json'
    save_path = os.path.join(LOCAL_DATA_PATH, base_filename + suffix)

    debates.to_json(save_path, orient='index')

In [4]:
#write json objects
# Runs parse_website_b on every url in url_list_b for its side effect (one
# json file per page). The echoed result below is a list of None because the
# parser returns nothing.
# NOTE(review): this relies on map() running eagerly, as it does in Python 2.
map(parse_website_b, url_list_b)

[None, None, None, None, None, None, None, None, None]

In [5]:
# Transcript pages whose speaker names are marked up with <i> tags.
# Only the pid differs between pages, so the urls are built from the pids.
url_list_i = list(
    'http://www.presidency.ucsb.edu/ws/index.php?pid=' + pid
    for pid in (
        '102344',
        '102343',
        '102317',
        '63163',
        '72776',
        '72770',
    )
)

In [6]:
#define parsing function
def parse_website_i(url):
    """
    Grabs data from website (url), parses it and saves it as a json file.
        Data format: in class_='displaytext', every <p> is one paragraph of
        the debate and the debate speaker is separated from the text with an
        <i/> tag.

    A paragraph without an <i/> tag is assigned the most recent speaker seen
    above it. Paragraphs that come before the first tagged speaker are
    assigned that first speaker. If the page has no <i/> tag in any paragraph
    the speaker of every paragraph is left empty (None).

    Args:
        url: address of the debate page. The digits found in the url are used
            to build the name of the output file.

    Returns:
        None. A json file is saved to LOCAL_DATA_PATH with debate info parsed
        by paragraph:
        title, date, speaker, text

    """
    # close the connection even when reading the page fails
    response = urllib.urlopen(url)
    try:
        fetched = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(fetched, "lxml")

    #parsing the data
    titles = unicode(soup.title.string)
    dates = unicode(soup.find("span", class_="docdate").string)
    paragraphs = soup.find("span", class_="displaytext").findChildren("p")

    #pull text and speaker from html
    text_list = []
    speaker_list = []
    for paragraph in paragraphs:
        # NOTE(review): only the first text node sitting directly inside the
        # <p> is kept; a paragraph without one is stored as the string 'None'.
        text_list.append(unicode(paragraph.find(text=True, recursive=False)))

        # None marks "same speaker as before"; it is filled in below.
        speaker_tag = paragraph.find('i')
        if speaker_tag is None:
            speaker_list.append(None)
        else:
            speaker_list.append(unicode(speaker_tag.get_text()))

    #replace 'None' in speaker list
    # the default keeps next() from raising StopIteration on a page without speakers
    start = next((name for name in speaker_list if name is not None), None)
    for i, element in enumerate(speaker_list):
        if element is None:
            speaker_list[i] = start
        else:
            start = element

    # pandas dataframe (title and date are repeated on every row)
    columns = {'text': text_list, 'speaker': speaker_list, 'title': titles, 'date': dates}
    debates = pandas.DataFrame(columns)

    #export to json
    # NOTE(review): str() of the findall list gives names like "['102344'].json".
    # Kept as is so files written earlier keep matching; confirm before changing.
    base_filename = str(re.findall(r'\d+', url))
    suffix = '.json'
    save_path = os.path.join(LOCAL_DATA_PATH, base_filename + suffix)
    debates.to_json(save_path, orient='index')

In [7]:
#write json objects
# Runs parse_website_i on every url in url_list_i for its side effect (one
# json file per page). The echoed result below is a list of None because the
# parser returns nothing.
# NOTE(review): this relies on map() running eagerly, as it does in Python 2.
map(parse_website_i, url_list_i)

[None, None, None, None, None, None]

In [5]:
# Transcript pages where the speaker name is not wrapped in any tag.
# Only the pid differs between pages, so the urls are built from the pids.
url_list_none = list(
    'http://www.presidency.ucsb.edu/ws/index.php?pid=' + pid
    for pid in (
        '29428',
        '29420',
        '29419',
        '29418',
        '29421',
    )
)

In [16]:
#define parsing function
def parse_website_none(url):
    """
    Grabs data from website (url), parses it and saves it as a json file.
        Data format: in class_='displaytext', every <p> is one paragraph of
        the debate and the debate speaker is not separated from the text.
            uses re to grab speaker info from text

    The speaker is the first piece of the paragraph text that matches a run
    of capital letters and spaces followed by a colon (plus the one character
    after the colon, if any). It is stored exactly as matched. A paragraph
    without such a match is assigned the most recent speaker seen above it.
    Paragraphs that come before the first match are assigned that first
    speaker. If nothing matches on the whole page the speaker of every
    paragraph is left empty (None).

    Args:
        url: address of the debate page. The digits found in the url are used
            to build the name of the output file.

    Returns:
        None. A json file is saved to LOCAL_DATA_PATH with debate info parsed
        by paragraph:
        title, date, speaker, text

    """
    # close the connection even when reading the page fails
    response = urllib.urlopen(url)
    try:
        fetched = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(fetched, "lxml")

    #parsing the data
    titles = unicode(soup.title.string)
    dates = unicode(soup.find("span", class_="displaytext").string) if False else unicode(soup.find("span", class_="docdate").string)
    paragraphs = soup.find("span", class_="displaytext").findChildren("p")

    #pull text and speaker from html
    regex = re.compile('[A-Z ]+:.?')
    text_list = []
    speaker_list = []
    for paragraph in paragraphs:
        # NOTE(review): only the first text node sitting directly inside the
        # <p> is kept; a paragraph without one is stored as the string 'None'.
        text = unicode(paragraph.find(text=True, recursive=False))
        text_list.append(text)

        # Keep the first match itself (what findall(...)[0] would give);
        # None marks "same speaker as before" and is filled in below.
        found = regex.search(text)
        if found is None:
            speaker_list.append(None)
        else:
            speaker_list.append(found.group(0))

    #replace 'None' in speaker list
    # the default keeps next() from raising StopIteration on a page without speakers
    start = next((name for name in speaker_list if name is not None), None)
    for i, element in enumerate(speaker_list):
        if element is None:
            speaker_list[i] = start
        else:
            start = element

    # pandas dataframe (title and date are repeated on every row)
    columns = {'text': text_list, 'speaker': speaker_list, 'title': titles, 'date': dates}
    debates = pandas.DataFrame(columns)

    #export to json
    # NOTE(review): str() of the findall list gives names like "['29428'].json".
    # Kept as is so files written earlier keep matching; confirm before changing.
    directory_name = LOCAL_DATA_PATH
    base_filename = str(re.findall(r'\d+', url))
    suffix = '.json'
    save_path = os.path.join(directory_name, base_filename + suffix)
    debates.to_json(save_path, orient='index')

In [17]:
#write json objects
# Runs parse_website_none on every url in url_list_none for its side effect
# (one json file per page). The echoed result below is a list of None because
# the parser returns nothing.
# NOTE(review): this relies on map() running eagerly, as it does in Python 2.
map(parse_website_none, url_list_none)

[None, None, None, None, None]