## XML and HTML: Web Scraping

In [0]:
import lxml
import bs4
import html5lib

In [0]:
import numpy as np
import pandas as pd

In [0]:
from google.colab import files

# Open Colab's file-upload widget and keep the returned value in `uploaded`.
# NOTE(review): the recorded output shows this upload was saved as
# 'fdic_failed_bank_list (2).html', whereas the following cell reads
# 'fdic_failed_bank_list.html' -- confirm the intended file is being read.
uploaded = files.upload()

Saving fdic_failed_bank_list.html to fdic_failed_bank_list (2).html


In [0]:
# Parse the HTML file into a list of DataFrames (inspected in the next cells).
tables = pd.read_html('fdic_failed_bank_list.html')

In [0]:
# Number of tables found in the page (1 in the recorded run).
len(tables)

1

In [0]:
# Take the first parsed table as the failed-bank list.
failures = tables[0]

In [0]:
# Preview the first five rows.
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [0]:
# Convert the 'Closing Date' strings (e.g. "September 23, 2016") to datetimes.
close_timestamps = pd.to_datetime(failures['Closing Date'])

In [0]:
# Count bank closings per calendar year, most frequent year first.
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2015      8
2016      5
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, dtype: int64

## Parsing XML with lxml.objectify

In [0]:
# Upload the XML file through Colab's upload widget.
# NOTE(review): the recorded output shows the file was saved as
# 'Performance_MNR (1).xml', while a later cell parses 'Performance_MNR.xml'
# -- confirm the intended file is being read.
uploaded = files.upload()

Saving Performance_MNR.xml to Performance_MNR (1).xml


In [0]:
from lxml import objectify

In [0]:
# Path of the XML file to parse.
path = 'Performance_MNR.xml'

# Parse inside a `with` block so the file handle is closed as soon as lxml has
# consumed it; the previous bare open() call left the handle open.
with open(path) as xml_file:
  parsed = objectify.parse(xml_file)

# Root element of the parsed document; its INDICATOR children are read below.
root = parsed.getroot()

In [0]:
# Accumulates one dict per <INDICATOR> element; turned into a DataFrame below.
data = []

# Child tags that are left out of each record.
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
  # iterchildren() replaces the deprecated getchildren(). Iterating `elt`
  # directly is not equivalent: with lxml.objectify that walks same-tag
  # siblings (as the outer loop relies on), not the element's children.
  el_data = {child.tag: child.pyval
             for child in elt.iterchildren()
             if child.tag not in skip_fields}
  data.append(el_data)

In [0]:
# Build a DataFrame from the list of per-indicator dicts; keys become columns.
perf = pd.DataFrame(data)

# Preview the first five rows.
perf.head()

Unnamed: 0,AGENCY_NAME,CATEGORY,DESCRIPTION,FREQUENCY,INDICATOR_NAME,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
1,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,95.0,95,2,2008,96.0,95
2,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,3,2008,96.3,95
3,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,98.3,95,4,2008,96.8,95
4,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,95.8,95,5,2008,96.6,95


In [0]:
from io import StringIO

In [0]:
# Minimal one-element document used to demonstrate attribute and text access.
tag = '<a href="http://www.google.com">Google</a>'

# Wrap the string in a file-like buffer for objectify.parse(), then take the
# root element. Note that this rebinds `root`, replacing the earlier XML root.
tag_buffer = StringIO(tag)
root = objectify.parse(tag_buffer).getroot()

In [0]:
# Display the parsed <a> element.
root

<Element a at 0x7f927323bf08>

In [0]:
# Read the element's href attribute.
root.get('href')

'http://www.google.com'

In [0]:
# Read the element's text content (the link label).
root.text

'Google'

# 6.2 Binary Data Formats

In [0]:
# Upload the CSV file through Colab's upload widget.
# NOTE(review): the recorded output shows the file was saved as 'ex1 (2).csv',
# while the next cell reads 'ex1.csv' -- confirm the intended file is being read.
uploaded = files.upload()

Saving ex1.csv to ex1 (2).csv


In [0]:
# Load the CSV into a DataFrame; the first line supplies the column names.
frame = pd.read_csv('ex1.csv')

# Display the (three-row) frame.
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [0]:
# Serialize the frame to the file 'frame_pickle' in pickle format.
frame.to_pickle('frame_pickle')

In [0]:
# Load the pickled frame back. Unpickling can execute arbitrary code, so only
# read pickle files that come from a trusted source.
pd.read_pickle('frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## Using HDF5 Format

In [0]:
# Single column 'a' holding 100 draws from np.random.randn.
# NOTE(review): no random seed is set, so the values differ on every run.
frame = pd.DataFrame({'a': np.random.randn(100)})

In [0]:
# Open the HDF5 file 'mydata.h5' as a dict-like store; it is closed explicitly
# with store.close() a few cells further down.
store = pd.HDFStore('mydata.h5')

In [0]:
# Write the whole DataFrame under the key 'obj1'.
store['obj1'] = frame

In [0]:
# Write just column 'a' under a separate key.
store['obj1_col'] = frame['a']

In [0]:
# Display the store's summary repr (class and file path).
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [0]:
# Read the stored DataFrame back by key.
store['obj1']

Unnamed: 0,a
0,1.484834
1,-1.001247
2,-1.354043
3,1.151093
4,-0.532609
5,-1.015412
6,-1.908856
7,1.703475
8,0.464229
9,1.271375


In [0]:
# Store the frame again under 'obj2' using the 'table' format, which is the
# format that supports the where-query issued in the next cell.
store.put('obj2', frame, format = 'table')

In [0]:
# Select only the rows whose index lies between 10 and 15 inclusive.
store.select('obj2', where = ['index >= 10 and index <= 15'])

Unnamed: 0,a
10,-0.179128
11,-0.177409
12,-0.4264
13,-0.180413
14,0.420256
15,-0.624921


In [0]:
# Close the HDF5 store opened above.
store.close()

In [0]:
# Write the frame to 'mydata.h5' under the key 'obj3' in 'table' format without
# opening an HDFStore by hand. `key` is passed by keyword because pandas 2.1+
# deprecates passing anything but the path positionally to to_hdf().
frame.to_hdf('mydata.h5', key = 'obj3', format = 'table')

In [0]:
# Read back only the rows with index below 5 from the 'obj3' table.
pd.read_hdf('mydata.h5', 'obj3', where = ['index < 5'])

Unnamed: 0,a
0,1.484834
1,-1.001247
2,-1.354043
3,1.151093
4,-0.532609


## Reading Microsoft Excel Files

In [0]:
# Upload the Excel workbook through Colab's upload widget.
# NOTE(review): the recorded output shows the file was saved as 'ex1 (2).xlsx',
# while the next cells read 'ex1.xlsx' -- confirm the intended file is being read.
uploaded = files.upload()

Saving ex1.xlsx to ex1 (2).xlsx


In [0]:
# Open the workbook once so it can be handed to read_excel in the next cell.
# NOTE(review): this ExcelFile is never closed in the cells shown here.
xlsx = pd.ExcelFile('ex1.xlsx')

In [0]:
# Read the 'Sheet1' worksheet from the already-opened workbook.
pd.read_excel(xlsx, 'Sheet1')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [0]:
# Same read, but passing the path directly instead of an ExcelFile object.
# This rebinds `frame`, which is what gets written back out below.
frame = pd.read_excel('ex1.xlsx', 'Sheet1')

# Display the loaded frame.
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [0]:
# Create a writer targeting 'ex2.xlsx'; the sheet is added and the file is
# finalized in the following two cells.
writer = pd.ExcelWriter('ex2.xlsx')

In [0]:
# Write the frame to the 'Sheet1' worksheet of the open writer. sheet_name is
# passed by keyword because pandas 2.1+ deprecates passing it positionally.
frame.to_excel(writer, sheet_name = 'Sheet1')

In [0]:
# Finalize and write 'ex2.xlsx'. ExcelWriter.save() was deprecated in pandas 1.5
# and removed in 2.0; close() saves the workbook on both old and new versions.
writer.close()

In [0]:
# Shortcut: write straight to a path without creating an ExcelWriter.
# This targets the same 'ex2.xlsx' file that the writer above produced.
frame.to_excel('ex2.xlsx')

# 6.3 Interacting with Web APIs

In [0]:
import requests

In [0]:
# GitHub REST endpoint for the issues of the pandas-dev/pandas repository.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

In [0]:
# Fetch the issue list. A timeout keeps the cell from hanging indefinitely on a
# stalled connection (requests has no default timeout).
resp = requests.get(url, timeout = 10)

# Fail here on an HTTP error status instead of letting a non-list error payload
# reach the cells below that index the decoded JSON.
resp.raise_for_status()

# Display the response object (shows the status code).
resp

<Response [200]>

In [0]:
# Decode the JSON response body into Python objects.
# NOTE(review): this rebinds `data`, which earlier held the XML records.
data = resp.json()

In [0]:
# Title of the first issue in the response.
data[0]['title']

'Grouped dataframe "name" attribute overrides column access / not well documented'

In [0]:
# Keep only these four fields from each issue record, in this column order.
issue_columns = ['number', 'title', 'labels', 'state']
issues = pd.DataFrame(data, columns = issue_columns)

# Display the resulting table of issues.
issues

Unnamed: 0,number,title,labels,state
0,25457,"Grouped dataframe ""name"" attribute overrides c...",[],open
1,25456,Support tuples in iloc,[],open
2,25455,CI: add __init__.py to isort skip list,"[{'id': 48070600, 'node_id': 'MDU6TGFiZWw0ODA3...",open
3,25454,Dataframe agg method passing as list,[],open
4,25453,read_excel na_filter undocumented?,[],open
5,25452,STY: use pytest.raises context manager (tests/...,[],open
6,25451,dropna(inplace=True) doesn't work when loading...,[],open
7,25449,read_excel throws ValueError: cannot specify u...,"[{'id': 307649777, 'node_id': 'MDU6TGFiZWwzMDc...",open
8,25448,Serialize/deserialize a Categorical whose valu...,[],open
9,25447,STY: use pytest.raises context manager (indexes),"[{'id': 48070600, 'node_id': 'MDU6TGFiZWw0ODA3...",open


# 6.4 Interacting with Databases

In [0]:
import sqlite3

In [0]:
# DDL for a four-column demo table: two VARCHAR(20) columns (a, b), a REAL (c)
# and an INTEGER (d).
query = """
  CREATE TABLE test1
  (a VARCHAR(20), b VARCHAR(20),
   c REAL,        d INTEGER);
"""

In [0]:
# Connect to the SQLite database file 'mydata.sqlite'.
con = sqlite3.connect('mydata.sqlite')

In [0]:
# Run the CREATE TABLE statement.
# NOTE(review): the statement has no IF NOT EXISTS, so re-running this cell
# against a 'mydata.sqlite' that already contains test1 raises an error.
con.execute(query)

<sqlite3.Cursor at 0x7f926fc21110>

In [0]:
# Commit the pending transaction on this connection.
con.commit()

In [0]:
# Three sample rows, each ordered to match test1's columns (a, b, c, d).
# This rebinds `data`, which previously held the decoded GitHub response.
data = [
  ('Atlanta', 'Georgia', 1.25, 6),
  ('Tallahassee', 'Florida', 2.6, 3),
  ('Sacramento', 'California', 1.7, 5),
]

In [0]:
# Parameterized INSERT with one '?' placeholder per column of test1.
stmt = "INSERT INTO test1 VALUES(?, ?, ?, ?)"

In [0]:
# Execute the INSERT once for every tuple in `data`.
con.executemany(stmt, data)

<sqlite3.Cursor at 0x7f92700859d0>

In [0]:
# Commit the inserted rows.
con.commit()

In [0]:
# Query every row of test1; execute() returns a cursor over the result.
cursor = con.execute('select * from test1')

In [0]:
# Pull all result rows out of the cursor as a list of tuples.
rows = cursor.fetchall()

In [0]:
# Display the fetched rows.
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [0]:
# Column metadata for the last query: one 7-item tuple per column, of which
# only the first item (the column name) is populated in the recorded output.
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [0]:
# The first item of each description entry is the column name; use those names
# as the DataFrame's column labels for the fetched rows.
column_names = [col_info[0] for col_info in cursor.description]
pd.DataFrame(rows, columns = column_names)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [0]:
import sqlalchemy as sqla

In [0]:
# SQLAlchemy engine for 'mydata.sqlite', the same file name passed to
# sqlite3.connect() earlier.
db = sqla.create_engine('sqlite:///mydata.sqlite')

In [0]:
# Run the query through the engine and get the result back as a DataFrame.
pd.read_sql('select * from test1', db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
