In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
#json数据

In [4]:
# Sample JSON document kept as a Python string: an object holding a string, a list of
# strings, a null value and a list of nested objects.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [5]:
import json

In [6]:
result = json.loads(obj)    # json.loads parses the JSON string into the equivalent Python objects

In [7]:
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [8]:
asjson = json.dumps(result)    # json.dumps converts the Python object back into a JSON string

In [9]:
# Pass the list of dicts (formerly JSON objects) to the DataFrame constructor and keep
# only a subset of the data fields: 'name' and 'age'.
siblings = pd.DataFrame(
    result['siblings'],
    columns=['name', 'age'],
)

In [10]:
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [11]:
# Show the raw contents of the example JSON file.
# Plain Python file I/O is used instead of the Windows-only `!type` shell command (and
# its backslash path) so the cell also runs on macOS and Linux; the with-block closes
# the file as soon as it has been read.
with open('examples/example.json') as json_file:
    print(json_file.read())

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]


In [12]:
data = pd.read_json('examples/example.json')    # pandas.read_json automatically converts JSON datasets in specific arrangements into a Series or DataFrame

In [13]:
data       # by default pandas.read_json assumes each object in the JSON array is one row of the table

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [14]:
print(data.to_json())            # Series and DataFrame objects can be exported from pandas to JSON with to_json

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [15]:
# orient='records' writes a JSON array with one object per row, instead of the
# default layout of one object per column keyed by index label.
print(data.to_json(orient = 'records'))

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


In [16]:
#XML和HTML:网络抓取

In [17]:
#pandas的内建函数read_html可以使用lxml和Beautiful Soup等库将HTML中的表自动解析为DataFrame对象

In [18]:
# Parse the tables found in the saved FDIC failed-bank HTML page; the result is a
# list-like collection with one parsed table per entry.
tables = pd.read_html('examples/fdic_failed_bank_list.html')

In [19]:
len(tables)

1

In [20]:
# Only one table was parsed from the page, so take the first (and only) entry.
failures = tables[0]

In [21]:
failures.head()       

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [22]:
# Convert the 'Closing Date' text column into datetime values so the .dt accessor can be used.
close_timestamps = pd.to_datetime(failures['Closing Date']) 

In [23]:
# Count how many rows fall in each closing year (most frequent year first).
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2015      8
2016      5
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, dtype: int64

In [24]:
#使用lxml.objectify解析XML

In [25]:
from lxml import objectify

In [37]:
# Relative path of the XML file that is parsed with lxml.objectify in the following cells.
path = 'examples/mta_perf/Performance_MNR.xml'

In [27]:
# Parse the XML file into an lxml.objectify tree.
# The file is opened in a with-block so that the handle is closed as soon as parsing
# has finished, instead of being left open for the lifetime of the kernel.
with open(path) as xml_file:
    parsed = objectify.parse(xml_file)

In [28]:
root = parsed.getroot()           # getroot returns a reference to the root node of the XML document

In [29]:
# Accumulator for the per-record dicts built from the XML below.
# NOTE: this rebinds `data`, which earlier held the DataFrame read from example.json.
data = []

In [30]:
# Tag names that are left out when the XML records are converted to dicts.
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

In [31]:
#root.INDICATOR返回一个生成器，可以产生每一个<INDICATOR>XML元素。对于每条记录，可以将标签名称的字典(如YTD_ACTUAL)填充为数据值(不包括几个标签)

In [33]:
# Build one dict per <INDICATOR> element, mapping each child tag name to its Python
# value (child.pyval) and leaving out the tags listed in skip_fields.
#
# `data` is rebuilt from scratch here rather than appended to, so running this cell
# more than once no longer duplicates every record in the list.
#
# getchildren() is kept on purpose: iterating an objectify element directly walks its
# same-tag siblings, not its children.
data = [
    {
        child.tag: child.pyval
        for child in elt.getchildren()
        if child.tag not in skip_fields
    }
    for elt in root.INDICATOR
]

In [34]:
# Turn the list of record dicts into a DataFrame: one row per record, one column per tag.
perf = pd.DataFrame(data)

In [35]:
perf.head()

Unnamed: 0,AGENCY_NAME,CATEGORY,DESCRIPTION,FREQUENCY,INDICATOR_NAME,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
1,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,95.0,95,2,2008,96.0,95
2,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,3,2008,96.3,95
3,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,98.3,95,4,2008,96.8,95
4,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,95.8,95,5,2008,96.6,95


In [39]:
from io import StringIO

In [40]:
# Small HTML link element used to show access to attributes and text via objectify.
# NOTE(review): the href value contains a space after "http://" — looks like a typo;
# confirm before changing, since the recorded output echoes the same string.
tag = '<a href = "http:// www.google.com">Google</a>'

In [41]:
# Wrap the string in StringIO so it can be parsed like a file, then take the root element.
# NOTE: this rebinds `root`, which previously referred to the root of the XML file.
root = objectify.parse(StringIO(tag)).getroot()

In [42]:
root

<Element a at 0x18ba9881088>

In [43]:
root.get('href')

'http:// www.google.com'

In [44]:
root.text

'Google'

In [53]:
#二进制格式

In [None]:
#pickle仅被推荐作为短期的存储格式，高效方便

In [46]:
frame = pd.read_csv('examples/ex1.csv')    # load the small example CSV that is pickled below

In [47]:
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [51]:
frame.to_pickle('examples/frame_pickle')  # to_pickle serializes the data to disk in pickle format

In [50]:
# Read the pickled frame back from disk.
# Unpickling can execute arbitrary code, so only load pickle files from trusted sources.
pd.read_pickle('examples/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [55]:
#使用HDF5格式   

In [57]:
#HDF代表分层数据格式，每个HDF5文件可以存储多个数据集并支持元数据，并提供多种语言的接口，包括Java、Julia、MATLAB和Python

In [58]:
#适合处理无法全部放入内存的超大型数据，可以高效地读写大型数组的一小块

In [85]:
# 100 draws from the standard normal distribution in a single column 'a'.
# A fixed seed on a local RandomState makes the values (and every output derived from
# them) identical on each run, without touching NumPy's global random state.
# NOTE: this rebinds `frame`, which previously held the DataFrame read from ex1.csv.
frame = pd.DataFrame({'a' : np.random.RandomState(0).randn(100)})

In [86]:
# HDFStore works like a dict and takes care of the low-level HDF5 details.
#
# Close a store left open by an earlier run of this cell before opening a new one:
# rebinding `store` without closing it leaves a write-mode handle on mydata.h5 behind,
# and a later read-only open of the same file (e.g. pd.read_hdf) can then fail with
# "already opened, but not in read-only mode".
if getattr(globals().get('store'), 'is_open', False):
    store.close()
store = pd.HDFStore('mydata.h5')

In [87]:
# Save the whole DataFrame in the HDF5 file under the key 'obj1' (dict-style assignment).
store['obj1'] = frame

In [62]:
# Save just column 'a' (a Series) under its own key 'obj1_col'.
store['obj1_col'] = frame['a']

In [68]:
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [69]:
# Cap DataFrame display at 10 rows so the 100-row frame is shown truncated.
pd.options.display.max_rows = 10

In [70]:
store['obj1']      # objects contained in the HDF5 file are retrieved with the same dict-like API

Unnamed: 0,a
0,0.958814
1,-0.072656
2,1.403716
3,-0.700236
4,-1.241221
...,...
95,-1.012713
96,-0.764658
97,-1.599436
98,-0.877209


In [71]:
#HDFStore支持两种存储模式，'fixed'和'table'，后者速度更慢，但支持使用一种特殊语法的查询操作：

In [72]:
# Store the frame under the key 'obj2' in 'table' format, which is what allows the
# where-style query used on this key.
store.put('obj2', frame, format = 'table')

In [74]:
# Read back only the rows whose index is between 10 and 15 inclusive.
store.select('obj2', where = ['index >= 10 and index <= 15'])

Unnamed: 0,a
10,1.105285
11,-0.066595
12,0.923485
13,0.546772
14,1.666138
15,-1.233598


In [96]:
# Close the store so its handle on mydata.h5 is released before the file is used again.
store.close()

In [97]:
#put是store['obj2']=frame方法的显式版本，可以设置其他选项，如存储格式

In [98]:
# Write the frame straight to the HDF5 file under the key 'obj3' in 'table' format,
# without creating an HDFStore object by hand.
# The key is passed by keyword: newer pandas releases deprecate passing anything other
# than the path positionally to to_hdf.
frame.to_hdf('mydata.h5', key='obj3', format='table')

In [99]:
# Read only the rows with index < 5 back from the 'obj3' table.
# NOTE(review): the recorded ValueError says mydata.h5 was still open in a non-read-only
# mode when this ran — presumably an HDFStore handle left over from re-running an earlier
# cell; confirm that every store on this file is closed before this read.
pd.read_hdf('mydata.h5', 'obj3', where = ['index < 5'])

ValueError: The file 'mydata.h5' is already opened, but not in read-only mode (as requested).