# Chapter 5: pandas: Reading and Writing Data 

<div id="toc"></div>

In [42]:
import numpy as np
import pandas as pd

## 5.1 I/O API Tools

| Readers        | Writers      |  
|---|---|
| read_csv       | to_csv       |  
| read_excel     | to_excel     |    
| read_hdf       | to_hdf       |  
| read_sql       | to_sql       |  
| read_json      | to_json      |    
| read_html      | to_html      |    
| read_stata     | to_stata     |    
| read_clipboard | to_clipboard |        
| read_pickle    | to_pickle |    
| read_msgpack   | to_msgpack (experimental) |     
| read_gbq       | to_gbq (experimental) | 

## 5.2 CSV and Textual Files

* read_csv
* read_table
* to_csv

In [21]:
import pandas as pd
csvframe = pd.read_csv('data/myCSV_01.csv')
csvframe

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [22]:
pd.read_table('data/ch05_01.csv',sep=',')

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


## 5.3 Reading Data in CSV or Text Files

In [23]:
import pandas as pd

In [24]:
csvframe = pd.read_csv('data/myCSV_01.csv')
csvframe

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [25]:
pd.read_table('data/ch05_01.csv',sep=',')

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [26]:
pd.read_csv('data/ch05_02.csv')

Unnamed: 0,1,5,2,3,cat
0,2,7,8,5,dog
1,3,3,6,7,horse
2,2,2,8,3,duck
3,4,4,2,1,mouse


In [27]:
pd.read_csv('data/ch05_02.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [28]:
pd.read_csv('data/ch05_02.csv', names=['white','red','blue','green','animal'])

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


In [29]:
pd.read_csv('data/ch05_03.csv', index_col=['color','status'])

Unnamed: 0_level_0,Unnamed: 1_level_0,item1,item2,item3
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,up,3,4,6
black,down,2,6,7
white,up,5,5,5
white,down,3,3,2
white,left,1,2,1
red,up,2,2,2
red,down,1,1,4


### Using RegExp for Parsing TXT Files

| symbol | meanning |
|---|---|
| .         | single character, except newline |
| \d        | digit |
| \D        | non-digit character |
| \s        | whitespace character |
| \S        | non-whitespace character |
| \n        | new line character |
| \t        | tab character |
| \uxxxx    | unicode character specified by the hexadecimal number xxxx |

In [30]:
pd.read_table('data/ch05_04.txt',sep='\s*')

  if __name__ == '__main__':
  yield pat.split(line.strip())
  yield pat.split(line.strip())


Unnamed: 0,white,red,blue,green
0,1,5,2,3
1,2,7,8,5
2,3,3,6,7


In [31]:
pd.read_table('data/ch05_05.txt',sep='\D*',header=None)

  if __name__ == '__main__':
  yield pat.split(line.strip())
  yield pat.split(line.strip())


Unnamed: 0,0,1,2
0,0,123,122
1,1,124,321
2,2,125,333


In [32]:
pd.read_table('data/ch05_06.txt',sep=',',skiprows=[0,1,3,6])

Unnamed: 0,white,red,blue,green,animal
0,1,5,2,3,cat
1,2,7,8,5,dog
2,3,3,6,7,horse
3,2,2,8,3,duck
4,4,4,2,1,mouse


### Reading TXT Files into Parts or Partially

In [33]:
pd.read_csv('data/ch05_02.csv',skiprows=[2],nrows=3,header=None)

Unnamed: 0,0,1,2,3,4
0,1,5,2,3,cat
1,2,7,8,5,dog
2,2,2,8,3,duck


In [34]:
out = pd.Series()
i = 0
pieces = pd.read_csv('data/ch05_01.csv',chunksize=3)

for piece in pieces:
    out.set_value(i,piece['white'].sum())
    i = i + 1
    print(i)

1
2


### Writing Data in CSV

In [36]:
data = {'color' : ['blue','green','yellow','red','white'],
        'object' : ['ball','pen','pencil','paper','mug'],
        'price' : [1.2,1.0,0.6,0.9,1.7]}

In [37]:
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [38]:
frame2.to_csv('ch05_07.csv')

In [39]:
frame2.to_csv('ch05_07b.csv', index=False, header=False)

In [43]:
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),
                      index=['red','blue','yellow','white'],
                      columns=['ball','pen','pencil','paper'])
frame3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [44]:
frame3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [45]:
frame3.to_csv('ch05_08.csv')

In [46]:
frame3.to_csv('ch05_09.csv', na_rep ='NaN')

## 5.4 Reading and Writing HTML Files

### Writing Data in HTML

* read_html()
* to_html()

In [None]:
conda install html5lib

In [48]:
frame = pd.DataFrame(np.arange(4).reshape(2,2))

In [49]:
print(frame.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2</td>
      <td>3</td>
    </tr>
  </tbody>
</table>


In [50]:
frame = pd.DataFrame( np.random.random((4,4)),
                     index = ['white','black','red','blue'],
                     columns = ['up','down','right','left'])
frame

Unnamed: 0,up,down,right,left
white,0.788961,0.57758,0.499871,0.205716
black,0.34614,0.194116,0.277512,0.094746
red,0.277262,0.262186,0.883444,0.419923
blue,0.076713,0.926889,0.919097,0.790951


In [51]:
s = ['<HTML>']
s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
s.append('<BODY>')
s.append(frame.to_html())
s.append('</BODY></HTML>')
html = ''.join(s)

In [52]:
html_file = open('myFrame.html','w')
html_file.write(html)
html_file.close()

### Reading Data from an HTML File

In [53]:
web_frames = pd.read_html('myFrame.html')
web_frames[0]

Unnamed: 0.1,Unnamed: 0,up,down,right,left
0,white,0.788961,0.57758,0.499871,0.205716
1,black,0.34614,0.194116,0.277512,0.094746
2,red,0.277262,0.262186,0.883444,0.419923
3,blue,0.076713,0.926889,0.919097,0.790951


In [55]:
ranking = pd.read_html('http://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')

In [None]:
ranking[0]

## 5.5 Reading Data from XML

http://lxml.de/index.html

In [None]:
# %load data/books.xml
<?xml version="1.0"?>
<Catalog>
  <Book id="ISBN9872122367564">
  <Author>272103_1_EnRoss, Mark</Author>
    <Title>XML Cookbook</Title>
    <Genre>Computer</Genre>
    <Price>23.56</Price>
    <PublishDate>2014-22-01</PublishDate>
  </Book>
  <Book id="ISBN9872122367564">
    <Author>272103_1_EnBracket, Barbara</Author>
    <Title>XML for Dummies</Title>
    <Genre>Computer</Genre>
    <Price>35.95</Price>
    <PublishDate>2014-12-16</PublishDate>
  </Book>
</Catalog>

In [64]:
from lxml import objectify

In [65]:
xml = objectify.parse('data/books.xml')
xml

<lxml.etree._ElementTree at 0xe6e622f688>

In [67]:
root = xml.getroot()

In [68]:
root.Book.Author

'272103_1_EnRoss, Mark'

In [69]:
root.Book.PublishDate

'2014-22-01'

In [70]:
root.getchildren()

[<Element Book at 0xe6e6276f08>, <Element Book at 0xe6e6276b08>]

In [71]:
[child.tag for child in root.Book.getchildren()]

['Author', 'Title', 'Genre', 'Price', 'PublishDate']

In [72]:
[child.text for child in root.Book.getchildren()]

['272103_1_EnRoss, Mark', 'XML Cookbook', 'Computer', '23.56', '2014-22-01']

In [75]:
def etree2df(root):
    column_names = []
    for i in range(0,len(root.getchildren()[0].getchildren())):
        column_names.append(root.getchildren()[0].getchildren()[i].tag)
    xml:frame = pd.DataFrame(columns=column_names)
    for j in range(0, len(root.getchildren())):
        obj = root.getchildren()[j].getchildren()
        texts = []
        for k in range(0, len(column_names)):
            texts.append(obj[k].text)
        row = dict(zip(column_names, texts))
        row_s = pd.Series(row)
        row_s.name = j
        xml:frame = xml:frame.append(row_s)
    return xml:frame

SyntaxError: invalid syntax (<ipython-input-75-110bab389c61>, line 14)

In [None]:
etree2df(root)

## 5.6 Reading and Writing Data on Microsoft Excel Files

* to_excel()
* read_excel()

In [81]:
import xlrd

In [83]:
pd.read_excel('data/data.xls')

Unnamed: 0,white,read,green,black
a,12,23,17,18
b,22,16,19,18
c,14,23,22,21


In [84]:
pd.read_excel('data/data.xls','Sheet2')

Unnamed: 0,yellow,purple,blue,orange
A,12,23,17,18
B,22,16,19,18
C,14,23,22,21


In [85]:
pd.read_excel('data/data.xls',1)

Unnamed: 0,yellow,purple,blue,orange
A,12,23,17,18
B,22,16,19,18
C,14,23,22,21


In [86]:
frame = pd.DataFrame(np.random.random((4,4)),
                     index = ['exp1','exp2','exp3','exp4'],
                     columns = ['Jan2015','Fab2015','Mar2015','Apr2005'])
frame

Unnamed: 0,Jan2015,Fab2015,Mar2015,Apr2005
exp1,0.509758,0.473159,0.381452,0.755255
exp2,0.325269,0.043197,0.10624,0.351238
exp3,0.379101,0.69005,0.475262,0.702887
exp4,0.418955,0.652997,0.991569,0.595785


In [87]:
frame.to_excel('data/data2.xlsx')

## 5.7 JSON Data

http://jsonviewer.stack.hu/

In [88]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
frame.to_json('frame.json')

In [89]:
pd.read_json('frame.json')

Unnamed: 0,down,left,right,up
black,5,7,6,4
blue,13,15,14,12
red,9,11,10,8
white,1,3,2,0


In [90]:
from pandas.io.json import json_normalize

In [None]:
file = open('data/books.json','r')
text = file.read()
text = json.loads(text)

In [None]:
json_normalize(text,'books')

## 5.8 The Format HDF5

In [94]:
from pandas.io.pytables import HDFStore

In [96]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])

In [97]:
store = HDFStore('mydata.h5')
store['obj1'] = frame


In [98]:
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [99]:
store['obj2'] = frame2

In [100]:
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1            frame        (shape->[4,4])
/obj2            frame        (shape->[5,3])

In [101]:
store['obj2']

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


## 5.9 Pickle—Python Object Serialization

### Serialize a Python Object with cPickle

In [None]:
import cPickle as pickle

In [103]:
data = { 'color': ['white','red'], 'value': [5, 7]}

In [None]:
pickled_data = pickle.dumps(data)

In [None]:
nframe = pickle.loads(pickled_data)
nframe

### Pickling with pandas

In [None]:
frame = pd.DataFrame(np.arange(16).reshape(4,4), index = ['up','down','left','right'])
frame.to_pickle('frame.pkl')

In [None]:
pd.read_pickle('frame.pkl')

## 5.10 Interacting with Databases

In [104]:
from sqlalchemy import create_engine

In [None]:
engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')

In [None]:
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')

In [None]:
engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')

In [None]:
engine = create_engine('mssql+pyodbc://mydsn')

In [None]:
engine = create_engine('sqlite:///foo.db')

### Loading and Writing Data with SQLite3

In [109]:
frame = pd.DataFrame( np.arange(20).reshape(4,5),
                     columns=['white','red','blue','black','green'])
frame

Unnamed: 0,white,red,blue,black,green
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [110]:
engine = create_engine('sqlite:///foo.db')

In [111]:
frame.to_sql('colors',engine)

In [113]:
pd.read_sql('colors',engine)

Unnamed: 0,index,white,red,blue,black,green
0,0,0,1,2,3,4
1,1,5,6,7,8,9
2,2,10,11,12,13,14
3,3,15,16,17,18,19


In [114]:
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)

<sqlite3.Cursor at 0xe6e6cea420>

In [115]:
con.commit()

In [116]:
data = [('white','up',1,3),
('black','down',2,8),
('green','up',4,4),
('red','down',5,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt, data)

<sqlite3.Cursor at 0xe6e6cea5e0>

In [117]:
con.commit()

In [118]:
cursor = con.execute('select * from test')
cursor

<sqlite3.Cursor at 0xe6e6cea570>

In [119]:
rows = cursor.fetchall()
rows

[('white', 'up', 1.0, 3),
 ('black', 'down', 2.0, 8),
 ('green', 'up', 4.0, 4),
 ('red', 'down', 5.0, 5)]

In [120]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [None]:
pd.DataFrame(rows, columns=zip(*cursor.description)[0])

### Loading and Writing Data with PostgreSQL

In [124]:
pd.__version__

'0.19.2'

In [None]:
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')

In [126]:
frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['feb','mar','apr','may']);

In [None]:
frame.to_sql('dataframe',engine)

In [None]:
psql -U postgres

In [None]:
pd.read_sql_table('dataframe',engine)

In [None]:
pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5',engine)

## 5.11 Reading and Writing Data with a NoSQL Database: MongoDB

In [None]:
mongod --dbpath C:\MongoDB_data

In [None]:
import pymongo
client = MongoClient('localhost',27017)

In [None]:
db = client.mydatabase
db

In [None]:
client['mydatabase']

In [None]:
collection = db.mycollection

In [None]:
db['mycollection']
collection

In [None]:
frame = pd.DataFrame( np.arange(20).reshape(4,5),
frame

In [None]:
import json

In [None]:
record = json.loads(frame.T.to_json()).values()
record

In [None]:
collection.mydocument.insert(record)

In [None]:
cursor = collection['mydocument'].find()
dataframe = (list(cursor))
del dataframe['_id']
dataframe

## 5.12 Conclusions