# 6.1 Reading and Writing Data in Text Format

Table 6-1. Parsing functions in pandas

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
pwd

'D:\\Learning from Books\\Python Book Solutions\\Python for Data Analysis by Wes McKinney\\Coding Practice\\Chp6 - Data Loading, Storage, and File Formats'

In [5]:
# Load the sample file relative to the notebook's working directory (the folder
# printed by `pwd` above) instead of a machine-specific absolute path.
df = pd.read_csv('ex1.csv')

In [6]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [9]:
# Same file parsed with read_table, passing the comma delimiter explicitly.
# The path is relative to the working directory rather than an absolute local path.
df = pd.read_table('ex1.csv', sep=',')

In [10]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [11]:
# header=None keeps the first line of the file as data, so the columns get integer labels.
# The path is relative to the working directory rather than an absolute local path.
df = pd.read_csv('ex1.csv', header=None)

In [12]:
df

Unnamed: 0,0,1,2,3,4
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


In [17]:
df = pd.read_csv('ex2.csv')

In [18]:
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [19]:
df = pd.read_csv('ex2.csv', header=None)

In [20]:
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [23]:
df = pd.read_csv('ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'message'])

In [24]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [25]:
df = pd.read_csv('ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'message'], index_col='message')

In [26]:
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [31]:
df = pd.read_csv('ex3.csv', index_col=['key1', 'key2'])

In [32]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [34]:
list(open('ex4.txt'))

['\tA \tB \tC\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb 0.927272 0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382 1.100491\n']

In [35]:
# r'\s+' is a regular expression: \s matches one whitespace character and + means
# "one or more", so fields are split on runs of whitespace. The raw-string prefix
# stops Python from treating \s as an (invalid) string escape sequence.
pd.read_table('ex4.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [42]:
list(open('ex5.csv'))

['# hey!\n',
 'a,b,c,d,message\n',
 '# just wanted to make things more difficult for you\n',
 '# who reads CSV files with computers, anyway?\n',
 '1,2,3,4,hello\n',
 '5,6,7,8,world\n',
 '9,10,11,12,foo\n']

In [48]:
# Skip the three '#' comment lines of ex5.csv (file lines 0, 2 and 3), leaving the header and data rows.
pd.read_csv('ex5.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [51]:
list(open('ex6.csv'))

['something,a,b,c,d,message\n',
 'one,1,2,3,4,NA\n',
 'two,5,6,,8,world\n',
 'three,9,10,11,12,foo']

By default, pandas recognizes a set of commonly occurring sentinels,
such as NA and NULL, as markers for missing values.

In [52]:
result = pd.read_csv('ex6.csv')

In [53]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [54]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [59]:
result = pd.read_csv('ex6.csv',na_values=[5]) # Treat the value 5 as missing (NaN) wherever it appears

In [60]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2,3.0,4,
1,two,,6,,8,world
2,three,9.0,10,11.0,12,foo


In [73]:
# Per-column missing-value markers: 'one' in column "something", and 'NULL' or 'foo' in column "message".
sentinels = {'something':['one'], 'message':['NULL', 'foo']}
result = pd.read_csv('ex6.csv',na_values=sentinels)

In [74]:
result

Unnamed: 0,something,a,b,c,d,message
0,,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


Table 6-2. Some read_csv/read_table function arguments

In [75]:
# converters maps a column name to a function applied to each value of that column while parsing;
# here every value in column "b" is converted to int and multiplied by 100.
result = pd.read_csv('ex6.csv', converters={'b':lambda x: int(x)*100})

In [76]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,200,3.0,4,
1,two,5,600,,8,world
2,three,9,1000,11.0,12,foo


In [86]:
result = pd.read_csv('ex6.csv', nrows=1)

In [87]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3,4,


## Reading Text Files in Pieces

When processing very large files or figuring out the right set of arguments to correctly process a large file, you may only want to read in a small piece of a file or iterate
through smaller chunks of the file.

In [109]:
# Limit DataFrame/Series display to 20 rows so large results are shown truncated.
pd.set_option('display.max_rows', 20)

In [110]:
result = pd.read_csv('ex7.csv')

In [108]:
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [111]:
pd.read_csv('ex7.csv', nrows=9)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S


In [150]:
# With chunksize, read_csv returns an iterable reader (the TextFileReader displayed below)
# rather than a DataFrame; iterating over it yields the file in pieces of up to 1000 rows.
chunker = pd.read_csv('ex7.csv', chunksize=1000)

In [151]:
chunker

<pandas.io.parsers.TextFileReader at 0x1dc4a9aa508>

In [152]:
# Accumulator for the per-key counts. The dtype is given explicitly because building
# an empty Series without one triggers a pandas warning and leaves the resulting dtype
# to the library's version-dependent default.
tot = pd.Series([], dtype='float64')
tot

  """Entry point for launching an IPython kernel.


Series([], dtype: float64)

In [153]:
# Fold each chunk's per-key counts into the running totals. fill_value=0 makes a key
# that is absent from one side count as zero instead of yielding NaN.
for piece in chunker:
    key_counts = piece['key'].value_counts()
    tot = tot.add(key_counts, fill_value=0)

In [157]:
tot = tot.sort_values(ascending=False)

In [158]:
tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

In [159]:
tot

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
     ...  
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
Length: 36, dtype: float64

## Writing Data to Text Format

In [168]:
result = pd.read_csv('ex6.csv')

In [169]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [170]:
result.to_csv('out.csv')

In [171]:
import sys

In [172]:
result.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [174]:
result.to_csv(sys.stdout, sep='|', na_rep='NaN')

|something|a|b|c|d|message
0|one|1|2|3.0|4|NaN
1|two|5|6|NaN|8|world
2|three|9|10|11.0|12|foo


In [175]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [192]:
result.to_csv('out2.csv', header=False, index=False, na_rep='NaN')

In [195]:
result.to_csv(sys.stdout, header=False, index=False, na_rep='NaN')

one,1,2,3.0,4,NaN
two,5,6,NaN,8,world
three,9,10,11.0,12,foo


In [219]:
# Ten consecutive calendar days beginning on 1 January 2000.
dates = pd.date_range(start='1/1/2000', periods=10)

In [220]:
ser = pd.Series(data=np.arange(10), index=dates)

In [221]:
ser

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
2000-01-08    7
2000-01-09    8
2000-01-10    9
Freq: D, dtype: int32

In [224]:
ser.to_csv('out3.csv', header=False)

In [225]:
ser.to_csv(sys.stdout, header=False)

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
2000-01-08,7
2000-01-09,8
2000-01-10,9


## Working with Delimited Formats

In [233]:
import csv

In [234]:
# Open the file and keep the handle so it can be passed to csv.reader in a later cell.
# NOTE(review): no close() for this handle appears in this section; the `with` form used
# further down closes the file automatically.
f = open('ex8.csv')

In [235]:
f

<_io.TextIOWrapper name='ex8.csv' mode='r' encoding='cp1252'>

In [236]:
reader = csv.reader(f)

In [237]:
reader

<_csv.reader at 0x1dc4a6ab898>

In [252]:
# Read every row of the CSV file into a list of lists of strings.
# newline='' is what the csv module recommends for files passed to csv.reader, so that
# line breaks inside quoted fields are handled by the parser rather than the file layer.
with open('ex8.csv', newline='') as f:
    lines = list(csv.reader(f))

In [253]:
lines

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3']]

In [254]:
header, values = lines[0], lines[1:]

In [255]:
header

['a', 'b', 'c']

In [256]:
values

[['1', '2', '3'], ['1', '2', '3']]

In [257]:
# zip(*values) transposes the rows into columns; pairing each column with its
# header name gives a mapping of column name -> tuple of that column's values.
data_dict = dict(zip(header, zip(*values)))

In [258]:
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [293]:
f = open('ex8.csv')

In [294]:
class my_dialect(csv.Dialect):
    """CSV dialect that separates fields with ';' and ends each row with a newline."""
    # Quote a field only when it contains a special character.
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'
    delimiter = ';'
    lineterminator = '\n'
reader = csv.reader(f, dialect=my_dialect)

In [295]:
reader = csv.reader(f, delimiter='|')

Table 6-3. CSV dialect options

In [296]:
# Write to a dedicated output file rather than 'ex8.csv', which is the sample input
# read earlier in this section; writing there would replace it with different,
# ';'-delimited content and change the results of those cells on a re-run.
# newline='' follows the csv module's guidance for files handed to csv.writer, so the
# dialect's line terminator is written unchanged.
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])

## JSON Data

In [297]:
# JSON stands for JavaScript Object Notation.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
 {"name": "Katie", "age": 38,
 "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [299]:
obj

'\n{"name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null,\n "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},\n {"name": "Katie", "age": 38,\n "pets": ["Sixes", "Stache", "Cisco"]}]\n}\n'

In [300]:
import json

In [301]:
result = json.loads(obj) # Parse the JSON text into Python objects (a dict here, as displayed below)

In [302]:
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [303]:
asjson = json.dumps(result) # Serialize the Python object back into a JSON-formatted string

In [304]:
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [319]:
list(open('example.json'))

['[{"a": 1, "b": 2, "c": 3},\n',
 ' {"a": 4, "b": 5, "c": 6},\n',
 ' {"a": 7, "b": 8, "c": 9}]\n']

In [311]:
data = pd.read_json('example.json')

In [312]:
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [315]:
print(data.to_json())

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [317]:
print(data.to_json(orient='records'))

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


## XML and HTML: Web Scraping

In [322]:
# Downloads the page over the network and parses its HTML tables into a list of DataFrames.
# NOTE(review): the result depends on the live page, so the counts shown below may differ on a re-run.
tables = pd.read_html('https://www.fdic.gov/bank/individual/failed/banklist.html')

In [325]:
len(tables)

1

In [327]:
failures = tables[0]

In [328]:
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date
0,The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020"
1,Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020"
2,City National Bank of New Jersey,Newark,NJ,21111,Industrial Bank,"November 1, 2019"
3,Resolute Bank,Maumee,OH,58317,Buckeye State Bank,"October 25, 2019"
4,Louisa Community Bank,Louisa,KY,58112,Kentucky Farmers Bank Corporation,"October 25, 2019"


In [344]:
failures.columns

Index(['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution',
       'Closing Date'],
      dtype='object')

In [345]:
failures.shape

(561, 6)

In [336]:
close_timestamp = pd.to_datetime(failures['Closing Date'])

In [346]:
close_timestamp

0     2020-04-03
1     2020-02-14
2     2019-11-01
3     2019-10-25
4     2019-10-25
         ...    
556   2001-07-27
557   2001-05-03
558   2001-02-02
559   2000-12-14
560   2000-10-13
Name: Closing Date, Length: 561, dtype: datetime64[ns]

In [340]:
close_timestamp.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2017      8
2015      8
2016      5
2001      4
2019      4
2004      4
2007      3
2003      3
2020      2
2000      2
Name: Closing Date, dtype: int64

### Parsing XML with lxml.objectify

XML (eXtensible Markup Language)