# 6.1 Reading and Writing Data in Text Format

Table 6-1. Parsing functions in pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
pwd

'D:\\Learning from Books\\Python Book Solutions\\Python for Data Analysis by Wes McKinney\\Coding Practice\\Chp6 - Data Loading, Storage, and File Formats'

In [3]:
# Load via a path relative to the notebook's working directory (the folder shown
# by `pwd` above) so the notebook is not tied to one machine's D: drive.
df = pd.read_csv('ex1.csv')

In [4]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# Same file, read with read_table; the comma separator is passed explicitly.
# Path is relative to the notebook's working directory instead of an absolute D: path.
df = pd.read_table('ex1.csv', sep=',')

In [6]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# header=None: the first line is treated as data and the columns are auto-numbered.
# Path is relative to the notebook's working directory instead of an absolute D: path.
df = pd.read_csv('ex1.csv', header=None)

In [8]:
df

Unnamed: 0,0,1,2,3,4
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


In [9]:
df = pd.read_csv('ex2.csv')

In [10]:
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [11]:
df = pd.read_csv('ex2.csv', header=None)

In [12]:
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [13]:
# ex2.csv has no header row, so supply the column names explicitly.
df = pd.read_csv('ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'message'])

In [14]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [15]:
# Explicit column names again, with the 'message' column used as the row index.
df = pd.read_csv('ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'message'], index_col='message')

In [16]:
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [17]:
# Passing a list to index_col builds a two-level (hierarchical) index from key1 and key2.
df = pd.read_csv('ex3.csv', index_col=['key1', 'key2'])

In [18]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [19]:
list(open('ex4.txt'))

['\tA \tB \tC\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb 0.927272 0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382 1.100491\n']

In [20]:
# r'\s+' is a regular expression: \s matches a single whitespace character and
# + means "one or more", so fields are split on any run of whitespace.
# The raw-string prefix avoids Python's invalid-escape-sequence warning for '\s'.
pd.read_table('ex4.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [21]:
list(open('ex5.csv'))

['# hey!\n',
 'a,b,c,d,message\n',
 '# just wanted to make things more difficult for you\n',
 '# who reads CSV files with computers, anyway?\n',
 '1,2,3,4,hello\n',
 '5,6,7,8,world\n',
 '9,10,11,12,foo\n']

In [22]:
# Skip file lines 0, 2 and 3 (the '#' comment lines shown in the listing above).
pd.read_csv('ex5.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [23]:
list(open('ex6.csv'))

['something,a,b,c,d,message\n',
 'one,1,2,3,4,NA\n',
 'two,5,6,,8,world\n',
 'three,9,10,11,12,foo']

By default, pandas recognizes a set of commonly occurring sentinels,
such as NA and NULL, as missing values.

In [24]:
result = pd.read_csv('ex6.csv')

In [25]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [26]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [27]:
result = pd.read_csv('ex6.csv',na_values=[5])  # additionally treat the value 5 as missing (NaN)

In [28]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2,3.0,4,
1,two,,6,,8,world
2,three,9.0,10,11.0,12,foo


In [29]:
# Per-column missing-value markers: column name -> strings to be read as NaN.
sentinels = dict(something=['one'], message=['NULL', 'foo'])
result = pd.read_csv('ex6.csv', na_values=sentinels)

In [30]:
result

Unnamed: 0,something,a,b,c,d,message
0,,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


Table 6-2. Some read_csv/read_table function arguments

In [31]:
# converters maps a column name to a function applied to each raw field of that
# column during parsing; here every 'b' value is cast to int and multiplied by 100.
result = pd.read_csv('ex6.csv', converters={'b':lambda x: int(x)*100})

In [32]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,200,3.0,4,
1,two,5,600,,8,world
2,three,9,1000,11.0,12,foo


In [33]:
result = pd.read_csv('ex6.csv', nrows=1)

In [34]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3,4,


## Reading Text Files in Pieces

When processing very large files or figuring out the right set of arguments to correctly process a large file, you may only want to read in a small piece of a file or iterate
through smaller chunks of the file.

In [35]:
pd.options.display.max_rows=20

In [36]:
result = pd.read_csv('ex7.csv')

In [37]:
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [38]:
pd.read_csv('ex7.csv', nrows=9)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S


In [39]:
# With chunksize set, read_csv returns an iterable reader that yields DataFrames
# of up to 1000 rows each instead of loading the whole file at once.
chunker = pd.read_csv('ex7.csv', chunksize=1000)

In [40]:
chunker

<pandas.io.parsers.TextFileReader at 0x1a2b908ed48>

In [41]:
# Running total of key counts. The dtype is given explicitly: an empty Series
# created without one emits a DeprecationWarning, and its default dtype differs
# between pandas versions.
tot = pd.Series([], dtype='float64')
tot

  """Entry point for launching an IPython kernel.


Series([], dtype: float64)

In [42]:
# Fold each chunk's counts of the 'key' column into the running total;
# fill_value=0 treats a key missing on either side as a zero count.
for chunk in chunker:
    chunk_counts = chunk['key'].value_counts()
    tot = tot.add(chunk_counts, fill_value=0)

In [43]:
tot = tot.sort_values(ascending=False)

In [44]:
tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

In [45]:
tot

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
     ...  
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
Length: 36, dtype: float64

## Writing Data to Text Format

In [46]:
result = pd.read_csv('ex6.csv')

In [47]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [48]:
result.to_csv('out.csv')

In [49]:
import sys

In [50]:
result.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [51]:
result.to_csv(sys.stdout, sep='|', na_rep='NaN')

|something|a|b|c|d|message
0|one|1|2|3.0|4|NaN
1|two|5|6|NaN|8|world
2|three|9|10|11.0|12|foo


In [52]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [53]:
result.to_csv('out2.csv', header=False, index=False, na_rep='NaN')

In [54]:
result.to_csv(sys.stdout, header=False, index=False, na_rep='NaN')

one,1,2,3.0,4,NaN
two,5,6,NaN,8,world
three,9,10,11.0,12,foo


In [55]:
dates = pd.date_range('1/1/2000', periods=10)

In [56]:
ser = pd.Series(data=np.arange(10), index=dates)

In [57]:
ser

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
2000-01-08    7
2000-01-09    8
2000-01-10    9
Freq: D, dtype: int32

In [58]:
ser.to_csv('out3.csv', header=False)

In [59]:
ser.to_csv(sys.stdout, header=False)

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
2000-01-08,7
2000-01-09,8
2000-01-10,9


## Working with Delimited Formats

In [60]:
import csv

In [61]:
# NOTE(review): this handle is opened without a `with` block and no close() call
# is visible in this notebook; prefer the context-manager form used a few cells below.
f = open('ex8.csv')

In [62]:
f

<_io.TextIOWrapper name='ex8.csv' mode='r' encoding='cp1252'>

In [63]:
reader = csv.reader(f)

In [64]:
reader

<_csv.reader at 0x1a2b90a5f98>

In [65]:
# newline='' lets the csv module handle line endings itself, as its documentation
# asks for any file object passed to csv.reader.
# NOTE: csv.reader defaults to a comma delimiter, so each semicolon-separated
# line of this file comes back as a single field.
with open('ex8.csv', newline='') as f:
    lines = list(csv.reader(f))

In [66]:
lines

[['one;two;three'], ['1;2;3'], ['4;5;6'], ['7;8;9']]

In [67]:
header, values = lines[0], lines[1:]

In [68]:
header

['one;two;three']

In [69]:
values

[['1;2;3'], ['4;5;6'], ['7;8;9']]

In [70]:
# Transpose the rows into columns and pair each header name with its column tuple.
data_dict = dict(zip(header, zip(*values)))

In [71]:
data_dict

{'one;two;three': ('1;2;3', '4;5;6', '7;8;9')}

In [72]:
f = open('ex8.csv')

In [73]:
class my_dialect(csv.Dialect):
    """CSV dialect for semicolon-delimited files with minimal quoting."""
    delimiter = ';'              # field separator
    quotechar = '"'              # wraps fields that contain special characters
    quoting = csv.QUOTE_MINIMAL  # quote only when a field needs it
    lineterminator = '\n'        # line ending used when writing
reader = csv.reader(f, dialect=my_dialect)

In [74]:
reader = csv.reader(f, delimiter='|')

Table 6-3. CSV dialect options

In [75]:
# Write to a separate file so the ex8.csv input read earlier in this section is
# not overwritten, and open with newline='' so the csv writer controls line endings.
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])

## JSON Data

In [76]:
# JSON = JavaScript Object Notation. `obj` holds a JSON document as a plain Python string.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
 {"name": "Katie", "age": 38,
 "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [77]:
obj

'\n{"name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null,\n "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},\n {"name": "Katie", "age": 38,\n "pets": ["Sixes", "Stache", "Cisco"]}]\n}\n'

In [78]:
import json

In [79]:
result = json.loads(obj) #Loading JSON data to Python Dict format

In [80]:
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [81]:
asjson = json.dumps(result) # Serialize the Python dict back to a JSON string

In [82]:
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [83]:
list(open('example.json'))

['[{"a": 1, "b": 2, "c": 3},\n',
 ' {"a": 4, "b": 5, "c": 6},\n',
 ' {"a": 7, "b": 8, "c": 9}]\n']

In [84]:
data = pd.read_json('example.json')

In [85]:
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [86]:
print(data.to_json())

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [87]:
print(data.to_json(orient='records'))

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


## XML and HTML: Web Scraping

In [88]:
tables = pd.read_html('https://www.fdic.gov/bank/individual/failed/banklist.html')

In [89]:
len(tables)

1

In [90]:
failures = tables[0]

In [91]:
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date
0,The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020"
1,Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020"
2,City National Bank of New Jersey,Newark,NJ,21111,Industrial Bank,"November 1, 2019"
3,Resolute Bank,Maumee,OH,58317,Buckeye State Bank,"October 25, 2019"
4,Louisa Community Bank,Louisa,KY,58112,Kentucky Farmers Bank Corporation,"October 25, 2019"


In [92]:
failures.columns

Index(['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution',
       'Closing Date'],
      dtype='object')

In [93]:
failures.shape

(561, 6)

In [94]:
close_timestamp = pd.to_datetime(failures['Closing Date'])

In [95]:
close_timestamp

0     2020-04-03
1     2020-02-14
2     2019-11-01
3     2019-10-25
4     2019-10-25
         ...    
556   2001-07-27
557   2001-05-03
558   2001-02-02
559   2000-12-14
560   2000-10-13
Name: Closing Date, Length: 561, dtype: datetime64[ns]

In [96]:
close_timestamp.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2017      8
2015      8
2016      5
2001      4
2019      4
2004      4
2007      3
2003      3
2020      2
2000      2
Name: Closing Date, dtype: int64

### Parsing XML with lxml.objectify

XML (eXtensible Markup Language)

In [97]:
from lxml import objectify

In [103]:
# Relative to the notebook's working directory. The previous absolute Windows path
# was also a non-raw string with unescaped backslashes (invalid escape sequences).
path = 'Performance_MNR.xml'

In [104]:
# Parse inside a `with` block so the file handle is closed once lxml has read it.
with open(path) as xml_file:
    parsed = objectify.parse(xml_file)
root = parsed.getroot()

In [107]:
data=[]

In [108]:
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

In [111]:
# Rebuild `data` from scratch so re-running this cell does not append duplicates.
data = []
for elt in root.INDICATOR:
    el_data = {}
    # iterchildren() replaces the deprecated getchildren(). Iterating `elt`
    # directly is not equivalent: under lxml.objectify that walks the
    # same-named sibling elements rather than the children.
    for child in elt.iterchildren():
        if child.tag in skip_fields:
            continue
        # pyval is the element's text converted to a Python value by objectify.
        el_data[child.tag] = child.pyval
    data.append(el_data)

In [114]:
perf = pd.DataFrame(data)

In [115]:
perf

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95,96.9,95,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95,96,95,95
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95,96.3,95,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95,96.8,95,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95,96.6,95,95.8
...,...,...,...,...,...,...,...,...,...,...,...,...
643,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,8,Service Indicators,M,%,97,,97,
644,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,9,Service Indicators,M,%,97,,97,
645,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,10,Service Indicators,M,%,97,,97,
646,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,11,Service Indicators,M,%,97,,97,


In [117]:
perf.head()

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95,96.9,95,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95,96.0,95,95.0
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95,96.3,95,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95,96.8,95,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95,96.6,95,95.8


In [118]:
from io import StringIO

In [119]:
path = '<a href="https://www.google.com/">Google</a>'

In [121]:
root = objectify.parse(StringIO(path)).getroot()

In [122]:
root

<Element a at 0x1a2bb9ed808>

In [123]:
root.get('href')

'https://www.google.com/'

In [125]:
root.text

'Google'

# 6.2 Binary Data Formats

In [126]:
df = pd.read_csv('ex1.csv')

In [131]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Pickle is Python's built-in serialization module: it converts a Python object into a byte stream that can be saved to disk and loaded back later.

In [128]:
df.to_pickle('ex1_pickle')

In [129]:
df2 = pd.read_pickle('ex1_pickle')

In [130]:
df2

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## Using HDF5 Format

The “HDF” in HDF5 stands for hierarchical data format. HDF5 is a well-regarded file format intended for storing large quantities of scientific
array data. 

In [132]:
df = pd.DataFrame({'a': np.random.randn(100)})

In [133]:
df

Unnamed: 0,a
0,0.120529
1,1.555121
2,0.221475
3,0.323995
4,-0.881134
...,...
95,-1.311547
96,-0.033747
97,-0.243855
98,-0.160024


In [136]:
import h5py

In [138]:
d1 = np.random.random(size=(1000,20))

In [145]:
d1.shape

(1000, 20)

In [146]:
d2 = np.random.random(size=(1000,200))

In [147]:
d2.shape

(1000, 200)

In [148]:
hf = h5py.File('data.h5', 'w')

In [151]:
hf.create_dataset('data_set1', data=d1)

<HDF5 dataset "data_set1": shape (1000, 20), type "<f8">

In [152]:
hf.create_dataset('data_set2', data=d2)

<HDF5 dataset "data_set2": shape (1000, 200), type "<f8">

In [153]:
hf.close()

In [154]:
hf = h5py.File('data.h5', 'r')

In [155]:
hf.keys()

<KeysViewHDF5 ['data_set1', 'data_set2']>

In [156]:
n1 = hf.get('data_set1')

In [157]:
n1

<HDF5 dataset "data_set1": shape (1000, 20), type "<f8">

In [158]:
n1 = np.array(n1)

In [159]:
n1

array([[0.76853693, 0.83352346, 0.95412016, ..., 0.2170643 , 0.19811873,
        0.15931744],
       [0.78178852, 0.53652385, 0.93298853, ..., 0.9252314 , 0.6506316 ,
        0.74806727],
       [0.31891824, 0.01903655, 0.76597477, ..., 0.66090125, 0.92185808,
        0.86844108],
       ...,
       [0.31886707, 0.8045823 , 0.95827227, ..., 0.89644588, 0.13108838,
        0.41186011],
       [0.66595011, 0.33342658, 0.08071331, ..., 0.70287937, 0.29614373,
        0.51229341],
       [0.64243391, 0.54985703, 0.0856332 , ..., 0.2231183 , 0.60994592,
        0.40691665]])

In [161]:
d1 = np.random.random((100, 33))

In [162]:
d2 = np.random.random((1000, 333))

In [163]:
d3 = np.random.random((10000, 3333))

In [164]:
# Close the handle `hf` still holds (the read handle on 'data.h5', or 'data2.h5'
# from a previous run of this cell) before rebinding the name; otherwise the old
# file stays open and re-opening 'data2.h5' in 'w' mode can fail.
hf.close()
hf = h5py.File('data2.h5', 'w')

In [165]:
g1 = hf.create_group('group_1')

In [168]:
g1.create_dataset('data1', data=d1)

<HDF5 dataset "data1": shape (100, 33), type "<f8">

In [169]:
g2 = hf.create_group('group_2')

In [170]:
g2.create_dataset('data2', data=d2)

<HDF5 dataset "data2": shape (1000, 333), type "<f8">

In [171]:
g3 = hf.create_group('group_3/subfolder')

In [172]:
g3.create_dataset('data3', data=d3)

<HDF5 dataset "data3": shape (10000, 3333), type "<f8">

In [174]:
hf.keys()

<KeysViewHDF5 ['group_1', 'group_2', 'group_3']>

In [176]:
hf['group_1'].keys()

<KeysViewHDF5 ['data1']>

In [177]:
hf['group_2'].keys()

<KeysViewHDF5 ['data2']>

In [180]:
hf['group_3/subfolder'].keys()

<KeysViewHDF5 ['data3']>

The pandas HDF5 interface was not working, so h5py was used instead.

HDF5 is not a database. It is best suited for write-once, read-many
datasets. While data can be added to a file at any time, if multiple
writers do so simultaneously, the file can become corrupted.


## Reading Microsoft Excel Files

In [193]:
xlsx = pd.ExcelFile('ex1.xlsx')

In [194]:
pd.read_excel(xlsx, sheet_name='Sheet1')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [195]:
pd.read_excel('ex1.xlsx', sheet_name='Sheet1')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [197]:
writer = pd.ExcelWriter('ex2.xlsx')

In [198]:
perf.to_excel(writer, sheet_name='Sheet1')

In [199]:
# close() writes the workbook to disk and releases the file handle;
# ExcelWriter.save() is deprecated in newer pandas releases.
writer.close()

In [201]:
failures.to_excel('List_of_Failed_Banks.xlsx', sheet_name='Sheet1') #Easy way

# 6.3 Interacting with Web APIs

In [202]:
import requests

In [203]:
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

In [204]:
# A timeout keeps the cell from hanging indefinitely if the API is unreachable,
# and raise_for_status() surfaces HTTP errors (e.g. rate limiting) here rather
# than as a confusing failure when the body is parsed as JSON later.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [206]:
response

<Response [200]>

In [207]:
data = response.json()

In [208]:
data

[{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/34423',
  'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
  'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/34423/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/34423/comments',
  'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/34423/events',
  'html_url': 'https://github.com/pandas-dev/pandas/pull/34423',
  'id': 626204029,
  'node_id': 'MDExOlB1bGxSZXF1ZXN0NDI0MjM5OTEz',
  'number': 34423,
  'title': 'BUG: Fix failing MacPython 32bit wheels for groupby rolling',
  'user': {'login': 'mroeschke',
   'id': 10647082,
   'node_id': 'MDQ6VXNlcjEwNjQ3MDgy',
   'avatar_url': 'https://avatars0.githubusercontent.com/u/10647082?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/mroeschke',
   'html_url': 'https://github.com/mroeschke',
   'followers_url': 'https://api.github.com/users/mroeschke/followers',
   'fo

In [213]:
data[0]['created_at']

'2020-05-28T04:26:34Z'

In [216]:
issues = pd.DataFrame(data=data, columns=['number', 'id', 'title', 'created_at', 'labels', 'state'])

In [222]:
issues.head()

Unnamed: 0,number,id,title,created_at,labels,state
0,34423,626204029,BUG: Fix failing MacPython 32bit wheels for gr...,2020-05-28T04:26:34Z,"[{'id': 563047854, 'node_id': 'MDU6TGFiZWw1NjM...",open
1,34422,626197784,BUG: SeriesGroupBy works with any column name ...,2020-05-28T04:09:08Z,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
2,34421,626180510,BUG: taking slices in _slice_take_blocks_ax0,2020-05-28T03:16:31Z,[],open
3,34420,626160397,REF: move to_offset to liboffsets,2020-05-28T02:16:20Z,[],open
4,34419,626122785,REF: make remaining offset classes cdef,2020-05-28T00:20:04Z,[],open


# 6.4 Interacting with Databases

In [None]:
# Will be using MySQL