In [1]:
import numpy as np
import pandas as pd
# https://github.com/DitDotz/pydata-book/tree/2nd-edition/examples

# Text format

## Read first line as header

In [28]:
pd.read_csv('ex2.csv')

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


### Read entire file as data

In [29]:
pd.read_csv('ex2.csv',header=None)
#i.e. first line is not the header

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Read entire file as data, write your own header

In [30]:
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Read entire file as data, write your own header + change 1 col to index

In [31]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


## Reading white spaced data

In [32]:
pd.read_csv('ex3.txt',sep='\s+')

# In the case that there is a column of values without a column header, the values will 
# be used as the index

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


## Skipping specific rows

In [33]:
pd.read_csv('ex4.txt',skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## Handling missing values

In [34]:
result = pd.read_csv('ex5.txt')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


###  Replacing filler values in real data with NaN

In [26]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ex5.txt', na_values=sentinels)

# Specify column > column value to be replaced by NaN

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


## Reading a portion of the file

### Specify number of rows to read from a large file

In [36]:
pd.read_csv('ex6.txt',nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


# Writing data to a different format

In [43]:
data = pd.read_csv('ex5.txt')
data
# Compare with the file that appears in Home

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [41]:
data.to_csv('out.csv',sep='|',na_rep='NULL')
# Missing values appear as empty strings in the output. You might want to denote them
# by some other sentinel value

## Specify whether you want header to be included

In [42]:
data.to_csv('out.csv', index=False, header=False)

## Specify specific cols to be included

In [46]:
data.to_csv('out.csv', index=False, columns=['a', 'b', 'c'])

# Excel format

In [41]:
file = pd.read_excel('ex1.xlsx',sheet_name='Sheet1')
file

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


# JSON format

In [3]:
# JSON (short for JavaScript Object Notation) has become one 
# of the standard formats for sending data by HTTP request between web browsers and other applications.

import json

In [4]:
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

## Read JSON format to Python object

In [6]:
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

## Read Python object to JSON format

In [10]:
asjson = json.dumps(result)
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

## Convert JSON to dataframe

In [13]:
# you can pass a list of JSON objects
# to the DataFrame constructor and select a subset of the data fields

siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


# HTML format (incomplete)

In [15]:
from lxml.html import parse
from urllib.request import urlopen
# Necessary imports

In [17]:
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
# Creates Python-readable object

## Find all specified tags

### Links - 'a' tag

#### Create Python object for the 'a' tag

In [20]:
links = doc.findall('.//a')
# Creates Python object of URLs, additonal 'get' method required to show the actual links

links[15:20]

[<Element a at 0x25f615d1ea8>,
 <Element a at 0x25f615d1ef8>,
 <Element a at 0x25f615d1f48>,
 <Element a at 0x25f615d1f98>,
 <Element a at 0x25f6160e048>]

#### 'Get' Method to process Python object

In [21]:
lnk = links[0]

In [22]:
lnk.get('href')

'https://finance.yahoo.com/'

In [23]:
lnk.text_content()

'Yahoo'

#### List comprehension to get all objects (combining the methods above)

In [27]:
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]

['/',
 '/watchlists',
 '/portfolios',
 '/screener',
 '/calendar',
 '/industries',
 '/videos/',
 '/news/',
 '/personal-finance',
 '/tech']

### Table tag

In [33]:
tables = doc.findall('.//table')
calls = tables[3]

IndexError: list index out of range

### Row tag

In [34]:
rows = calls.findall('.//tr')

NameError: name 'calls' is not defined