In [None]:
import pandas as pd
import numpy as np

# 6.1 Reading and Writing Data in Text Format

## read_* functions

read_csv: load delimited data from a file, URL, or file-like object; uses a comma as the default delimiter.

read_fwf: read data in fixed-width column format (i.e., no delimiters).

Other readers: read_clipboard, read_excel, read_hdf (HDF5 files written by pandas), read_html, read_json, read_feather, read_orc (Apache ORC), read_parquet (Apache Parquet), read_pickle (Python pickle format), read_sas, read_spss, read_sql, read_sql_table (SQLAlchemy), read_stata, read_xml

## Read in and name cols
pd.read_csv("examples/ex1.csv")

pd.read_csv("examples/ex2.csv", header=None)
With header=None, pandas does not treat the first row as a header and assigns default integer column names (0, 1, 2, ...).

pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"])
Passing names= assigns these column names as the file is read in.

## Rearrange the cols
names = ["a", "b", "c", "d", "message"]

pd.read_csv("examples/ex2.csv", names=names, index_col="message")
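The message column now becomes the index of the DataFrame (row labels, displayed as the leftmost column) rather than a regular data column.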

## Hierarchical Indexing
(Original form)
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16

(New form)
parsed = pd.read_csv("examples/csv_mindex.csv", index_col=["key1", "key2"])

parsed
           value1  value2
key1 key2                
one  a          1       2
     b          3       4
     c          5       6
     d          7       8
two  a          9      10
     b         11      12
     c         13      14
     d         15      16

## Tables with whitespace delimiters
(Original form)
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491

There are no commas separating the data, so reading it in takes a bit more work.

result = pd.read_csv("examples/ex3.txt", sep="\s+")
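The sep="\s+" argument is a regular expression that delimits the fields on runs of one or more whitespace characters. (Prefer writing it as a raw string, r"\s+", because "\s" is not a valid string escape and newer Python versions warn about it.) Because the header row has one fewer field than the data rows, pandas infers that the first column is the index.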

result 
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491

## Exclude rows when reading
(Original)
~hey!
a,b,c,d,message
~just wanted to make things more difficult for you
~who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

(New)
pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3])
 
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


## Handling NAs
(original)
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

(Common result)
result = pd.read_csv("examples/ex5.csv")

result 
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo

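Calling .isna() on this result shows True for two fields (the message in row 0 and c in row 1), because pandas treats both the string NA and an empty field as missing by default. Other functions may drop rows/cols containing NAs, so it matters which values are parsed as missing. The na_values option adds extra strings to the set that pandas treats as missing; it does not replace or protect existing NaN values. Since ex5.csv contains no "Banana" entries, the call below parses the file exactly as the default call above does.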

result = pd.read_csv("examples/ex5.csv", na_values=["Banana"])

result
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo

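To keep markers such as NA and empty fields as literal strings instead of converting them to NaN (so that isna() no longer flags them), disable the default NA handling with keep_default_na=False.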

result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False)