# Data Loading, Storage, and File Formats

In [2]:
# Python has become a beloved language for text and file munging due to its simple syntax
# for interacting with files, intuitive data structures, and convenient features like tuple
# packing and unpacking. 
# pandas functions for reading tabular text data (function >>>>> description):
# read_csv >>>>> Load delimited data from a file, URL, or file-like object. Use comma as default delimiter
# read_table >>>>> Load delimited data from a file, URL, or file-like object. Use tab ('\t') as default delimiter
# read_fwf >>>>>> Read data in fixed-width column format (that is, no delimiters)
# read_clipboard >>>>> Version of read_table that reads data from the clipboard. Useful for converting tables from web pages 

In [5]:
import pandas as pd
# df = pd.read_csv('examples/ex1.csv') # read_csv 

In [1]:
# We could also have used read_table and specified the delimiter: 
# df = pd.read_table('examples/ex1.csv', sep=',') # read_table

In [3]:
# A file will not always have a header row; examples/ex2.csv is one such file.
# To read this file, you have a couple of options. You can allow pandas to assign default
# column names, or you can specify names yourself:
# df = pd.read_csv('examples/ex2.csv', header=None) # read_csv

In [4]:
# pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message']) # read_csv

In [5]:
# To make the message column the index of the returned DataFrame, pass index_col
# either the column's position (4) or its name ('message'):
names = ["a", "b", "c", "d", "message"]
# df = pd.read_csv('examples/ex2.csv', names=names, index_col='message') # read_csv with names and index_col

In [6]:
# In the event that you want to form a hierarchical index from multiple columns, pass a list of column numbers or names:
# parsed = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2']) # read_csv with hierarchical index 

In [7]:
# In some cases a table might not have a fixed delimiter, using whitespace or some other pattern to separate fields.
# In these cases, you can pass a regular expression as a delimiter for read_table.
# Consider a text file that looks like this: 
# list(open('examples/ex3.txt')) # show the raw lines of the file

In [8]:
# While you could do some munging by hand, the fields here are separated by a variable amount of whitespace.
# A variable amount of whitespace is matched by the regular expression \s+, which can be passed
# to read_table as the delimiter (written as a raw string so the backslash is kept literally):
# result = pd.read_table('examples/ex3.txt', sep=r'\s+') # read_table with regular expression as delimiter

In [9]:
# Because there was one fewer column name than the number of data columns,
# read_table infers that the first column should be the DataFrame's index in this special case. 
# You can also set the index explicitly with the index_col argument; passing a list of
# column positions, as below, builds a hierarchical index from those columns:
# result = pd.read_table('examples/ex3.txt', sep=r'\s+', index_col=[0, 1]) # read_table with regex delimiter and hierarchical index

In [10]:
# skiprows lets you skip rows at the start of a file (pass an integer) or at
# specific 0-based line numbers (pass a list); use skipfooter to skip rows at the end.
# For example, we might want to skip the first, third, and fourth rows of a file like this:
# list(open('examples/ex4.csv')) # show the raw lines of the file
# df = pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3]) # read_csv with skiprows 

In [11]:
# Handling missing values is an important and frequently nuanced part of the file parsing process.
# Missing data is usually either not present (empty string) or marked by some sentinel value.
# By default, pandas recognizes a set of commonly occurring sentinels, such as NA and NULL:
# result = pd.read_csv('examples/ex5.csv') # read_csv with missing values 
# result = pd.read_csv('examples/ex5.csv', na_values=['NULL']) # read_csv with missing values and na_values


In [12]:
# Different NA sentinels can be specified for each column in a dict: 
# sentinels = {'message': ['foo', 'NA'], 'something': ['two']} 
# result = pd.read_csv('examples/ex5.csv', na_values=sentinels) # read_csv with missing values and na_values in dict 