In [1]:
import pandas as pd
# Show the installed pandas version so readers know which release produced the outputs below.
pd.__version__

'1.0.5'

### Parameters skipped for now (to be revisited)
`true_values`  
`false_values`  
`skipinitialspace`

# `pd.read_csv()`

Read a [comma-separated values (csv)](https://en.wikipedia.org/wiki/Comma-separated_values) file into DataFrame.

## Reading in files

In [2]:
# Read a CSV from a path relative to the notebook's working directory, using all default options.
df_from_local_file = pd.read_csv('data/pf.csv')
df_from_local_file

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [3]:
# read_csv is given a URL here instead of a local path; this cell needs network access to run.
df_from_online_file = pd.read_csv('https://egmgem.com/pf.csv')
df_from_online_file

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `sep` or `delimiter` parameters
By default, `sep` is a **comma** (`','`). `delimiter` is simply an alias for `sep`, so you can pass either one to specify a different separator.

In [4]:
# Spell out the comma separator explicitly.
df_comma = pd.read_csv(
    'data/pf-comma.csv',
    sep=',',
)
df_comma

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [5]:
# The separator does not have to be punctuation: here the digit '3' splits the fields.
df_3 = pd.read_csv(
    'data/pf-3.csv',
    sep='3',
)
df_3

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [6]:
# A letter works as a separator too: fields in this file are split on 'z'.
df_z = pd.read_csv(
    'data/pf-z.csv',
    sep='z',
)
df_z

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [7]:
# Colon-separated file.
df_colon = pd.read_csv(
    'data/pf-colon.csv',
    sep=':',
)
df_colon

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [8]:
# Semicolon-separated file.
df_semicolon = pd.read_csv(
    'data/pf-semicolon.csv',
    sep=';',
)
df_semicolon

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [9]:
# Fields separated by a single space character.
df_space = pd.read_csv(
    'data/pf-space.csv',
    sep=' ',
)
df_space

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [10]:
# Same idea, this time passing the separator through the `delimiter` keyword.
df_dash = pd.read_csv(
    'data/pf-dash.csv',
    delimiter='-',
)
df_dash

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [11]:
# Tab-separated file, again using the `delimiter` keyword.
df_tab = pd.read_csv(
    'data/pf-tab.csv',
    delimiter='\t',
)
df_tab

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `header` parameter

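By default (`header = 'infer'`), `read_csv` behaves like `header = 0` when `names` is not passed: the column names are taken from the first line of the file. If the column names are on a different line, pass that line's (0-indexed) row number. If set to `None`, no line is treated as a header and the columns are labelled with integers (`0`, `1`, ...).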

In [12]:
# Default header handling: the file's first line ('Extra line') becomes the header,
# so the real column names ('Item', 'Quantity') show up as a data row in the output.
df_header = pd.read_csv('data/pf-with-header.csv')
df_header

Unnamed: 0,Extra line
Item,Quantity
Book,10
Pen,9
Ruler,5
Bag,2


In [13]:
# Take the column names from row 1 (the second line); the extra line above it is dropped.
df_header = pd.read_csv(
    'data/pf-with-header.csv',
    header=1,
)
df_header

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [14]:
# The file has no header line, so every line is data and the columns are labelled 0 and 1.
df = pd.read_csv(
    'data/pf-no-header.csv',
    header=None,
)
df

Unnamed: 0,0,1
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `names` parameter

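`names` supplies the column names to use. If the file already contains a header row, also set `header` to that row's number so that the existing names are replaced; otherwise the header row is read in as data (compare the two examples below).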

In [15]:
# Only `names` is given: the new labels are applied, but the file's own first two lines
# ('Extra line' and 'Item,Quantity') come back as data rows.
df_names = pd.read_csv(
    'data/pf-with-header.csv',
    names=['Thing', 'Amount'],
)
df_names

Unnamed: 0,Thing,Amount
0,Extra line,
1,Item,Quantity
2,Book,10
3,Pen,9
4,Ruler,5
5,Bag,2


In [16]:
# `names` together with header=1: the existing header row (and the line above it) is
# consumed, and the columns are relabelled 'Thing' and 'Amount'.
df_names = pd.read_csv(
    'data/pf-with-header.csv',
    names=['Thing', 'Amount'],
    header=1,
)
df_names

Unnamed: 0,Thing,Amount
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `index_col` parameter

In [17]:
# Without index_col, 'Index' is read as an ordinary column and a default 0..3 row index is generated.
df_col = pd.read_csv('data/pf-col-index.csv')
df_col

Unnamed: 0,Index,Item,Quantity
0,a,Book,10
1,b,Pen,9
2,c,Ruler,5
3,d,Bag,2


In [18]:
# Use the column labelled 'Index' as the row index.
df_col = pd.read_csv(
    'data/pf-col-index-switched.csv',
    index_col='Index',
)
df_col

Unnamed: 0_level_0,Item,Quantity
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
a,Book,10
b,Pen,9
c,Ruler,5
d,Bag,2


In [19]:
# Select the index column by position (column 2) instead of by label; the result matches
# the label-based call.
df_col = pd.read_csv(
    'data/pf-col-index-switched.csv',
    index_col=2,
)
df_col

Unnamed: 0_level_0,Item,Quantity
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
a,Book,10
b,Pen,9
c,Ruler,5
d,Bag,2


In [20]:
# A list of labels builds a two-level row index from 'Index' and 'Code'.
df_col = pd.read_csv(
    'data/pf-col-index-switched-multi.csv',
    index_col=['Index', 'Code'],
)
df_col

Unnamed: 0_level_0,Unnamed: 1_level_0,Item,Quantity
Index,Code,Unnamed: 2_level_1,Unnamed: 3_level_1
a,z1,Book,10
b,y2,Pen,9
c,x3,Ruler,5
d,w4,Bag,2


In [21]:
# The same two-level index, selected by column positions 2 and 3.
df_col = pd.read_csv(
    'data/pf-col-index-switched-multi.csv',
    index_col=[2, 3],
)
df_col

Unnamed: 0_level_0,Unnamed: 1_level_0,Item,Quantity
Index,Code,Unnamed: 2_level_1,Unnamed: 3_level_1
a,z1,Book,10
b,y2,Pen,9
c,x3,Ruler,5
d,w4,Bag,2


## Reading in using the `usecols` parameter

In [22]:
# Keep only the first two columns, chosen by position.
df_uc = pd.read_csv(
    'data/pf-col-index-switched-multi.csv',
    usecols=[0, 1],
)
df_uc

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [23]:
# Keep the same two columns, this time chosen by label.
df_uc = pd.read_csv(
    'data/pf-col-index-switched-multi.csv',
    usecols=['Item', 'Quantity'],
)
df_uc

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `squeeze` parameter
By default it is `False`. If set to `True`, and if only one column is read in, the result is returned as a **Series**.

In [24]:
# Baseline for the squeeze comparison: a single column read without `squeeze`
# is still a DataFrame.
df_sqd = pd.read_csv(
    'data/pf.csv',
    usecols=['Item'],
)
df_sqd

Unnamed: 0,Item
0,Book
1,Pen
2,Ruler
3,Bag


In [25]:
# Same single-column read, but squeeze=True returns it as a Series.
# NOTE(review): `squeeze` works on the pandas version shown at the top of this notebook;
# later pandas releases deprecate it in favour of calling `.squeeze('columns')` on the
# result - confirm before upgrading.
df_sqs = pd.read_csv(
    'data/pf.csv',
    usecols=['Item'],
    squeeze=True,
)
df_sqs

0     Book
1      Pen
2    Ruler
3      Bag
Name: Item, dtype: object

In [26]:
# Compare the container types produced without and with squeeze.
tuple(type(obj) for obj in (df_sqd, df_sqs))

(pandas.core.frame.DataFrame, pandas.core.series.Series)

## Reading in using the `prefix` parameter

In [27]:
# With header=None the columns would be labelled 0, 1; `prefix` turns them into pfc_0, pfc_1.
# NOTE(review): `prefix` is reported as deprecated in later pandas releases - confirm
# before upgrading from the version shown at the top of this notebook.
df = pd.read_csv(
    'data/pf-no-header.csv',
    header=None,
    prefix='pfc_',
)
df

Unnamed: 0,pfc_0,pfc_1
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `mangle_dupe_cols` parameter
By default, it is set to `True`: duplicate column names are made unique by appending a numeric suffix (e.g. `Quantity`, `Quantity.1`). According to the documentation, passing `False` causes data in duplicate columns to be overwritten; however, `False` is not actually supported by the implementation (pandas raises a `ValueError`).

In [28]:
# The file has two 'Quantity' columns; with mangle_dupe_cols=True the second one is
# renamed 'Quantity.1' so both are kept.
# NOTE(review): `mangle_dupe_cols` is reported as deprecated in later pandas releases -
# confirm before upgrading.
df = pd.read_csv(
    'data/pf-duplicate.csv',
    mangle_dupe_cols=True,
)
df

Unnamed: 0,Item,Quantity,Quantity.1
0,Book,10,13
1,Pen,9,4
2,Ruler,5,8
3,Bag,2,3


## Reading in using the `dtype` parameter

In [29]:
# Request an explicit dtype for each column instead of letting pandas infer them.
df = pd.read_csv(
    'data/pf-with-price.csv',
    dtype={
        'Item': str,
        'Quantity': int,
        'Price': float,
    },
)
df

Unnamed: 0,Item,Quantity,Price
0,Book,10,300.0
1,Pen,9,45.0
2,Ruler,5,50.0
3,Bag,2,500.0


In [30]:
# Inspect the resulting column dtypes of the frame read with an explicit `dtype` mapping.
df.dtypes

Item         object
Quantity      int64
Price       float64
dtype: object

## Reading in using the `engine` parameter
Parser engine to use. The C engine is faster, while the Python engine is currently more feature-complete.  
The output looks the same with either engine, because the choice only affects how the file is parsed behind the scenes. However, if you use a parameter that only the Python engine supports without setting `engine = 'python'`, pandas falls back to the Python engine and raises a `ParserWarning`.

In [38]:
# Explicitly select the C parser engine.
df = pd.read_csv(
    'data/pf.csv',
    engine='c',
)
df

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [39]:
# Explicitly select the Python parser engine; the resulting frame is the same.
df = pd.read_csv(
    'data/pf.csv',
    engine='python',
)
df

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


## Reading in using the `converters` parameter
Dictionary of functions for converting values in certain columns. Keys can either be integers or column labels.

In [31]:
def double(x):
    """Converter: parse ``x`` as an integer and return twice its value."""
    return 2 * int(x)

def square_root(x):
    """Converter: parse ``x`` as a number and return its square root, rounded to 2 decimals.

    The value is parsed with ``float`` rather than ``int`` so that decimal
    strings such as ``"45.50"`` are accepted; this converter is applied to the
    Price column, which the notebook reads as a float elsewhere. Whole-number
    inputs give the same result as before.

    Parameters
    ----------
    x : str or number
        Raw cell value handed over by ``read_csv``.

    Returns
    -------
    float
        ``sqrt(x)`` rounded to two decimal places.

    Raises
    ------
    ValueError
        If ``x`` cannot be parsed as a number.
    """
    value = float(x)
    return round(value ** 0.5, 2)

# Converter keys may be column labels or positions: 'Quantity' is addressed by label,
# and the key 2 addresses a column by position (the Price column in the output below).
df = pd.read_csv(
    'data/pf-with-price.csv',
    converters={
        'Quantity': double,
        2: square_root,
    },
)
df

Unnamed: 0,Item,Quantity,Price
0,Book,20,17.32
1,Pen,18,6.71
2,Ruler,10,7.07
3,Bag,4,22.36


## Reading in using the `skiprows` parameter

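Line numbers to skip (0-indexed), or the number of lines to skip at the start of the file. A callable can also be passed: it is called with each line number, and the line is skipped when it returns `True`.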

In [32]:
# An integer skips that many lines from the top: the extra first line is dropped,
# so 'Item,Quantity' becomes the header.
df = pd.read_csv(
    'data/pf-with-header.csv',
    skiprows=1,
)
df

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5
3,Bag,2


In [33]:
# A list skips specific line numbers: line 0 (the extra line) and line 2 (the 'Book' row).
df = pd.read_csv(
    'data/pf-with-header.csv',
    skiprows=[0, 2],
)
df

Unnamed: 0,Item,Quantity
0,Pen,9
1,Ruler,5
2,Bag,2


In [34]:
# A callable decides line by line: lines 0, 2 and 4 are skipped, leaving the header
# plus the 'Pen' and 'Bag' rows.
df = pd.read_csv(
    'data/pf-with-header.csv',
    skiprows=lambda line_no: line_no in [0, 2, 4],
)
df

Unnamed: 0,Item,Quantity
0,Pen,9
1,Bag,2


## Reading in using the `skipfooter` parameter
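This is an example where the `engine` parameter is useful. As the documentation says, `skipfooter` is the number of lines at the bottom of the file to skip (unsupported with `engine='c'`). Without `engine = 'python'`, pandas emits a `ParserWarning` and falls back to the Python engine; passing `engine = 'python'` explicitly avoids the warning.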

In [40]:
# `engine` is deliberately left unset to show what happens with the default engine:
# the last line is still dropped, and the truncated message in this cell's output
# appears to be the warning pandas emits in this situation.
df = pd.read_csv(
    'data/pf.csv',
    skipfooter=1,
)
df

  """Entry point for launching an IPython kernel.


Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5


In [41]:
# Same read with the Python engine requested explicitly: same result, no warning output.
df = pd.read_csv(
    'data/pf.csv',
    skipfooter=1,
    engine='python',
)
df

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
2,Ruler,5


## Reading in using the `nrows` parameter
Number of rows of file to read.

In [44]:
# Read only the first two data rows (the header line is not counted).
df = pd.read_csv(
    'data/pf.csv',
    nrows=2,
)
df

Unnamed: 0,Item,Quantity
0,Book,10
1,Pen,9
