# Pandas read_csv() tricks

This notebook accompanies the Medium article [All the Pandas read_csv() you should know to speed up your data analysis](https://medium.com/@bindiatwork/all-the-pandas-read-csv-you-should-know-to-speed-up-your-data-analysis-1e16fe1039f3)

Please check out the article for instructions.

**License**: [BSD 2-Clause](https://opensource.org/licenses/BSD-2-Clause)

In [1]:
import pandas as pd
import numpy as np

### 1. Encoding

In [2]:
# Build a small frame containing Chinese text and save it GB2312-encoded
# (i.e. deliberately not UTF-8) so the next cells have a file to decode.
df = pd.DataFrame({
    'name': '一 二 三 四'.split(),
    'number': [2, 0, 2, 3],
})

df.to_csv('data/data_1.csv', index=False, encoding='gb2312')

In [3]:
# Read it with the default encoding (UTF-8).
# The file was written as gb2312, so decoding fails. The error is the point of
# this cell, so catch it and show the message instead of halting "Run All".
try:
    pd.read_csv('data/data_1.csv')
except UnicodeDecodeError as err:
    print(f'{type(err).__name__}: {err}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb6 in position 0: invalid start byte

In [4]:
# Decode with the same codec the file was written in (gb2312)
pd.read_csv(
    'data/data_1.csv',
    encoding='gb2312',
)

Unnamed: 0,name,number
0,一,2
1,二,0
2,三,2
3,四,3


### 2. Dealing with headers

In [5]:
# header=None: treat every line as data; columns get integer labels (0, 1, 2, ...)
pd.read_csv(
    'data/data_2_no_headers.csv',
    header=None,
)

Unnamed: 0,0,1,2,3
0,a,10,5,1
1,b,20,12,2
2,c,30,20,3
3,d,40,30,4


In [6]:
# header=1: take the column names from the row at index 1 (the second row)
pd.read_csv(
    'data/data_2.csv',
    header=1,
)

Unnamed: 0,product,price,cost,profit
0,a,10,5,1
1,b,20,12,2
2,c,30,20,3
3,d,40,30,4


### 3. Dealing with columns

In [7]:
# usecols with labels: load only the 'product' and 'cost' columns
pd.read_csv('data/data_2.csv', header=1, usecols=['product', 'cost'])

Unnamed: 0,product,cost
0,a,5
1,b,12
2,c,20
3,d,30


In [8]:
# usecols with positions: load only the first two columns (indices 0 and 1)
pd.read_csv('data/data_2.csv', header=1, usecols=[0, 1])

Unnamed: 0,product,price
0,a,10
1,b,20
2,c,30
3,d,40


### 4. Parsing date columns

In [9]:
# Baseline: without parse_dates the 'date' column is loaded as a plain
# object (string) column -- see the Dtype column in the info() summary.
df = pd.read_csv('data/data_3.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     4 non-null      object
 1   product  4 non-null      object
 2   price    4 non-null      int64 
 3   cost     4 non-null      int64 
 4   profit   4 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 288.0+ bytes


In [10]:
# parse_dates=['date'] converts the 'date' column to datetime64[ns] while loading
df = pd.read_csv('data/data_3.csv', parse_dates=['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     4 non-null      datetime64[ns]
 1   product  4 non-null      object        
 2   price    4 non-null      int64         
 3   cost     4 non-null      int64         
 4   profit   4 non-null      int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 288.0+ bytes


In [11]:
# Display the frame loaded with the parsed 'date' column
df

Unnamed: 0,date,product,price,cost,profit
0,2019-01-01,A,10,5,1
1,2019-01-02,B,20,12,2
2,2019-01-03,C,30,20,3
3,2019-01-04,D,40,30,4


In [12]:
# A nested list tells pandas to merge the 'year', 'month' and 'day' columns
# into one datetime column, which it names 'year_month_day'.
# NOTE(review): combining columns via parse_dates appears to be deprecated in
# recent pandas releases -- confirm against the pandas version in use.
df = pd.read_csv(
    'data/data_4.csv',
    parse_dates=[['year', 'month', 'day']],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   year_month_day  4 non-null      datetime64[ns]
 1   product         4 non-null      object        
 2   price           4 non-null      int64         
 3   cost            4 non-null      int64         
 4   profit          4 non-null      int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 288.0+ bytes


In [13]:
# Dict form: the key ('date') names the combined column, the value lists the
# source columns that are merged into it.
df = pd.read_csv(
    'data/data_4.csv',
    parse_dates={'date': ['year', 'month', 'day']},
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     4 non-null      datetime64[ns]
 1   product  4 non-null      object        
 2   price    4 non-null      int64         
 3   cost     4 non-null      int64         
 4   profit   4 non-null      int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 288.0+ bytes


In [14]:
# Two combined date columns in one read: each dict key is the new column name,
# each value the year/month/day source columns it is built from.
date_groups = {
    'DoB': ['dob_year', 'dob_month', 'dob_day'],
    'JoinDate': ['join_year', 'join_month', 'join_day'],
}
df = pd.read_csv('data/data_5.csv', parse_dates=date_groups)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DoB       4 non-null      datetime64[ns]
 1   JoinDate  4 non-null      datetime64[ns]
 2   name      4 non-null      object        
 3   sex       4 non-null      object        
dtypes: datetime64[ns](2), object(2)
memory usage: 256.0+ bytes


In [15]:
# Display the frame with the combined 'DoB' and 'JoinDate' columns
df

Unnamed: 0,DoB,JoinDate,name,sex
0,2000-01-01,2019-01-01,A,F
1,2002-01-02,2017-02-03,B,F
2,1985-01-03,2015-04-10,C,M
3,1988-01-04,2016-01-02,D,M


In [16]:
# Customizing a date parser
from datetime import datetime


def custom_date_parser(x):
    """Parse a 'YYYY MM DD HH:MM:SS' string (space-separated date) into a datetime."""
    return datetime.strptime(x, "%Y %m %d %H:%M:%S")


# NOTE(review): `date_parser` appears to be deprecated in pandas 2.x in favour
# of `date_format` -- confirm against the pandas version in use.
df = pd.read_csv(
    'data/data_6.csv',
    parse_dates=['date'],
    date_parser=custom_date_parser,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
 1   name    3 non-null      object        
dtypes: datetime64[ns](1), object(1)
memory usage: 176.0+ bytes


In [17]:
# Display the frame whose 'date' column was parsed with the custom parser
df

Unnamed: 0,date,name
0,2016-06-10 20:30:00,A
1,2016-07-11 19:45:30,B
2,2013-10-12 04:05:01,C


### 5. Setting data type

In [18]:
# Declare the column types up front instead of letting pandas infer them
column_types = {'Name': str, 'Grade': int}
df = pd.read_csv('data/data_7.csv', dtype=column_types)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    112 non-null    object
 1   Grade   112 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.9+ KB


In [19]:
# Display the typed frame (the notebook truncates the middle rows of the output)
df

Unnamed: 0,Name,Grade
0,A1,100
1,A2,56
2,A3,78
3,A4,58
4,A5,56
...,...,...
107,A108,57
108,A109,79
109,A110,62
110,A111,67


### 6. Finding and locating invalid values

In [20]:
# Forcing 'Grade' to int fails because data_8.csv contains non-numeric grades:
#   ValueError: invalid literal for int() with base 10: 'a'
# The failure is the demonstration, so report it rather than stopping "Run All".
try:
    df = pd.read_csv('data/data_8.csv',
                     dtype={
                         'Name': str,
                         'Grade': int
                     })
except ValueError as err:
    print(f'{type(err).__name__}: {err}')
else:
    # Only reached if the file unexpectedly parses cleanly
    df.info()

ValueError: invalid literal for int() with base 10: 'a'

In [21]:
# Find and locate the offending rows: load without forcing dtypes, then flag
# every 'Grade' value that cannot be converted to a number.
df = pd.read_csv('data/data_8.csv')

# errors='coerce' turns unparseable values into NaN, so isna() marks them.
# (Grades that are missing in the file itself would be flagged as well.)
is_error = pd.to_numeric(df['Grade'], errors='coerce').isna()

df.loc[is_error]

Unnamed: 0,Name,Grade
7,A8,a
15,A16,57b
26,A27,91c


### 7. Appending data to a CSV file

In [22]:
# One-row frame holding the record to add
new_record = pd.DataFrame(
    [['New name', pd.to_datetime('today')]],
    columns=['Name', 'Date'],
)

# mode='a' appends to the existing file; the header and index are left out so
# only the data row is written.
new_record.to_csv(
    'data/existing_data.csv',
    mode='a',
    header=None,
    index=False,
)

### 8. Loading a huge CSV file with chunksize

In [35]:
# Make up a huge dataset: four batches of `nums` rows (one per name) written
# to the same CSV file.
nums = 100_000

for name in 'a b c d'.split():
    df = pd.DataFrame({
        'col_1': [1]*nums,
        # NOTE: unseeded, so the generated values differ on every run
        'col_2': np.random.randint(100, 2000, size=nums)
    })

    df['name'] = name

    # The first batch creates/overwrites the file and writes the header; the
    # remaining batches append without one. Opening the first batch in 'w'
    # mode keeps this cell idempotent: with mode='a' throughout, re-running it
    # stacked another 400k rows (and a second header line) onto the old file.
    is_first = name == 'a'
    df.to_csv('data/big_file.csv',
              mode='w' if is_first else 'a',
              index=False,
              header=is_first)

In [36]:
# Stream the file in 50,000-row chunks instead of loading it all at once
dfs = pd.read_csv(
    'data/big_file.csv',
    chunksize=50_000,
    dtype={'col_1': int, 'col_2': int, 'name': str},
)

# Per-chunk count and sum of col_2 for each name
res_dfs = [
    chunk.groupby('name')['col_2'].agg(['count', 'sum'])
    for chunk in dfs
]

# A name can span several chunks, so add the partial results together by name
pd.concat(res_dfs).groupby(level=0).sum()

Unnamed: 0_level_0,count,sum
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,100000,105286017
b,100000,104859941
c,100000,105018197
d,100000,104986218


In [37]:
# Validate the result against the solution without chunks:
# read the whole file in one go and run the same aggregation.
(
    pd.read_csv(
        'data/big_file.csv',
        dtype={'col_1': int, 'col_2': int, 'name': str},
    )
    .groupby('name')['col_2']
    .agg(['count', 'sum'])
)

Unnamed: 0_level_0,count,sum
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,100000,105286017
b,100000,104859941
c,100000,105018197
d,100000,104986218


[1, 1]