## Load the same CSV file 10X times faster and with 10X less memory

# 1. use cols:
Rather than loading data and removing unnecessary columns that aren’t useful when processing your data. load only the useful columns.

In [3]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("swiggy.csv")
df

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json
...,...,...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,--,Too Few Ratings,₹ 200,"Fast Food,Snacks",21522053000452,https://www.swiggy.com/restaurants/the-food-de...,"The Food Delight, 94MC+X35, New Singhania Naga...",Menu/553122.json
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,--,Too Few Ratings,₹ 300,Pizzas,license,https://www.swiggy.com/restaurants/maitri-food...,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",Menu/562647.json
148538,559435,Cafe Bella Ciao,Yavatmal,--,Too Few Ratings,₹ 300,"Fast Food,Snacks",21522251000378,https://www.swiggy.com/restaurants/cafe-bella-...,"Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",Menu/559435.json
148539,418989,GRILL ZILLA,Yavatmal,--,Too Few Ratings,₹ 250,Continental,21521251000241,https://www.swiggy.com/restaurants/grill-zilla...,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",Menu/418989.json


In [5]:
df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Columns: 11 entries, id to menu
dtypes: int64(1), object(10)
memory usage: 126.3 MB


In [6]:
df.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [7]:
req_cols = ['id', 'name', 'city','rating','cost', 'cuisine']
len(req_cols)

6

In [8]:
df = pd.read_csv("swiggy.csv", usecols=req_cols)

In [9]:
df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Columns: 6 entries, id to cuisine
dtypes: int64(1), object(5)
memory usage: 53.4 MB


## 2. Using correct dtypes for numerical data:
Every column has it’s own dtype in a pandas DataFrame, for example, integers have int64, int32, int16 etc…

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import re

In [2]:
df = pd.read_csv("swiggy.csv")

In [3]:
df['cost'].replace(to_replace ='₹ ', value = '', regex = True,inplace=True)

In [13]:
df['cost'].memory_usage(index=False, deep=True)

8908842

In [14]:
df['cost'] = df['cost'].astype(float)

In [4]:
df

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json
...,...,...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,--,Too Few Ratings,200,"Fast Food,Snacks",21522053000452,https://www.swiggy.com/restaurants/the-food-de...,"The Food Delight, 94MC+X35, New Singhania Naga...",Menu/553122.json
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,--,Too Few Ratings,300,Pizzas,license,https://www.swiggy.com/restaurants/maitri-food...,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",Menu/562647.json
148538,559435,Cafe Bella Ciao,Yavatmal,--,Too Few Ratings,300,"Fast Food,Snacks",21522251000378,https://www.swiggy.com/restaurants/cafe-bella-...,"Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",Menu/559435.json
148539,418989,GRILL ZILLA,Yavatmal,--,Too Few Ratings,250,Continental,21521251000241,https://www.swiggy.com/restaurants/grill-zilla...,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",Menu/418989.json


In [16]:
df['cost'].min()

1.0

In [17]:
df['cost'].max()

300350.0

In [18]:
df['cost'].memory_usage(index=False, deep=True)

1188328

In [19]:
(8908842-1188328)/8908842*100

86.66125182150498

## 3. Using correct dtypes for categorical columns:

In [20]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [21]:
df1 = pd.read_csv("Car Dekho.csv")
df1

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [22]:
df1['fuel'].memory_usage(index=False, deep=True)

273233

In [23]:
df1= pd.read_csv("Car Dekho.csv", dtype={"fuel": "category"})

In [24]:
df1['fuel'].memory_usage(index=False, deep=True)

4823

In [25]:
(273233-4823)/273233*100

98.23483986194933

In [26]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [27]:
df3 = pd.read_csv("swiggy.csv")
df3.isnull().sum()

id                0
name             86
city              0
rating           86
rating_count     86
cost            131
cuisine          99
lic_no          229
link              0
address          86
menu              0
dtype: int64

In [28]:
series = df3['rating_count']

In [29]:
series.memory_usage(index=False, deep=True)

10470707

In [30]:
len(series)

148541

In [31]:
len(series.dropna())

148455

In [32]:
sparse_series = series.astype("Sparse[str]")

In [33]:
len(sparse_series)

148541

In [34]:
sparse_series.memory_usage(index=False, deep=True)

11063839

In [35]:
(11063839-10470707)/11063839*100

5.360996305170385

# 4. nrows, skip rows

In [36]:
import pandas as pd
df4 = pd.read_csv("swiggy.csv", nrows=1000)
len(df4)

1000

In [37]:
#skiprows Line numbers to skip (0-indexed) or the number of lines to skip (int) at the start of the file.

In [38]:
# Can be either list or first N rows.
df5 = pd.read_csv('train.csv') 
# It might remove headings

In [39]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  float64
 5   SSN                       100000 non-null  float64
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  float64
 10  Num_Credit_Card           100000 non-null  float64
 11  Interest_Rate             100000 non-null  float64
 12  Num_of_Loan               100000 non-null  float64
 13  Type_of_Loan              100000 non-null  ob

In [40]:
sample = pd.read_csv("train.csv", nrows=100) # Load Sample data
dtypes = sample.dtypes # Get the dtypes
cols = sample.columns # Get the columns
dtype_dictionary = {} 
for c in cols:
    if str(dtypes[c]) == 'int64':
        dtype_dictionary[c] = 'float32' # Handle NANs in int columns
    else:
        dtype_dictionary[c] = str(dtypes[c])
# Load Data with increased speed and reduced memory.
df = pd.read_csv("train.csv", dtype=dtype_dictionary, 
                 keep_default_na=False, 
                 error_bad_lines=False,
                 na_values=['na',''])

TypeError: read_csv() got an unexpected keyword argument 'error_bad_lines'

## 5. Loading Data in Chunks:

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [None]:
df5 = pd.read_csv("swiggy.csv")

In [None]:
len(df5)

### Check the total length after loading chunk by chunk

In [None]:
df6 = pd.read_csv("swiggy.csv", chunksize=1000)

In [None]:
total_len = 0
for chunk in df6:
    # Do some preprocessing to reduce the memory size of each chunk
    total_len += len(chunk)
print(total_len)

### Concatnate each chunk one by one

In [None]:
tp = pd.read_csv('swiggy.csv', iterator=True, chunksize=1000)  # gives TextFileReader
df7 = pd.concat(tp, ignore_index=True)

In [None]:
len(df7)

# 6. Multiprocessing using pandas:

In [None]:
#we can utilize multiprocessinglibrary to handle chunk size operations asynchronously on multi-threads which can reduce the run time by half.

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import multiprocessing as mp

In [None]:
%%time
df = pd.read_csv("train.csv", chunksize=1000)
total_length = 0
for chunk in df:
    total_length += len(chunk)
print(total_length)

In [None]:
%%time
LARGE_FILE = "train.csv"
CHUNKSIZE = 1000 # processing 100,000 rows at a time

def process_frame(df):
        # process data frame
        return len(df)

if __name__ == '__main__':
    reader = pd.read_table(LARGE_FILE, chunksize=CHUNKSIZE)
    pool = mp.Pool(4)  # use 4 processes

    funclist = []
    for df in reader:
        # process each data frame
        f = pool.apply_async(process_frame, [df])
        funclist.append(f)

    result = 0
    for f in funclist:
        result += f.get()  # no timeout specified

    print(f"There are {result} rows of data")


In [None]:
if __name__ == '__main__':
    reader = pd.read_table(LARGE_FILE, chunksize=CHUNKSIZE)
    pool = mp.Pool(4)  # use 4 processes

    funclist = []
    for df in reader:
        # process each data frame
        f = pool.apply_async(process_frame, [df])
        funclist.append(f)

    result = 0
    for f in funclist:
        result += f.get()  # no timeout specified

    print(f"There are {result} rows of data")


## 7. Dask Instead of Pandas

In [2]:
%%time
import dask.dataframe as dd
data = dd.read_csv("train.csv",dtype={'MachineHoursCurrentMeter': 'float64'},assume_missing=True)
data.compute()

CPU times: total: 453 ms
Wall time: 452 ms


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634.0,3392.0,1.0,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.822620,265.0,No,49.574949,21.465380,High_spent_Small_value_payments,312.494089,Good
1,5635.0,3392.0,2.0,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.944960,266.0,No,49.574949,21.465380,Low_spent_Large_value_payments,284.629162,Good
2,5636.0,3392.0,3.0,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.465380,Low_spent_Medium_value_payments,331.209863,Good
3,5637.0,3392.0,4.0,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.465380,Low_spent_Small_value_payments,223.451310,Good
4,5638.0,3392.0,5.0,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.465380,High_spent_Medium_value_payments,341.489231,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,155625.0,37932.0,4.0,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,34.663572,378.0,No,35.104023,24.028477,High_spent_Large_value_payments,479.866228,Poor
99996,155626.0,37932.0,5.0,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,40.565631,379.0,No,35.104023,24.028477,High_spent_Medium_value_payments,496.651610,Poor
99997,155627.0,37932.0,6.0,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,41.255522,380.0,No,35.104023,24.028477,High_spent_Large_value_payments,516.809083,Poor
99998,155628.0,37932.0,7.0,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,33.638208,381.0,No,35.104023,24.028477,Low_spent_Large_value_payments,319.164979,Standard
