In [1]:
import pandas as pd
import numpy as np

## 1. Create Dataset for testing

In [2]:
def get_data(size = 10000):
    df  = pd.DataFrame()
    df['age'] = np.random.randint(0, 100, size)
    df['time_in_bed'] = np.random.randint(0, 9, size)
    df['pct_sleeping'] = np.random.rand(size)
    df['favorite_food'] = np.random.choice(['pizza', 'taco', 'ice-cream'], size)
    df['hate_food'] = np.random.choice(['broccoli', 'candy corn', 'eggs'], size)
    return df

#### Problem Statement

Reward calculation
- If they were in bed  for more than 5 hours and they were sleeping for more than 50% we give them their favorite food.
- Otherwise we give them hate food
- If they are over 90 years, give them favorite food regardless.
                            

In [3]:
def reward_calc(row):
    if row['age'] >= 90:
        return row['favorite_food']
    if (row['time_in_bed']>5) & (row['pct_sleeping']>0.5):
        return row['favorite_food']
    return row['hate_food']

#### Level 1- loop

In [4]:
%%timeit

df = get_data()

for index, row in df.iterrows():
    df.loc[index, 'reward'] = reward_calc(row)

1.36 s ± 34.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Level 2- apply

In [5]:
%%timeit

df = get_data()
df['reward'] = df.apply(reward_calc, axis = 1)

89.4 ms ± 937 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Level 3 - Vectorized

In [6]:
%%timeit

df = get_data()
df['reward'] = df['hate_food']

df.loc[((df['pct_sleeping'] > 0.5) & (df['time_in_bed']> 5)) | (df['age'] > 90) , 'reward'] = df['favorite_food']

6.07 ms ± 77.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 2. Efficient memory usage in Pandas

In [7]:
def get_dataset(size):
    df = pd.DataFrame()
    df['position'] = np.random.choice(['left', 'middle', 'right'], size)
    df['age'] = np.random.randint(1, 50, size)
    df['team'] = np.random.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = np.random.choice(['yes', 'no'], size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df

In [8]:
df = get_dataset(1_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int32  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int32(1), object(3)
memory usage: 34.3+ MB


In [9]:
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

705 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
772 ms ± 9.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
832 ms ± 6.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
df.head(5)

Unnamed: 0,position,age,team,win,prob,age_rank,prob_rank,win_prob_rank
0,middle,21,red,yes,0.304961,35117.0,25414.0,12855.0
1,right,22,yellow,no,0.23728,36768.5,19907.0,10039.0
2,middle,32,yellow,no,0.54907,54005.5,45572.0,22779.0
3,left,34,yellow,no,0.112064,56731.0,9289.0,4646.0
4,middle,18,red,yes,0.700816,29928.5,58547.0,29393.0


In [11]:
df = get_dataset(1_000_000)
df['position'] = df['position'].astype('category')
df['team'] = df['team'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int32   
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int32(1), object(1)
memory usage: 21.0+ MB


In [12]:
# Int Downcasting value range
df['age'] = df['age'].astype('int8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int8(1), object(1)
memory usage: 18.1+ MB


In [13]:
df['prob']

0         0.305274
1         0.230031
2         0.172582
3         0.686993
4         0.862370
            ...   
999995    0.946388
999996    0.962524
999997    0.176811
999998    0.537493
999999    0.856694
Name: prob, Length: 1000000, dtype: float64

In [14]:
# Downcasting of floats
df['prob'] = df['prob'].astype('float32')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float32 
dtypes: category(2), float32(1), int8(1), object(1)
memory usage: 14.3+ MB


In [15]:
## Cating bool(true/false)

df['win'] = df['win'].map({'yes': True, 'no':False})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  bool    
 4   prob      1000000 non-null  float32 
dtypes: bool(1), category(2), float32(1), int8(1)
memory usage: 7.6 MB


In [16]:
def set_dtypes(df):
    df['position'] = df['position'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int8')
    df['prob'] = df['prob'].astype('float32')
    df['win'] = df['win'].map({'yes': True, 'no':False})
    return df

In [17]:
df = get_dataset(1_000_000)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

704 ms ± 23.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
792 ms ± 39.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
830 ms ± 3.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
df = get_dataset(1_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team', 'position'], observed=False)['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'], observed=False)['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'], observed=False)['prob'].rank()

574 ms ± 76.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
814 ms ± 69.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
928 ms ± 27.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 3. Speed up your data processes

### CSV vs Pickle vs Parquet vs Feather

In [19]:
def get_dataset(size):
    # Create Fake data
    df = pd.DataFrame()
    df['size'] = np.random.choice(['big', 'medium', 'small'], size)
    df['age'] = np.random.randint(1, 50, size)
    df['team'] = np.random.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = np.random.choice(['yes', 'no'], size)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    df['date'] = np.random.choice(dates, size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df

In [20]:
def set_dtypes(df):
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int8')
    df['prob'] = df['prob'].astype('float16')
    df['win'] = df['win'].map({'yes': True, 'no':False})
    return df

#### CSV

In [21]:
%%timeit
df = get_dataset(1_000_000)
df.to_csv('test_csv.csv', index=False)

3.96 s ± 145 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
!ls -GFlash test_csv.csv

47M -rw-r--r-- 1 ybareddy 47M Aug 26 15:13 test_csv.csv


In [23]:
%%timeit
df = pd.read_csv('test_csv.csv')

631 ms ± 9.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
df = get_dataset(1_000_000)
df = set_dtypes(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int8          
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int8(1)
memory usage: 13.4 MB


In [25]:
df.to_csv('test_csv.csv', index=False)
df = pd.read_csv('test_csv.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   size    1000000 non-null  object 
 1   age     1000000 non-null  int64  
 2   team    1000000 non-null  object 
 3   win     1000000 non-null  bool   
 4   date    1000000 non-null  object 
 5   prob    1000000 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 39.1+ MB


In [26]:
df = pd.read_csv('test_csv.csv', dtype={'size':'category', 'int64':'int16', 'team':'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   size    1000000 non-null  category
 1   age     1000000 non-null  int64   
 2   team    1000000 non-null  category
 3   win     1000000 non-null  bool    
 4   date    1000000 non-null  object  
 5   prob    1000000 non-null  float64 
dtypes: bool(1), category(2), float64(1), int64(1), object(1)
memory usage: 25.7+ MB


### Pickle

In [27]:
df = get_dataset(1_000_000)
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle')

1.02 s ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
284 ms ± 8.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
!ls -GFlash test.pickle

39M -rw-r--r-- 1 ybareddy 39M Aug 26 15:14 test.pickle


In [29]:
df = get_dataset(1_000_000)
df = set_dtypes(df)
df.to_pickle('test.pickle')
df_pickle = pd.read_pickle('test.pickle')

In [30]:
df_pickle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int8          
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int8(1)
memory usage: 13.4 MB


### Parquet

In [31]:
# pip install pyarrow
# pip install fastparquet

In [33]:
df = get_dataset(1_000_000)
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet')

463 ms ± 6.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
171 ms ± 4.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
!ls -GFlash test.parquet

11M -rw-r--r-- 1 ybareddy 11M Aug 26 15:14 test.parquet


In [35]:
df_parquet = pd.read_parquet('test.parquet')
df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int32         
 2   team    1000000 non-null  object        
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(1), object(3)
memory usage: 42.0+ MB


In [36]:
df = get_dataset(1_000_000)
df = set_dtypes(df)
df.to_parquet('./data/test.parquet')
df_parquet = pd.read_parquet('./data/test.parquet')

df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int8          
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int8(1)
memory usage: 13.4 MB


In [37]:
df_example = pd.read_parquet('test.parquet', columns=['date', 'win'])

In [38]:
df_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   date    1000000 non-null  datetime64[ns]
 1   win     1000000 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 15.3+ MB


### Feather

In [39]:
df = get_dataset(1_000_000)
%timeit df.to_feather('test.feather')
%timeit df_feather = pd.read_feather('test.feather')

263 ms ± 13.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
133 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [40]:
!ls -GFlash test.feather

28M -rw-r--r-- 1 ybareddy 28M Aug 26 15:17 test.feather


### Comparision Conculsion

In [41]:
print('Reading and writing CSV')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_csv('test.csv')
%time df_csv = pd.read_csv('test.csv')

Reading and writing CSV
CPU times: total: 14.4 s
Wall time: 14.3 s
CPU times: total: 2.78 s
Wall time: 2.78 s


In [42]:
print('Reading and writing pickle')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_pickle('test.pickle')
%time df_pickle = pd.read_pickle('test.pickle')

Reading and writing pickle
CPU times: total: 46.9 ms
Wall time: 72 ms
CPU times: total: 15.6 ms
Wall time: 25 ms


In [43]:
print('Reading and writing Parquet')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_parquet('test.parquet')
%time df_parquet = pd.read_parquet('test.parquet')

Reading and writing Parquet
CPU times: total: 844 ms
Wall time: 763 ms
CPU times: total: 406 ms
Wall time: 136 ms


In [44]:
print('Reading and writing Feather')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_feather('test.feather')
%time df_feather = pd.read_feather('test.feather')

Reading and writing Feather
CPU times: total: 312 ms
Wall time: 207 ms
CPU times: total: 203 ms
Wall time: 104 ms


In [45]:
!ls -GFlash test.csv test.pickle test.parquet test.feather

220M -rw-r--r-- 1 ybareddy 220M Aug 26 15:17 test.csv
 37M -rw-r--r-- 1 ybareddy  37M Aug 26 15:17 test.feather
 22M -rw-r--r-- 1 ybareddy  22M Aug 26 15:17 test.parquet
 67M -rw-r--r-- 1 ybareddy  67M Aug 26 15:17 test.pickle


## 4. Read Giant Datasets Fast

In [46]:
%time data = pd.read_csv('Amazon best seller products.csv')

CPU times: total: 125 ms
Wall time: 102 ms


In [47]:
data.shape

(1000, 43)

In [48]:
data.columns

Index(['title', 'seller_name', 'brand', 'description', 'initial_price',
       'final_price', 'final_price_high', 'currency', 'availability',
       'reviews_count', 'categories', 'asin', 'buybox_seller',
       'number_of_sellers', 'root_bs_rank', 'ISBN10', 'answered_questions',
       'domain', 'images_count', 'url', 'video_count', 'image_url',
       'item_weight', 'rating', 'product_dimensions', 'seller_id', 'image',
       'date_first_available', 'discount', 'model_number', 'manufacturer',
       'department', 'plus_content', 'upc', 'video', 'top_review',
       'variations', 'delivery', 'features', 'buybox_prices', 'origin_url',
       'bs_rank', 'bs_rank_category'],
      dtype='object')

In [49]:
%time data = pd.read_csv('Amazon best seller products.csv', usecols=["title", "final_price", "seller_name"], chunksize = 100 )

CPU times: total: 15.6 ms
Wall time: 9 ms


## 5. Master Pandas: Boost Performance with These 3 Pro Tips

##### Technique 1 - Read What You Need
##### Technique 2 - Use Efficient Data Types
##### Technique 3 - Data Chunking

In [50]:
import pandas as pd
import tracemalloc
import json

# Setting to make numbers easier to read on display
pd.options.display.float_format = '{:20.2f}'.format

# Show all columns on output
pd.set_option('display.max_columns', 100)

In [51]:
df = pd.read_csv("./data/hotel_bookings.csv")

df.head(10)                 

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,0.0,0,FB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [52]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [53]:
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.37,104.01,2016.16,27.17,15.8,0.93,2.5,1.86,0.1,0.01,0.03,0.09,0.14,0.22,86.69,189.27,2.32,101.83,0.06,0.57
std,0.48,106.86,0.71,13.61,8.78,1.0,1.91,0.58,0.4,0.1,0.18,0.84,1.5,0.65,110.77,131.66,17.59,50.54,0.25,0.79
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.58,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [54]:
df.describe(include=[object])

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
count,119390,119390,119390,118902,119390,119390,119390,119390,119390,119390,119390,119390
unique,2,12,5,177,8,5,10,12,3,4,3,926
top,City Hotel,August,BB,PRT,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-10-21
freq,79330,13877,92310,48590,56477,97870,85994,74053,104641,89613,75166,1461


In [55]:
def data_cleaning(df):
    """ Fill blank and or missing values"""
    df = df.fillna(0)
    return df

In [56]:
def lead_time_filter(df, filter_value):
    """ Only keep the lead time values above the median"""
    filtered_df = df[df["lead_time"] >= filter_value].copy()
    return filtered_df

In [57]:
def extract(file_path):
    """ Extract the date from the csv file into a dataframe"""
    df = pd.read_csv("./data/hotel_bookings.csv")
    return df

In [58]:
def transform(df):
    """ Apply the transformation steps to the data"""
    df = data_cleaning(df)
    filtered_df = lead_time_filter(df, filter_value=50)
    return filtered_df

In [59]:
def load(filtered_df, output_path):
    """ Write the dataframes to csv files"""
    filtered_df[
        [
            "hotel",
            "arrival_date_year",
            "arrival_date_month",
            "arrival_date_day_of_month",
            "adults",
            "children",
            "babies",
            "customer_type",
            "is_canceled",
            "lead_time"
            
        ]
    ].to_csv(f"{output_path}/hotel_bookings_lead_timings.csv", index=False)

In [60]:
def main():
    tracemalloc.start()
    
    file_path = './data/hotel_bookings.csv'

    df = extract(file_path)
    
    filtered_df = transform(df)

    load(filtered_df, "./data/")

    current, peak = tracemalloc.get_traced_memory()

    print(f"Current memory usgae is {current / 10**6} MB; Peak was {peak / 10**6} MB")
    tracemalloc.stop()

In [61]:
main()

Current memory usgae is 49.103626 MB; Peak was 103.477503 MB


#### Technique 1 - Read What You Need

In [62]:
smaller_df = pd.read_csv(
    "./data/hotel_bookings.csv",
    usecols = [
        "hotel",
        "arrival_date_year",
        "arrival_date_month",
        "arrival_date_day_of_month",
        "adults",
        "children",
        "babies",
        "customer_type",
        "is_canceled",
        "lead_time"
    ]
    )
smaller_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,adults,children,babies,customer_type
0,Resort Hotel,0,342,2015,July,1,2,0.0,0,Transient
1,Resort Hotel,0,737,2015,July,1,2,0.0,0,Transient
2,Resort Hotel,0,7,2015,July,1,1,0.0,0,Transient
3,Resort Hotel,0,13,2015,July,1,1,0.0,0,Transient
4,Resort Hotel,0,14,2015,July,1,2,0.0,0,Transient


In [63]:
smaller_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   hotel                      119390 non-null  object 
 1   is_canceled                119390 non-null  int64  
 2   lead_time                  119390 non-null  int64  
 3   arrival_date_year          119390 non-null  int64  
 4   arrival_date_month         119390 non-null  object 
 5   arrival_date_day_of_month  119390 non-null  int64  
 6   adults                     119390 non-null  int64  
 7   children                   119386 non-null  float64
 8   babies                     119390 non-null  int64  
 9   customer_type              119390 non-null  object 
dtypes: float64(1), int64(6), object(3)
memory usage: 26.2 MB


## Technique 2 - Use Efficient Data Types

In [64]:
schema = {
    "hotel" : "category",
    "is_canceled" : "boolean",
    "lead_time" : "int16",
    "arrival_date_year" : "int16",   # Max: Year 2015
    "arrival_date_month": "category",
    "arrival_date_day_of_month": "int8",   # Max: 31
    "adults" : "int8",
    "children": "int8",
    "babies": "int8",
    "customer_type": "category"
}

In [65]:
smaller_df.fillna(0, inplace=True)
cast_df = smaller_df.astype(schema).copy()
cast_df.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,adults,children,babies,customer_type
0,Resort Hotel,False,342,2015,July,1,2,0,0,Transient
1,Resort Hotel,False,737,2015,July,1,2,0,0,Transient
2,Resort Hotel,False,7,2015,July,1,1,0,0,Transient
3,Resort Hotel,False,13,2015,July,1,1,0,0,Transient
4,Resort Hotel,False,14,2015,July,1,2,0,0,Transient


In [66]:
cast_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   hotel                      119390 non-null  category
 1   is_canceled                119390 non-null  boolean 
 2   lead_time                  119390 non-null  int16   
 3   arrival_date_year          119390 non-null  int16   
 4   arrival_date_month         119390 non-null  category
 5   arrival_date_day_of_month  119390 non-null  int8    
 6   adults                     119390 non-null  int8    
 7   children                   119390 non-null  int8    
 8   babies                     119390 non-null  int8    
 9   customer_type              119390 non-null  category
dtypes: boolean(1), category(3), int16(2), int8(4)
memory usage: 1.5 MB


In [67]:
def extract_what_you_need(file_path):
    """ Extract the date from the csv file into a dataframe"""
    df = pd.read_csv(
    "./data/hotel_bookings.csv",
    usecols = [
        "hotel",
        "arrival_date_year",
        "arrival_date_month",
        "arrival_date_day_of_month",
        "adults",
        "children",
        "babies",
        "customer_type",
        "is_canceled",
        "lead_time"
    ]
    )
    return df

In [68]:
def transform_with_efficient_dtypes(df, schema):
    """ Apply the transformation steps to the data"""
    df = data_cleaning(df)

    df = df.astype(schema)
    filtered_df = lead_time_filter(df, filter_value=50)
    return filtered_df

In [69]:
def main_with_tips():
    tracemalloc.start()

    # with open("./schema.json") as f:
    #     schema = json.load(f)
    #     f.close()
    
    file_path = './data/hotel_bookings.csv'

    df = extract_what_you_need(file_path)
    
    filtered_df = transform_with_efficient_dtypes(df, schema)

    load(filtered_df, "./data/")

    current, peak = tracemalloc.get_traced_memory()

    print(f"Current memory usgae is {current / 10**6} MB; Peak was {peak / 10**6} MB")
    tracemalloc.stop()

In [70]:
main_with_tips()

Current memory usgae is 11.045761 MB; Peak was 30.616083 MB


## Technique 3- Chunking your data

In [71]:
df_chunks = pd.read_csv("./data/hotel_bookings.csv", chunksize=1000)
df_chunks

<pandas.io.parsers.readers.TextFileReader at 0x2543f21f740>

In [72]:
next(df_chunks)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.00,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.00,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Resort Hotel,1,122,2015,August,33,9,2,4,2,0,0,HB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.00,,0,Transient,166.00,0,2,Canceled,2015-05-27
996,Resort Hotel,1,41,2015,August,33,9,2,4,2,0,0,BB,PRT,Online TA,TA/TO,0,0,0,E,E,0,No Deposit,240.00,,0,Transient,202.00,0,2,Canceled,2015-07-17
997,Resort Hotel,1,41,2015,August,33,9,2,4,2,0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.00,,0,Transient,172.00,0,2,Canceled,2015-07-17
998,Resort Hotel,0,81,2015,August,33,9,2,4,2,1,1,FB,ESP,Direct,Direct,0,0,0,C,C,0,No Deposit,250.00,,0,Transient,277.00,1,1,Check-Out,2015-08-15


In [73]:
next(df_chunks)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
1000,Resort Hotel,0,72,2015,August,33,9,2,5,2,0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,149.00,,0,Transient,133.00,0,1,Check-Out,2015-08-16
1001,Resort Hotel,1,68,2015,August,33,9,2,5,2,2,0,BB,PRT,Online TA,TA/TO,0,0,0,G,G,0,No Deposit,240.00,,0,Transient,214.00,0,0,Canceled,2015-06-03
1002,Resort Hotel,0,81,2015,August,33,9,2,5,2,0,0,HB,GBR,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,243.00,,0,Transient,130.00,0,0,Check-Out,2015-08-16
1003,Resort Hotel,1,72,2015,August,33,9,2,5,2,0,0,BB,PRT,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,240.00,,0,Transient,154.00,0,0,Canceled,2015-06-08
1004,Resort Hotel,0,68,2015,August,33,9,2,5,2,0,0,BB,GBR,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,26.00,,0,Transient,106.40,0,0,Check-Out,2015-08-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Resort Hotel,1,75,2015,September,39,26,1,1,2,0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.00,,0,Transient,73.35,0,1,Canceled,2015-07-17
1996,Resort Hotel,1,59,2015,September,39,26,1,1,2,0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,208.00,,0,Transient,74.00,0,0,Canceled,2015-08-24
1997,Resort Hotel,1,61,2015,September,39,26,2,1,2,0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.00,,0,Transient,79.00,0,0,Canceled,2015-08-27
1998,Resort Hotel,1,61,2015,September,39,26,2,1,2,0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.00,,0,Transient,79.00,0,2,Canceled,2015-08-27


In [74]:
def extract_with_chunksize(file_path):
    """ Extract the date from the csv file into a dataframe"""
    df_chunks = pd.read_csv(
    "./data/hotel_bookings.csv",
    usecols = [
        "hotel",
        "arrival_date_year",
        "arrival_date_month",
        "arrival_date_day_of_month",
        "adults",
        "children",
        "babies",
        "customer_type",
        "is_canceled",
        "lead_time"
    ],
    chunksize= 1000
    )
    return df_chunks

In [75]:
def load_with_chunks(filtered_df, output_path):
    """ Write the dataframes to csv files"""
    filtered_df.to_csv(f"{output_path}/hotel_bookings_lead_timings_tips.csv", index=False, mode='a')

In [76]:
def main_pro():
    tracemalloc.start()

    # with open("./schema.json") as f:
    #     schema = json.load(f)
    #     f.close()
    
    file_path = './data/hotel_bookings.csv'

    for df in extract_with_chunksize(file_path):
    
        filtered_df = transform_with_efficient_dtypes(df, schema)
    
        load_with_chunks(filtered_df, "./data/")

    current, peak = tracemalloc.get_traced_memory()

    print(f"Current memory usgae is {current / 10**6} MB; Peak was {peak / 10**6} MB")
    tracemalloc.stop()

In [77]:
main_pro()

Current memory usgae is 0.114446 MB; Peak was 1.342192 MB
