# Exercise 05 : Pandas optimizations

In [1]:
import pandas as pd
import gc

## Read the fines.csv that you saved in the previous exercise

In [2]:
df = pd.read_csv('../ex04/fines.csv')

## Iterations: in each of the following subtasks, calculate `Fines / Refund * Year` for every row, store the result in a new column, and measure the time with the `%%timeit` cell magic

- loop: write a function that iterates through the dataframe using `for i in range(0, len(df))`, `iloc`, and `append()` to a list; assign the result of the function to a new column in the dataframe
- do it using iterrows()
- do it using apply() and lambda function
- do it using Series objects from the dataframe
- do it as in the previous subtask but with the method .values

In [4]:
# Loop
def calculate_fine(row):
    """Return ``Fines / Refund * Year`` for a single record.

    ``row`` is any object that can be indexed with the keys 'Fines',
    'Refund' and 'Year' (for example one row of the dataframe).
    """
    fines_per_refund = row['Fines'] / row['Refund']
    return fines_per_refund * row['Year']

def calculate_fines_loop(df):
    """Fill ``df['Calculation']`` using a positional index loop.

    Every row is fetched with ``iloc``, passed to ``calculate_fine`` and the
    result is appended to a list. The finished list is assigned to the
    'Calculation' column of ``df`` itself; nothing is returned.
    """
    results = []
    row_count = len(df)
    for position in range(0, row_count):
        current_row = df.iloc[position]
        results.append(calculate_fine(current_row))
    df['Calculation'] = results

In [22]:
%%timeit

calculate_fines_loop(df)

74.2 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Iterrows
def calculate_fines_iterrows(df):
    """Fill ``df['Calculation']`` by walking the frame with ``iterrows()``.

    The index label produced by ``iterrows()`` is ignored; only the row is
    handed to ``calculate_fine``. The column is written on ``df`` itself and
    nothing is returned.
    """
    results = []
    for _, record in df.iterrows():
        results.append(calculate_fine(record))
    df['Calculation'] = results

In [23]:
%%timeit

calculate_fines_iterrows(df)

52.1 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Apply and lambda
def calculate_fines_apply(df):
    """Fill ``df['Calculation']`` with a row-wise ``DataFrame.apply``.

    ``axis=1`` makes ``apply`` call ``calculate_fine`` once per row. The
    resulting Series is stored on ``df`` itself; nothing is returned.
    """
    per_row_result = df.apply(calculate_fine, axis=1)
    df['Calculation'] = per_row_result

In [24]:
%%timeit

calculate_fines_apply(df)

10.3 ms ± 430 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Series objects
def calculate_fines_series(df):
    """Fill ``df['Calculation']`` with whole-column Series arithmetic.

    Computes ``Fines / Refund * Year`` on the three columns at once, with no
    Python-level loop over rows. The column is written on ``df`` itself and
    nothing is returned.
    """
    fines_per_refund = df['Fines'].div(df['Refund'])
    df['Calculation'] = fines_per_refund.mul(df['Year'])

In [25]:
%%timeit

calculate_fines_series(df)

306 µs ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
# Values
def calculate_fines_values(df):
    """Fill ``df['Calculation']`` using the columns' underlying arrays.

    Computes ``Fines / Refund * Year`` on the ``.values`` arrays of the three
    columns, so the arithmetic skips the Series layer. The column is written
    on ``df`` itself and nothing is returned.
    """
    fines = df['Fines'].values
    refund = df['Refund'].values
    year = df['Year'].values
    # Divide by Refund; taking the product of all three columns would
    # compute Fines * Refund * Year, which is not the required formula.
    df['Calculation'] = fines / refund * year

In [26]:
%%timeit

calculate_fines_values(df)

501 µs ± 8.25 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Indexing: measure the time using the magic command %%timeit in the cell

- get a row for a specific CarNumber, for example, `'O136HO197RUS'`
- set the index in your dataframe with CarNumber
- again, get a row for the same CarNumber

In [15]:
%%timeit

# Baseline lookup: compare the CarNumber column to the target value to build
# a boolean mask, then select the rows where the mask is True.
df[df['CarNumber'] == 'O136HO197RUS']

316 µs ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [19]:
# Make CarNumber the index of df so rows can be addressed by car number.
# NOTE(review): inplace=True modifies df itself; presumably re-running this
# cell fails because CarNumber is no longer a column — confirm before re-running.
df.set_index('CarNumber', inplace=True)

In [21]:
%%timeit

# Label-based lookup through the CarNumber index. Comparing df.index to the
# value would build a boolean mask over every label, just like the column
# comparison, so .loc is used instead. The list form always returns a
# DataFrame containing every row with that label (KeyError if it is absent).
df.loc[['O136HO197RUS']]

226 µs ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Downcasting:

- run `df.info(memory_usage='deep')` and pay attention to the Dtype and the memory usage
- make a copy() of your initial dataframe into another dataframe optimized

In [27]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to V555VV55RUS
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           925 non-null    float64
 1   Refund       930 non-null    float64
 2   Fines        930 non-null    float64
 3   Make         930 non-null    object 
 4   Model        919 non-null    object 
 5   Year         925 non-null    float64
 6   Calculation  925 non-null    float64
dtypes: float64(5), object(2)
memory usage: 211.0 KB


In [32]:
# Downcast the float64 columns to float32 on a separate copy, leaving df
# itself untouched, then report the memory footprint of the copy.
float32_columns = ['ID', 'Refund', 'Fines', 'Year', 'Calculation']
optimized = df.copy().astype({column: 'float32' for column in float32_columns})

optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to V555VV55RUS
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           925 non-null    float32
 1   Refund       930 non-null    float32
 2   Fines        930 non-null    float32
 3   Make         930 non-null    object 
 4   Model        919 non-null    object 
 5   Year         925 non-null    float32
 6   Calculation  925 non-null    float32
dtypes: float32(5), object(2)
memory usage: 192.8 KB


In [33]:
gc.collect()

1369