## Imports

In [41]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [42]:
df = pd.read_csv('../data/fines.csv')
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## • iterations: in all the following subtasks, you need to calculate fines/refund*year for each row and create a new column with the calculated data and measure the time using the magic command %%timeit in the cell

#### loop: write a function that iterates through the dataframe using for i in range(0, len(df)), iloc and append() to a list, assign the result of the function to a new column in the dataframe

In [43]:
%%time
new_col = []
for i in range(0, len(df)):
    new_col.append(df.iloc[i]['Fines'] / (df.iloc[i]
                   ['Refund'] * df.iloc[i]['Year']))
df['Fines / Refund * Year'] = new_col

CPU times: user 254 ms, sys: 2.04 ms, total: 256 ms
Wall time: 259 ms


In [44]:
df = df.drop(['Fines / Refund * Year'], axis=1)

#### do it using iterrows()

In [45]:
%%time
new_col = []
for index, row in df.iterrows():
    new_col.append(row['Fines'] / (row['Refund'] * row['Year']))
df['Fines / Refund * Year'] = new_col

CPU times: user 41.3 ms, sys: 1.13 ms, total: 42.5 ms
Wall time: 42.6 ms


In [46]:
df = df.drop(['Fines / Refund * Year'], axis=1)

#### do it using apply() and lambda function

In [47]:
%%time
df['Fines / Refund * Year'] = df.apply(
    lambda row: row['Fines'] / (row['Refund'] * row['Year']), axis=1)

CPU times: user 14.9 ms, sys: 957 µs, total: 15.8 ms
Wall time: 16 ms


In [48]:
df = df.drop(['Fines / Refund * Year'], axis=1)

#### do it using Series objects from the dataframe

In [49]:
%%time
df['Fines / Refund * Year'] = df['Fines'] / (df['Refund'] * df['Year'])

CPU times: user 1.06 ms, sys: 205 µs, total: 1.27 ms
Wall time: 1.22 ms


In [50]:
df = df.drop(['Fines / Refund * Year'], axis=1)

#### do it as in the previous subtask but with the method .values

In [51]:
%%time
df['Fines / Refund * Year'] = df['Fines'].values / (df['Refund'].values * df['Year'].values)

CPU times: user 726 µs, sys: 175 µs, total: 901 µs
Wall time: 812 µs


In [52]:
df = df.drop(['Fines / Refund * Year'], axis=1)

## indexing: measure the time using the magic command %%timeit in the cell
#### get a row for a specific CarNumber, for example, ’O136HO197RUS’

In [53]:
%%time
df.loc[df['CarNumber'] == 'O136HO197RUS']

CPU times: user 1.15 ms, sys: 410 µs, total: 1.56 ms
Wall time: 1.73 ms


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
715,O136HO197RUS,2,7800.0,Toyota,Corolla,1999
902,O136HO197RUS,2,7800.0,Toyota,Corolla,1998


#### set the index in your dataframe with CarNumber

In [54]:
df = df.set_index('CarNumber')
df.head()

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989
E432XX77RUS,1,6500.0,Toyota,Camry,1995
7184TT36RUS,1,2100.0,Ford,Focus,1984
X582HE161RUS,2,2000.0,Ford,Focus,2015
92918M178RUS,1,5700.0,Ford,Focus,2014


#### again, get a row for the same CarNumber

In [55]:
%%time
df.loc['O136HO197RUS']

CPU times: user 632 µs, sys: 42 µs, total: 674 µs
Wall time: 637 µs


Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
O136HO197RUS,2,7800.0,Toyota,Corolla,1999
O136HO197RUS,2,7800.0,Toyota,Corolla,1998


## downcasting:
#### run df.info(memory_usage=’deep’), pay attention to the Dtype and the memory usage

In [56]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to H980KC77RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  930 non-null    int64  
 1   Fines   930 non-null    float64
 2   Make    930 non-null    object 
 3   Model   919 non-null    object 
 4   Year    930 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 228.7 KB


#### make a copy() of your initial dataframe into another dataframe optimized

In [57]:
optimized = df.copy()

#### downcast from float64 to float32 for all the columns

In [58]:
optimized['Fines'] = pd.to_numeric(optimized['Fines'], downcast='float')

## downcast from int64 to the smallest numerical dtype possible

In [59]:
optimized['Refund'] = pd.to_numeric(optimized['Refund'], downcast='integer')
optimized['Year'] = pd.to_numeric(optimized['Year'], downcast='integer')

#### run info(memory_usage=’deep’) for your new dataframe, pay attention to the Dtype and the memory usage

In [60]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to H980KC77RUS
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Refund  930 non-null    int8   
 1   Fines   930 non-null    float32
 2   Make    930 non-null    object 
 3   Model   919 non-null    object 
 4   Year    930 non-null    int16  
dtypes: float32(1), int16(1), int8(1), object(2)
memory usage: 213.3 KB


## categories:
#### change the object type columns to the type category

In [61]:
optimized = optimized.reset_index()
optimized['CarNumber'] = optimized['Make'].astype('category')
optimized['Make'] = optimized['Make'].astype('category')
optimized['Model'] = optimized['Model'].astype('category')

#### This time, check the memory usage, it probably has a decrease of 2-3 times compared to the initial dataframe

In [62]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CarNumber  930 non-null    category
 1   Refund     930 non-null    int8    
 2   Fines      930 non-null    float32 
 3   Make       930 non-null    category
 4   Model      919 non-null    category
 5   Year       930 non-null    int16   
dtypes: category(3), float32(1), int16(1), int8(1)
memory usage: 11.5 KB


## memory clean
#### using %reset_selective and the library gc clean the memory of your initial dataframe only

In [63]:
%reset_selective -f df

In [65]:
df

NameError: name 'df' is not defined