# Comparing Runtime Perfomance of Lists, Numpy, and Pandas in Python

In [22]:
import numpy as np
import pandas as pd

from functools import wraps
from time import time
from IPython.display import display

### Helper functions ###
def timeit(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        return f.__name__, te-ts
    return wrap

def display_results(results, size, row=None):
    if row is not None:
        print(f"Loop: size={size}, row={row}")
    else:
        print(f"Loop: size={size}")
    df = pd.DataFrame(results, columns=['method','time (s)']).sort_values('time (s)')
    df['time per iter (micro sec)'] = df['time (s)'] / size * 1e6
    display(df)
    return df

def run_methods(methods, size=10000, row=[1,2,3,4,5]):
    results = []
    for method in methods:
        results.append(method(size,row))
    _ = display_results(results,size, row)
    return results

def display_results_func(results,size):
    print(f"Loop: size={size}")
    df = pd.DataFrame(results, columns=['method','time (s)','func']).sort_values('time (s)')
    df['time per iter (micro sec)'] = df['time (s)'] / size * 1e6
    display(df)
    return df

def run_methods_w_funcs(methods, data, funcs, size):
    results = []
    for method in methods:
        for f in funcs:
            results.append(method(data.copy(),f) + (f.__name__,))
    _ = display_results_func(results,size)
    return results

### Methods to test ###

# List Append
@timeit
def list_append(size,row):
    test = []
    for i in range(size):
        test.append(row)
    return test


# List Append using list and not range
@timeit
def list_append_using_list(size,row):
    test = []
    for i in list(range(size)):
        test.append(row)
    return test

# List Comprehension
@timeit
def list_comp(size,row):
    test = [row for i in range(size)]
    return test

# List Repeat
@timeit
def list_repeat(size,row):
    test = [row] * size
    return test

# Numpy Pre-allocated
@timeit
def numpy_preallocated(size,row):
    test = np.empty((size,5))
    for i in range(size):
        test[i] = row
    return test

# Numpy Repeat
@timeit
def numpy_repeat(size,row):
    test = np.repeat(row, size).reshape(size,5)
    return test

# Pandas from list comp
@timeit
def pandas_from_list_comp(size,row):
    data = [row for i in range(size)]
    test = pd.DataFrame(data, columns=row)
    return test

# Pandas from list comp to numpy array
@timeit
def pandas_from_list_comp_to_array(size,row):
    data = [row for i in range(size)]
    test = pd.DataFrame(np.array(data), columns=row)
    return test

### Slow Methods, to be avoided in loops ###
# Numpy Append
@timeit
def numpy_append(size,row):
    test = np.empty((1,5))
    for i in range(size):
        test = np.append(test, row)
    return test

# Pandas Concat each row to initalized df
@timeit
def pandas_concat_to_existing(size, row):
    test = pd.DataFrame([dict(zip(row,row))])
    for i in range(1,size):
        df_row = pd.DataFrame([row], columns=row)
        test = pd.concat([test, df_row])
    return test


## Comparing runtime of data initalization using numeric data and mixed datatypes

In [23]:
### Compare methods ###

size_slow = 10000
methods_slow = [numpy_append,pandas_concat_to_existing]

size = 1000000
methods = [list_append, list_comp, list_append_using_list, list_repeat, numpy_preallocated, numpy_repeat, pandas_from_list_comp, pandas_from_list_comp_to_array]

row = [1.0,2,3,4,5]
print("### Numeric Datatypes ###")
run_methods(methods_slow, size=size_slow, row=row)
run_methods(methods, size=size, row=row)

row = [1,2,'3',4.0,5]
print("### Mixed Datatypes ###")
run_methods(methods_slow, size=size_slow, row=row)
run_methods(methods, size=size, row=row);

### Numeric Datatypes ###
Loop: size=10000, row=[1.0, 2, 3, 4, 5]


Unnamed: 0,method,time (s),time per iter (micro sec)
0,numpy_append,0.071442,7.144237
1,pandas_concat_to_existing,2.744574,274.457383


Loop: size=1000000, row=[1.0, 2, 3, 4, 5]


Unnamed: 0,method,time (s),time per iter (micro sec)
3,list_repeat,0.000371,0.000371
5,numpy_repeat,0.003387,0.003387
1,list_comp,0.012317,0.012317
0,list_append,0.015565,0.015565
2,list_append_using_list,0.026277,0.026277
7,pandas_from_list_comp_to_array,0.282418,0.282418
4,numpy_preallocated,0.308057,0.308057
6,pandas_from_list_comp,0.473881,0.473881


### Mixed Datatypes ###
Loop: size=10000, row=[1, 2, '3', 4.0, 5]


Unnamed: 0,method,time (s),time per iter (micro sec)
0,numpy_append,2.912435,291.243529
1,pandas_concat_to_existing,3.569462,356.946158


Loop: size=1000000, row=[1, 2, '3', 4.0, 5]


Unnamed: 0,method,time (s),time per iter (micro sec)
3,list_repeat,0.000407,0.000407
1,list_comp,0.012494,0.012494
0,list_append,0.015229,0.015229
2,list_append_using_list,0.025182,0.025182
5,numpy_repeat,0.130679,0.130679
4,numpy_preallocated,0.371661,0.371661
6,pandas_from_list_comp,0.404965,0.404965
7,pandas_from_list_comp_to_array,1.098206,1.098206


### Conclusions on data lopping
Observations:
* List comprehension and append are both fast, but comprehension tends to be a little faster
* Using numpy to iterate over each row in a loop is noteably slower than built-in python lists
* Numpy got noticeably slower on repeated operations with strings
* Pandas is the slowest due to generating the dataframe

Conclusions:
* Seriously avoid numpy append and pandas concat in a loop
* Use numpy when working with numeric data
* Use Pandas is the slowest, but is easier to work with manipul 
* For mixed data types use lists
* For numeric data lists are stil the fastest, but if math operations are going to be performed use numpy
* Use Pandas if you need to view and manipulate data and don't need optimal performance

## Comparing applying functions and data manipulation

In [24]:
### Operations ###
def join_row_to_str(row):
    return ''.join(map(str, row))

def sum_row_as_numeric(row):
    res = 0
    for i in row:
        res += float(i)
    return res
def simple_assign(row):
    return 12345

### Applying operations one by one ###
@timeit
def list_loop(data, f):
    vals = [f(row_i) for row_i in data]
    new_data = list(zip(data, vals))
    # return new_data

@timeit
def df_apply(df,f):
    vals = df.apply(f, axis=1)
    df['vals'] = vals
    # print(df)
    # return df

@timeit
def df_iterrows(df,f): 
    vals = []
    for index, row in df.iterrows():
        vals.append(f(row))
    df['vals'] = vals
    # print(df)

### Utilizing vectorization for operations ###
@timeit
def df_assign(df):
    df['vals'] = 12345

@timeit
def df_add_cols_as_str(df):
    df["vals"] = df.astype(str).sum(axis=1)

@timeit
def df_add_cols_as_float(df):
    df["vals"] = df.astype(float).sum(axis=1)

def run_methods_w_df(methods, df):
    results = []
    for method in methods:
        results.append(method(df.copy()))
    _ = display_results(results, size)
    return results

In [26]:
### Comparing data manipulation ###
size_compare = 100000

df_methods = [df_apply, df_iterrows]
list_methods = [list_loop]
funcs = [join_row_to_str, sum_row_as_numeric, simple_assign]

print("### Applying operations one by one ###") 
print("### Mixed Datatypes ###")
row = [1,2,'3',4.0,5]
data = [row for i in range(size_compare)]
df = pd.DataFrame(data, columns=row)
run_methods_w_funcs(df_methods, df, funcs, size_compare)
run_methods_w_funcs(list_methods, data, funcs, size_compare)

print("### Applying Operations with Vectorization ###")
data = [row for i in range(size_compare)]
df = pd.DataFrame(data, columns=row)

df_vec_methods = [df_assign, df_add_cols_as_str, df_add_cols_as_float]

run_methods_w_df(df_vec_methods, df)


### Applying operations one by one ###
### Mixed Datatypes ###
Loop: size=100000


Unnamed: 0,method,time (s),func,time per iter (micro sec)
2,df_apply,0.15209,simple_assign,1.520901
1,df_apply,0.299505,sum_row_as_numeric,2.995048
0,df_apply,0.362804,join_row_to_str,3.628039
5,df_iterrows,2.045159,simple_assign,20.451589
4,df_iterrows,2.612855,sum_row_as_numeric,26.128554
3,df_iterrows,2.64579,join_row_to_str,26.457901


Loop: size=100000


Unnamed: 0,method,time (s),func,time per iter (micro sec)
2,list_loop,0.061324,simple_assign,0.613236
1,list_loop,0.082139,sum_row_as_numeric,0.821395
0,list_loop,0.1241,join_row_to_str,1.241004


### Applying Operations with Vectorization ###
Loop: size=1000000


Unnamed: 0,method,time (s),time per iter (micro sec)
0,df_assign,0.000281,0.000281
2,df_add_cols_as_float,0.010552,0.010552
1,df_add_cols_as_str,0.120099,0.120099


[('df_assign', 0.00028061866760253906),
 ('df_add_cols_as_str', 0.12009906768798828),
 ('df_add_cols_as_float', 0.010551691055297852)]

Observations:
* Not shown here, but numerical vs mixed data types did not seem to affect runtime significantly for these operations
* Lists are the fastest one by one applying method if vectorization can't be utilized
* df.apply() is slower than list comprehension by approximately 3x
* Vectorization with pandas is the fastest, since it is built on top of numpy, anywhere from 10-100x faster than

Conclusions on data manipulation
* Avoid iterating over rows in a dataframe, it 8-10x slower than using df.apply()
* If vectorization can be applied, that will net 10-1000+x improvements over using a list
* Stick to lists if vectorization can't be applied


## Data manipulation using a practical example

Calculating the distance along the surface of the earth between two lat,lon points

In [30]:
# from math import radians, cos, sin, asin, sqrt
from numpy import radians, cos, sin, arcsin, sqrt
from numba import jit

# Haversine formula
# Original code: https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine(lat1,lon1,lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """

    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * arcsin(sqrt(a)) 

    # 6367 km is the radius of the Earth
    km = 6367 * c
    return km

def haversine_vectorized(lat1,lon1,lat2, lon2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = radians(lon1), radians(lat1), radians(lon2), radians(lat2)
 
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * arcsin(sqrt(a)) 

    # 6367 km is the radius of the Earth
    km = 6367 * c

    return km

@timeit
def apply_haversine(df):
    df['haversine'] = df.apply(lambda x: haversine(*x), axis=1)
    return df

@timeit
def add_haversine_w_vec(df):
    df['haversine'] = haversine_vectorized(df['lat1'],df['lon1'],df['lat2'],df['lon2'])
    return df

@timeit
def list_haversine(data):
    vals = [haversine(*row) for row in data]
    new_data = list(zip(data, vals))
    return new_data

# Generate df
d = {
    'lat1': {0: '31.215379379000467',
    1: '34.22133455500045',
    2: '34.795039606000444',
    3: '31.292159523000464',
    4: '31.69311635000048',
    5: '33.595265517000485',
    6: '34.44060759100046',
    7: '33.254429322000476',
    8: '33.50314015000049',
    9: '34.74643089500046'},
    'lon1': {0: ' -85.36146587999968',
    1: ' -86.15937514799964',
    2: ' -87.68507485299966',
    3: ' -86.25539902199966',
    4: ' -86.26549483099967',
    5: ' -86.66531866799966',
    6: ' -85.75726760699968',
    7: ' -86.81407933399964',
    8: ' -86.80242858299965',
    9: ' -87.69893502799965'}
    }
df = pd.DataFrame(d).astype(float)
df = pd.concat([df]*50000,ignore_index=True)
np.random.seed(123)
rand_dlat = np.random.randint(5, 35, size=len(df))
df['lat2'] = df['lat1']+rand_dlat
rand_dlon = np.random.randint(20, 50, size=len(df))
df['lon2'] = df['lon1']+rand_dlon
data = df.values

print("### Comparing apply, list and vectorization ###")
results = [apply_haversine(df.copy()),list_haversine(data.copy()),add_haversine_w_vec(df.copy())]
display_results(results, size=len(df))


# Conclusions
# - if functions can be vectorized with numpy it is much faster than apply by 100x-200x
# - Using a list comprehension is slightly faster than apply, but not significantly
# - Numba does make non-vecorized functions faster, in this case 2-5x
# - Numba makes vecorized functions much slower, in fact it is an order of magnitude slower (not shown here)
# - Try to write code that can easily be vectorized, if not try numba if speed is important.

### Comparing apply, list and vectorization ###
Loop: size=500000


Unnamed: 0,method,time (s),time per iter (micro sec)
2,add_haversine_w_vec,0.024404,0.048807
1,list_haversine,3.303773,6.607547
0,apply_haversine,4.583332,9.166664


Unnamed: 0,method,time (s),time per iter (micro sec)
2,add_haversine_w_vec,0.024404,0.048807
1,list_haversine,3.303773,6.607547
0,apply_haversine,4.583332,9.166664


## Quickly Investigating Numba

In [31]:

from numba import jit

@jit(nopython=True)
def haversine(lat1,lon1,lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """

    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * arcsin(sqrt(a)) 

    # 6367 km is the radius of the Earth
    km = 6367 * c
    return km

print("### With Numba ###")
results = [apply_haversine(df.copy()),list_haversine(data.copy()),add_haversine_w_vec(df.copy())]
display_results(results, size=len(df));


### With Numba ###
Loop: size=500000


Unnamed: 0,method,time (s),time per iter (micro sec)
2,add_haversine_w_vec,0.026644,0.053288
1,list_haversine,0.64926,1.29852
0,apply_haversine,1.818008,3.636016
