# Chapter 4

This notebook contains the commands that are shown in the lecture 3.

In [1]:
import numpy as np
import scipy
import pandas as pd
import time

## Different resources involved in data analysis pipelines

### Processors as a resource

In [2]:
n_zeros = 10000
ntimes = 1000

z = np.zeros(n_zeros)

time_for_1 = time.time()
for t in range(ntimes):
    for i in range(n_zeros):
        z[i] = z[i] + 1
time_for_2 = time.time()

time_for = time_for_2-time_for_1

z = np.zeros(n_zeros)

time_vec_1 = time.time()
for t in range(ntimes):
    z = z + 1
time_vec_2 = time.time()

time_vec = time_vec_2-time_vec_1

print("""
Time taken:

For loop: %.2g
Vectorized operation: %.2g

Speedup: %.0f
""" % (time_for, time_vec, time_for/time_vec))


Time taken:

For loop: 4.6
Vectorized operation: 0.0056

Speedup: 815



### RAM as a resource

Bootstrapping pipeline from chapter 3.

In [3]:
def load_filesizes(filesizes_file):
    filesizes = pd.read_table(filesizes_file, sep='\s+', names=['Bytes','MonthsTo2021', 'Files'])
    
    # Remove empty files
    filesizes = filesizes[filesizes.loc[:,'Bytes'] != 0]
    # Create a column for log2 of bytes
    filesizes['BytesLog2'] = np.log2(filesizes.loc[:, 'Bytes'])
    filesizes.loc[:,'BytesLog2'] = filesizes.loc[:,'BytesLog2'].astype(np.int64)
    # Determine total space S used by N files of size X during date D: S=N*X 
    filesizes['SpaceUsage'] = filesizes.loc[:,'Bytes']*filesizes.loc[:,'Files']
    # Determine file year and month from the MonthsTo2021-column
    filesizes['TotalMonths'] = 2021*12 - filesizes['MonthsTo2021'] - 1
    filesizes['Year'] = filesizes['TotalMonths'] // 12
    filesizes['Month'] = filesizes['TotalMonths'] % 12 + 1
    filesizes['Day'] = 1
    
    # Set year for really old files and files with incorrect timestamps
    invalid_years = (filesizes['Year'] < 2010) | (filesizes['Year'] > 2020)
    filesizes.loc[invalid_years, ['Year','Month']] = np.NaN
    
    # Get month names for the correct ordering of Month categories
    month_names = pd.date_range(start='2000-01', freq='M', periods=12).month_name()
    # Create Date
    filesizes['Date'] = pd.to_datetime(filesizes[['Year', 'Month', 'Day']])
    # Set Month
    filesizes['Month'] = pd.Categorical(filesizes['Date'].dt.month_name(), categories=month_names, ordered=True)
    # Set Month to be an ordered categorical with predefined levels 
    filesizes['Month'] = pd.Categorical(filesizes['Month'], categories=month_names, ordered=True)
    # Sort data based on Date and BytesLog2
    filesizes.sort_values(['Date','BytesLog2'], inplace=True)
    # Remove old columns
    filesizes.drop(['MonthsTo2021','TotalMonths', 'Day'], axis=1, inplace=True)
    return filesizes

def aggregate_filesize_data(data, groupings, targets, agg_function):
    # Drop rows with NaNs (invalid years)
    data_relevant = data.dropna(axis=0)
    # Pick relevant columns
    data_relevant = data_relevant.loc[:, groupings + targets]
    # Change grouping to category for prettier plotting
    data_relevant[groupings] = data_relevant[groupings].astype('category')

    # Aggregate data
    data_aggregated = data_relevant.groupby(groupings).agg(agg_function).reset_index()
    return data_aggregated

def get_bootstrapped_means(dataset, target_col=None, weight_col=None, n_means=1000):
    # Pick relevant columns
    df = dataset[[target_col, weight_col]].copy()
    # Pick target data column
    target_data = df[target_col]
    # Pick weight data column
    weight_data = df[weight_col]
    # Fill zeros to those byte sizes that are not present in the Files-data
    weight_data.fillna(0, inplace=True)
    # Normalize weight_data into probabilities
    weight_data = weight_data/weight_data.sum()
    
    # Create means vector
    means = np.zeros(n_means, dtype=np.float64)
    for i in range(n_means):
        # Calculate resampled mean
        means[i] = np.mean(np.random.choice(target_data, 100, replace=True, p=weight_data))

    return means

def bootstrap_byteslog2_mean(dataset, group_variable, target_variable, n_means=1000):

    bootstrapping_function = lambda x: get_bootstrapped_means(x, 'BytesLog2', target_variable, n_means=n_means)

    bootstrapped_means = dataset.groupby(group_variable).apply(lambda x: pd.Series({'data': x}))
    bootstrapped_means['SampledMeans'] = bootstrapped_means['data'].apply(bootstrapping_function)
    bootstrapped_means['Mean'] = bootstrapped_means['SampledMeans'].apply(np.mean)
    bootstrapped_means.drop('data', axis=1, inplace=True)
    return bootstrapped_means


In [4]:
def chapter3_pipeline(n_means=10000):
    
    filesizes = load_filesizes('../data/filesizes_timestamps.txt')

    yearly_bytes_sum = aggregate_filesize_data(filesizes, ['Year','BytesLog2'], ['Files', 'SpaceUsage'], 'sum')

    bootstrapped_yearly_means = bootstrap_byteslog2_mean(yearly_bytes_sum, 'Year', 'Files', n_means=n_means)
    
    bootstrapped_yearly_means = bootstrapped_yearly_means.reset_index()[['Year','Mean']]

    return bootstrapped_yearly_means

chapter3_pipeline(n_means=100).head()

Unnamed: 0,Year,Mean
0,2010.0,12.97741
1,2011.0,14.04096
2,2012.0,10.67631
3,2013.0,13.41275
4,2014.0,14.05066


In [5]:
filesizes = load_filesizes('../data/filesizes_timestamps.txt')
yearly_bytes_sum = aggregate_filesize_data(filesizes, ['Year','BytesLog2'], ['Files', 'SpaceUsage'], 'sum')

print(filesizes.memory_usage(deep=True))
print(yearly_bytes_sum.memory_usage(deep=True))

filesizes_size = filesizes.memory_usage(deep=True).sum()
summarized_size = yearly_bytes_sum.memory_usage(deep=True).sum()
print("""
Original data: %d bytes
Summarized data: %d bytes

Reduction ratio: %.2f
""" % (filesizes_size, summarized_size, filesizes_size/summarized_size))

Index         69520
Bytes         69520
Files         69520
BytesLog2     69520
SpaceUsage    69520
Year          69520
Month          9768
Date          69520
dtype: int64
Index          128
Year           881
BytesLog2     2097
Files         3784
SpaceUsage    3784
dtype: int64

Original data: 496408 bytes
Summarized data: 10674 bytes

Reduction ratio: 46.51



In [6]:
def memory_scope_test():
    
    memory_scope_variable = np.random.random(1000)
    print(memory_scope_variable.nbytes)

memory_scope_test()
print(memory_scope_variable.nbytes)

8000


NameError: name 'memory_scope_variable' is not defined

In [7]:
import gc

def memtest_nocollect(n=1000):

    A = np.random.random(n**2)
    
    A_mean = np.mean(A)
    
    time.sleep(5)
    
    B = np.matrix(np.random.random((n, n)) + A_mean)
    B = B + B.T
    B_inv = np.linalg.inv(B)
    
    return np.max(B*B_inv)

def memtest_collect(n=1000):

    A = np.random.random(n**2)
    
    A_mean = np.mean(A)
    
    del A
    gc.collect()
    
    time.sleep(5)
    
    B = np.matrix(np.random.random((n, n)) + A_mean)
    B = B + B.T
    B_inv = np.linalg.inv(B)
    
    return np.max(B*B_inv)

print(memtest_nocollect(100), memtest_collect(100))

1.0000000000000062 1.0000000000000397


In [8]:
%load_ext memory_profiler

In [9]:
%memit memtest_nocollect(3000)

peak memory: 419.71 MiB, increment: 307.42 MiB


In [10]:
%memit memtest_collect(3000)

peak memory: 418.88 MiB, increment: 273.54 MiB


## Parallelization strategies
### Using internal parallelization provided by libraries

In [11]:
import time
import mkl

A = np.random.random((4000,4000))

A = A*A.T

mkl.set_num_threads(1)

time_1thread_1 = time.time()
np.linalg.inv(A)
time_1thread_2 = time.time()

time_1thread = time_1thread_2 - time_1thread_1

mkl.set_num_threads(4)

time_4thread_1 = time.time()
np.linalg.inv(A)
time_4thread_2 = time.time()

time_4thread = time_4thread_2 - time_4thread_1

print("""
Time taken:

1 thread: %.2f
4 threads: %.2f

Speedup: %.2f
""" % (time_1thread, time_4thread, time_1thread/time_4thread))


Time taken:

1 thread: 4.07
4 threads: 1.61

Speedup: 2.53



### Multiprocessing

#### Doing parallel maps with multiprocessing

In [12]:
from multiprocessing import Pool

def x_squared(x):
    return x*x

data = pd.DataFrame({'x':range(1,101)})

print(data.head())

# Run mapping with parallel pool
with Pool(4) as parallel_pool:
    y = parallel_pool.map(x_squared, data['x'])

# Convert resulting list into a Series
y_series = pd.Series(y, name='y')

# Add series to data
data['y'] = y_series

print(data.head())

   x
0  1
1  2
2  3
3  4
4  5
   x   y
0  1   1
1  2   4
2  3   9
3  4  16
4  5  25


In [13]:
import functools

def chapter3_pipeline_parallel(n_means=1000, n_workers=1):

    filesizes = load_filesizes('../data/filesizes_timestamps.txt')
    yearly_bytes_sum = aggregate_filesize_data(filesizes, ['Year','BytesLog2'], ['Files', 'SpaceUsage'], 'sum')

    bootstrapped_means = yearly_bytes_sum.groupby('Year').apply(lambda x: pd.Series({'data': x}))
    
    # Actual parallel part
    
    """
    Here we use functools.partial to create a function with partially filled
    arguments because multiprocessing.Pool.map does not work that well with
    lambda-functions. get_bootstrapped_means was changed to allow target_col
    and weight_col to be set with keyword arguments so that the arguments are
    given in correct order.
    """
    bootstrapping_function = functools.partial(get_bootstrapped_means, target_col='BytesLog2', weight_col='Files', n_means=n_means)

    # Initialize a parallel pool with n_workers workers
    with Pool(n_workers) as parallel_pool:
        # Map a function to each dataset. Output is a list of ndarrays.
        sampled_means = parallel_pool.map(bootstrapping_function, bootstrapped_means['data'])
    
    # Convert list of ndarrays into a Series of ndarrays   
    sampled_means = pd.Series(sampled_means, name='SampledMeans', index=bootstrapped_means.index)

    # Place Series into our DataFrame
    bootstrapped_means['SampledMeans'] = sampled_means
    # End of the parallel part

    bootstrapped_means['Mean'] = bootstrapped_means['SampledMeans'].apply(np.mean)
    
    bootstrapped_means = bootstrapped_means.reset_index()[['Year','Mean']]

    return(bootstrapped_means)

# Measure performance and verify results 
time1 = time.time()
means_orig = chapter3_pipeline(n_means=10000)
time2 = time.time()
orig_time = time2-time1
print('Original pipeline: %.2f' % (orig_time))
print(means_orig)

for n_workers in range(1,5):
    time1 = time.time()
    means = chapter3_pipeline_parallel(n_means=10000, n_workers=n_workers)
    time2 = time.time()
    
    print('Time taken by %d workers: %.2f Speedup was: %.2f' % (n_workers, time2 - time1, orig_time/(time2-time1)))
    print('Maximum difference between calculated means:', (means['Mean']-means_orig['Mean']).abs().max())

Original pipeline: 13.21
      Year       Mean
0   2010.0  12.972783
1   2011.0  14.041408
2   2012.0  10.677178
3   2013.0  13.410552
4   2014.0  14.042002
5   2015.0  11.745106
6   2016.0  13.542875
7   2017.0  11.981502
8   2018.0  13.279947
9   2019.0  13.707754
10  2020.0  13.229183
Time taken by 1 workers: 12.30 Speedup was: 1.07
Maximum difference between calculated means: 0.009673000000001153
Time taken by 2 workers: 7.42 Speedup was: 1.78
Maximum difference between calculated means: 0.008904000000001133
Time taken by 3 workers: 4.74 Speedup was: 2.79
Maximum difference between calculated means: 0.008904000000001133
Time taken by 4 workers: 3.90 Speedup was: 3.38
Maximum difference between calculated means: 0.008904000000001133


## Optimizing code with profilers


In [14]:
import cProfile
import pstats
import io

# Initiate profiler
pr = cProfile.Profile(subcalls=False)
pr.enable()

# Run the pipeline
chapter3_pipeline(n_means=10000)

# Stop profiling
pr.disable()

# Print stats by total time used (top 20)
ps = pstats.Stats(pr).strip_dirs().sort_stats('tottime')
ps.print_stats(20)

# Print into a StringIO buffer and find top 20 function calls by cumulative time
io_stream = io.StringIO()
ps_methods = pstats.Stats(pr, stream=io_stream).strip_dirs().sort_stats('cumulative')
ps_methods.print_stats()

method_lines = [ line for line in io_stream.getvalue().split('\n') if ' {method' in line ]

print('Top methods by cumulative time:\n')
print('\n'.join(method_lines[:20]))

         17987532 function calls (17324700 primitive calls) in 17.112 seconds

   Ordered by: internal time
   List reduced from 1380 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   110000    4.397    0.000   14.884    0.000 {method 'choice' of 'numpy.random.mtrand.RandomState' objects}
   330370    1.257    0.000    1.257    0.000 {method 'reduce' of 'numpy.ufunc' objects}
110123/110121    1.036    0.000    6.070    0.000 algorithms.py:1616(take_nd)
551172/331084    0.735    0.000    7.628    0.000 {built-in method numpy.array}
   110121    0.498    0.000    1.710    0.000 algorithms.py:1487(_get_take_nd_function)
   110011    0.493    0.000    1.482    0.000 _methods.py:143(_mean)
  2553898    0.457    0.000    0.643    0.000 {built-in method builtins.isinstance}
   110088    0.411    0.000    1.590    0.000 cast.py:442(maybe_promote)
   110062    0.363    0.000    0.363    0.000 {pandas._libs.algos.take_1d_int64_int64}
   220

In [15]:
# Optimized version of the bootstrapping code
def get_bootstrapped_means(dataset, target_col=None, weight_col=None, n_means=1000):
    # Pick relevant columns
    df = dataset[[target_col, weight_col]].copy()
    # Pick target data column
    target_data = df[target_col]
    # Pick weight data column
    weight_data = df[weight_col]
    # Fill zeros to those byte sizes that are not present in the Files-data
    weight_data.fillna(0, inplace=True)
    # Normalize weight_data into probabilities
    weight_data = weight_data/weight_data.sum()
    
    # Calculate resampled mean
    means = np.mean(np.random.choice(target_data, 100*n_means, replace=True, p=weight_data).reshape(100,n_means), axis=0)

    return means