# Chapter 4

This notebook contains the commands shown in lecture 3.

In [1]:
import numpy as np
import scipy
import pandas as pd
import time

## Different resources involved in data analysis pipelines

### Processors as a resource

In [2]:
# Benchmark: add 1 to every element of an array `times` times, first with
# nested Python loops and then with a vectorized NumPy expression.
n_zeros = 10000
times = 1000

z = np.zeros(n_zeros)

# perf_counter() is the clock intended for measuring short intervals
time_for_1 = time.perf_counter()
for t in range(times):
    for i in range(n_zeros):
        z[i] = z[i] + 1
time_for_2 = time.perf_counter()

time_for = time_for_2-time_for_1

# Start again from zeros so that both variants do the same amount of work
z = np.zeros(n_zeros)

time_vec_1 = time.perf_counter()
# The loop variable must not be called `times`: that would overwrite the
# repetition count and make the cell behave differently when it is re-run.
for t in range(times):
    z = z + 1
time_vec_2 = time.perf_counter()

time_vec = time_vec_2-time_vec_1

print("""
Time taken:

For loop: %.2g
Vectorized operation: %.2g

Speedup: %.0f
""" % (time_for, time_vec, time_for/time_vec))


Time taken:

For loop: 4.5
Vectorized operation: 0.0056

Speedup: 801



In [9]:
def load_filesizes(filesizes_file):
    """Load the file size histogram data and derive the analysis columns.

    The input is a whitespace-separated table without a header row whose
    three columns are read as 'Bytes', 'MonthsTo2021' and 'Files'.

    Parameters
    ----------
    filesizes_file : str or path-like or file-like
        Anything accepted by ``pandas.read_table``.

    Returns
    -------
    pandas.DataFrame
        Rows with ``Bytes == 0`` removed, sorted by 'Date' and 'BytesLog2',
        with the columns 'Bytes', 'Files', 'BytesLog2' (int64, truncated
        log2 of 'Bytes'), 'SpaceUsage' (Bytes * Files), 'Year', 'Month'
        (ordered categorical of month names) and 'Date' (first day of the
        month). 'Year', 'Month' and 'Date' are missing for rows whose year
        falls outside 2010-2020.
    """
    # Raw string: '\s' is not a valid escape sequence in a normal string literal
    filesizes = pd.read_table(filesizes_file, sep=r'\s+', names=['Bytes', 'MonthsTo2021', 'Files'])

    # Remove empty files; copy so the new columns below are added to an
    # independent frame instead of a slice of the frame that was read
    filesizes = filesizes[filesizes['Bytes'] != 0].copy()
    # Create a column for log2 of bytes, truncated to an integer. Assigning the
    # whole column (not .loc[:, ...]) guarantees the int64 dtype is kept.
    filesizes['BytesLog2'] = np.log2(filesizes['Bytes']).astype(np.int64)
    # Determine total space S used by N files of size X during date D: S=N*X
    filesizes['SpaceUsage'] = filesizes['Bytes'] * filesizes['Files']

    # Determine file year and month from the MonthsTo2021-column
    total_months = 2021*12 - filesizes['MonthsTo2021'] - 1
    year = total_months // 12
    month = total_months % 12 + 1

    # Blank out year and month for really old files and files with incorrect
    # timestamps. where() returns new columns, so no NaN is written into an
    # integer column in place.
    valid_years = (year >= 2010) & (year <= 2020)
    filesizes['Year'] = year.where(valid_years)
    filesizes['Month'] = month.where(valid_years)
    filesizes['Day'] = 1

    # Get month names for the correct ordering of Month categories
    # ('MS' = month start; only the month names of the 12 dates are used)
    month_names = pd.date_range(start='2000-01', freq='MS', periods=12).month_name()
    # Create Date (rows without a valid year/month get NaT)
    filesizes['Date'] = pd.to_datetime(filesizes[['Year', 'Month', 'Day']])
    # Set Month to be an ordered categorical with predefined levels
    filesizes['Month'] = pd.Categorical(filesizes['Date'].dt.month_name(), categories=month_names, ordered=True)
    # Sort data based on Date and BytesLog2
    filesizes = filesizes.sort_values(['Date', 'BytesLog2'])
    # Remove helper columns
    return filesizes.drop(columns=['MonthsTo2021', 'Day'])

def aggregate_filesize_data(data, groupings, targets, agg_function):
    """Aggregate the target columns over every combination of the grouping columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; rows containing a NaN in any column are discarded first.
    groupings : list of str
        Columns to group by. They are converted to categoricals.
    targets : list of str
        Columns to aggregate.
    agg_function : str or callable
        Passed on to ``GroupBy.agg``.

    Returns
    -------
    pandas.DataFrame
        One row per combination of grouping categories (including
        combinations that do not occur in the data), with the grouping
        columns followed by the aggregated target columns.
    """
    # Drop rows with NaNs (invalid years), pick the relevant columns and
    # change the grouping columns to category for prettier plotting.
    # astype() returns a new frame, so nothing is assigned onto a slice.
    data_relevant = (
        data
        .dropna(axis=0)
        .loc[:, groupings + targets]
        .astype({column: 'category' for column in groupings})
    )

    # Aggregate data. observed=False is spelled out so that every category
    # combination is kept regardless of the pandas version's default.
    data_aggregated = (
        data_relevant
        .groupby(groupings, observed=False)
        .agg(agg_function)
        .reset_index()
    )
    return data_aggregated

def get_bootstrapped_means(dataset, target_col, weight_col, n_means=1000, sample_size=100):
    """Bootstrap the weighted mean of a column.

    Draws `sample_size` values of `target_col` with replacement, using the
    normalized values of `weight_col` as selection probabilities, and takes
    their mean. This is repeated `n_means` times.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain `target_col` and `weight_col`. It is not modified.
    target_col : str
        Column whose values are resampled.
    weight_col : str
        Column of non-negative weights; missing weights count as zero.
    n_means : int, default 1000
        Number of resampled means to produce.
    sample_size : int, default 100
        Number of values drawn for each resampled mean.

    Returns
    -------
    numpy.ndarray
        float64 array of length `n_means`.

    Raises
    ------
    ValueError
        If the weights sum to zero, so no probabilities can be formed.
    """
    # Pick relevant columns
    df = dataset[[target_col, weight_col]].copy()
    # Pick target data column
    target_data = df[target_col]
    # Fill zeros to those byte sizes that are not present in the Files-data
    weight_data = df[weight_col].fillna(0)
    # Normalize weight_data into probabilities
    total_weight = weight_data.sum()
    if total_weight == 0:
        raise ValueError("weights in column %r sum to zero; cannot resample" % (weight_col,))
    weight_data = weight_data/total_weight

    # Create means vector
    means = np.zeros(n_means, dtype=np.float64)
    # NOTE(review): the pandas objects are handed to np.random.choice on every
    # iteration; the profiling output in this notebook reflects that, so the
    # call is intentionally left in this form.
    for i in range(n_means):
        # Calculate resampled mean
        means[i] = np.mean(np.random.choice(target_data, sample_size, replace=True, p=weight_data))

    return means

def bootstrap_byteslog2_mean(dataset, group_variable, target_variable, n_means=1000):
    """Bootstrap the weighted mean of 'BytesLog2' separately for each group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'BytesLog2', `group_variable` and `target_variable`.
    group_variable : str
        Column that defines the groups.
    target_variable : str
        Column used as resampling weights within each group.
    n_means : int, default 1000
        Number of resampled means per group.

    Returns
    -------
    pandas.DataFrame
        Indexed by the group values, with the columns 'SampledMeans' (array
        of resampled means) and 'Mean' (mean of those resampled means).
    """
    # Only these columns are needed for sampling. Selecting them explicitly
    # keeps the grouping column out of the frames handed to apply().
    sampling_columns = list(dict.fromkeys(['BytesLog2', target_variable]))

    # observed=True: only groups that actually occur in the data are sampled
    # (matters when the grouping column is categorical).
    grouped = dataset.groupby(group_variable, observed=True)[sampling_columns]

    # Store each group's data frame in a single 'data' cell per group
    bootstrapped_means = grouped.apply(lambda group: pd.Series({'data': group}))
    bootstrapped_means['SampledMeans'] = bootstrapped_means['data'].apply(
        lambda group: get_bootstrapped_means(group, 'BytesLog2', target_variable, n_means=n_means)
    )
    bootstrapped_means['Mean'] = bootstrapped_means['SampledMeans'].apply(np.mean)
    return bootstrapped_means.drop(columns='data')


In [17]:
def chapter3_pipeline(n_means=10000, filesizes_file='../data/filesizes_timestamps.txt'):
    """Run the full analysis: load, aggregate per year and bootstrap the means.

    Parameters
    ----------
    n_means : int, default 10000
        Number of bootstrapped means calculated for each year.
    filesizes_file : str, default '../data/filesizes_timestamps.txt'
        Location of the file size data passed to ``load_filesizes``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year' and 'Mean', one row per year.
    """
    filesizes = load_filesizes(filesizes_file)

    # Total number of files and space usage per year and file size class
    yearly_bytes_sum = aggregate_filesize_data(
        filesizes, ['Year', 'BytesLog2'], ['Files', 'SpaceUsage'], 'sum')

    # Bootstrap the mean of BytesLog2 for each year, weighted by file counts
    bootstrapped_yearly_means = bootstrap_byteslog2_mean(
        yearly_bytes_sum, 'Year', 'Files', n_means=n_means)

    # Keep only the year and the mean of the bootstrapped means
    bootstrapped_yearly_means = bootstrapped_yearly_means.reset_index()[['Year', 'Mean']]

    return bootstrapped_yearly_means

In [18]:
# Run the pipeline with n_means=100 (well below the function's default of
# 10000). As the last expression of the cell, the returned DataFrame is
# displayed as the cell output.
chapter3_pipeline(n_means=100)

Unnamed: 0,Year,Mean
0,2010.0,12.9741
1,2011.0,14.0293
2,2012.0,10.7615
3,2013.0,13.3932
4,2014.0,14.0477
5,2015.0,11.7602
6,2016.0,13.5082
7,2017.0,11.8654
8,2018.0,13.2088
9,2019.0,13.683


In [77]:
import cProfile
import pstats
import io

# Initiate profiler
pr = cProfile.Profile(subcalls=False)
pr.enable()

try:
    # Run the pipeline
    chapter3_pipeline(n_means=10000)
finally:
    # Stop profiling even if the pipeline raises, so that the profiler is not
    # left active for the cells that are run afterwards
    pr.disable()

# Print stats by total time used (top 20)
ps = pstats.Stats(pr).strip_dirs().sort_stats('tottime')
ps.print_stats(20)

# Print into a StringIO buffer and find top 20 function calls by cumulative time
io_stream = io.StringIO()
ps_methods = pstats.Stats(pr, stream=io_stream).strip_dirs().sort_stats('cumulative')
ps_methods.print_stats()

# Keep only the report lines that describe '{method ...}' entries
method_lines = [ line for line in io_stream.getvalue().split('\n') if ' {method' in line ]

print('Top methods by cumulative time:\n')
print('\n'.join(method_lines[:20]))

         17987532 function calls (17324700 primitive calls) in 17.248 seconds

   Ordered by: internal time
   List reduced from 1380 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   110000    4.415    0.000   15.015    0.000 {method 'choice' of 'numpy.random.mtrand.RandomState' objects}
   330370    1.279    0.000    1.279    0.000 {method 'reduce' of 'numpy.ufunc' objects}
110123/110121    1.011    0.000    6.131    0.000 algorithms.py:1616(take_nd)
551172/331084    0.741    0.000    7.693    0.000 {built-in method numpy.array}
   110121    0.524    0.000    1.788    0.000 algorithms.py:1487(_get_take_nd_function)
   110011    0.485    0.000    1.495    0.000 _methods.py:143(_mean)
  2553898    0.472    0.000    0.660    0.000 {built-in method builtins.isinstance}
   110088    0.385    0.000    1.572    0.000 cast.py:442(maybe_promote)
   220504    0.368    0.000    1.208    0.000 _dtype.py:321(_name_get)
   110062    0.368    

In [142]:
import time
import mkl

# Draw a 4000 x 4000 matrix of uniform random numbers from [0, 1) ...
A = np.random.random(size=(4000, 4000))

# ... and multiply it element by element with its own transpose, which makes
# the matrix symmetric: entry (i, j) becomes A[i, j] * A[j, i].
A = np.multiply(A, A.T)

In [143]:
# Print the build configuration of NumPy (which BLAS/LAPACK libraries it uses)
np.show_config()

# Time the matrix inversion with a single MKL thread.
# perf_counter() is the clock intended for measuring short intervals.
mkl.set_num_threads(1)

time_1thread_1 = time.perf_counter()
np.linalg.inv(A)
time_1thread_2 = time.perf_counter()

time_1thread = time_1thread_2 - time_1thread_1

# Time the same inversion with four MKL threads
mkl.set_num_threads(4)

time_4thread_1 = time.perf_counter()
np.linalg.inv(A)
time_4thread_2 = time.perf_counter()

time_4thread = time_4thread_2 - time_4thread_1

print("""
Time taken:

1 thread: %.2f
4 threads: %.2f

Speedup: %.2f
""" % (time_1thread, time_4thread, time_1thread/time_4thread))

blas_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/include']
blas_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/include']
lapack_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/include']
lapack_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/u/59/tuomiss1/unix/conda/envs/dataanalysis/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include