# Chapter 4

This notebook contains the commands that are shown in the lecture 3.

In [2]:
library(tidyverse)
library(lubridate)

In [3]:
load_filesizes <- function(filesizes_file){
    filesizes <- read_table2(filesizes_file, col_names=c('Bytes','MonthsTo2021', 'Files'))
    
    filesizes <- filesizes %>%
        # Remove empty files
        filter(Bytes != 0) %>%
        # Create a column for log2 of bytes
        mutate(BytesLog2 = log2(Bytes)) %>%
        # Determine total space S used by N files of size X during date D: S=N*X 
        mutate(SpaceUsage = Bytes*Files) %>%
        # Determine file year and month from the MonthsTo2021-column
        mutate(
            TotalMonths = 2021*12 - MonthsTo2021 - 1,
            Year = TotalMonths %/% 12,
            Month = TotalMonths %% 12 +1,
            Day = 1
        )

     # Set year for really old files and files with incorrect timestamps
    invalid_years = c((filesizes['Year'] < 2010) | (filesizes['Year'] > 2020))
    filesizes[invalid_years, c('Year','Month')] <- NaN
    
    # Get month names for the correct ordering of Month categories
    month_names <- month(seq(1,12), label=TRUE, locale='C')
    filesizes <- filesizes %>%
        mutate(
            # Create Date and get the name for the month
            Date = make_datetime(Year, Month, Day),
            # Set Month 
            Month=month(Month, label=TRUE, locale='C'),
            # Set Month to be an ordered categorical with predefined levels 
            Month=factor(Month, ordered=TRUE, levels=month_names))
    filesizes <- filesizes %>%
        # Sort data based on Date and BytesLog2
        arrange(Date, BytesLog2) %>%
        # Remove old columns
        select(-MonthsTo2021,-TotalMonths,-Day)
    return(filesizes)
}

aggregate_filesize_data <- function(data, grouping, target, agg_function) {
    data_relevant <- data %>%
        # Drop rows with NaNs (invalid years)
        drop_na() %>%
        # Pick relevant columns
        select_at(vars(c(grouping, target))) %>%
        # Change grouping to category for prettier plotting
        mutate_at(vars(grouping), as.factor)

    # Aggregate data
    data_aggregated <- data_relevant %>%
        group_by_at((grouping)) %>%
        summarize_at(vars(target), agg_function) %>%
        ungroup()

    return(data_aggregated)
}

get_bootstrapped_means <- function(dataset, target_col, weight_col, n_means=1000) {
    # Pick relevant columns
    # Pick target data column and convert it to integer
    target_data <- as.numeric(as.character(dataset[[target_col]]))
    # Pick weight data column
    weight_data <- dataset[[weight_col]]
    weight_data <- weight_data/sum(weight_data)

    # Create means vector
    means <- numeric(n_means)
    for (i in seq(n_means)) {
        # Calculate resampled mean
        means[[i]] <- mean(sample(target_data, 100, replace=TRUE, prob=weight_data))
    }
    return(means)
}

bootstrap_byteslog2_mean <- function(dataset, group_variable, target_variable, n_means=1000) {
    
    bootstrapping_function <- function(x) get_bootstrapped_means(x, 'BytesLog2', target_variable, n_means=n_means)
    
    bootstrapped_means <- dataset %>%
        group_by_at(vars(group_variable)) %>%
        nest() %>%
        mutate(
            SampledMeans=map(data, bootstrapping_function),
            Mean=map(SampledMeans, mean)
        ) %>%
        select(-data)
    
    return(bootstrapped_means)
}

In [7]:

chapter3_pipeline <- function(n_means=10000) {

    filesizes <- load_filesizes('../data/filesizes_timestamps.txt')

    yearly_bytes_sum <- aggregate_filesize_data(filesizes, c('Year','BytesLog2'), c('Files', 'SpaceUsage'), sum)

    bootstrapped_yearly_means <- yearly_bytes_sum %>%
        bootstrap_byteslog2_mean('Year', 'Files', n_means=n_means) %>%
        select(Year, Mean)

    return(bootstrapped_yearly_means)
}



In [14]:
# Initiate profiler
profile_tempfile <- tempfile()
Rprof(profile_tempfile, memory.profiling=TRUE)

# Run the pipeline
glimpse(chapter3_pipeline(10000))

# Stop profiling
Rprof()

# Print top 20 function calls by cumulative time
summaryRprof(profile_tempfile, memory='both')['by.self']

# Remove profiling file
unlink(profile_tempfile)

Parsed with column specification:
cols(
  Bytes = col_double(),
  MonthsTo2021 = col_double(),
  Files = col_double()
)


Observations: 11
Variables: 2
$ Year <fct> 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020
$ Mean <list> [12.97627, 14.0465, 10.6741, 13.406, 14.04326, 11.75316, 13.545â€¦


Unnamed: 0,self.time,self.pct,total.time,total.pct,mem.total
"""sample.int""",0.48,39.34,0.52,42.62,352.5
"""mean""",0.34,27.87,1.1,90.16,741.4
"""mean.default""",0.14,11.48,0.14,11.48,108.3
"""sample""",0.1,8.2,0.62,50.82,434.5
"""get_bootstrapped_means""",0.04,3.28,1.14,93.44,773.8
"""(""",0.02,1.64,0.02,1.64,18.0
"""factor""",0.02,1.64,0.02,1.64,0.0
"""gzfile""",0.02,1.64,0.02,1.64,15.1
"""is_formulaish""",0.02,1.64,0.02,1.64,0.0
"""length""",0.02,1.64,0.02,1.64,0.0


In [72]:
cat("
A <- matrix(runif(4000*4000), ncol=4000)
A <- A %*% t(A)

time_1 = Sys.time()
A_inv <- solve(A)
time_2 = Sys.time()
print(as.double(time_2 - time_1))
", file="omp_test.R")

Sys.setenv(OMP_NUM_THREADS="1")
output <- system('Rscript omp_test.R', intern=TRUE)
time_1thread <- as.numeric(str_extract(output, '\\d.\\d+'))

Sys.setenv(OMP_NUM_THREADS="4")
output <- system('Rscript omp_test.R', intern=TRUE)
time_4thread <- as.numeric(str_extract(output, '\\d.\\d+'))

cat(sprintf("Time taken:\n\n1 thread: %.2f\n4 threads: %.2f\n\nSpeedup: %.2f", time_1thread, time_4thread, time_1thread/time_4thread))

Time taken:

1 thread: 4.58
4 threads: 1.85

Speedup: 2.48