# Chapter 2

This notebook contains the commands that are shown in the lectures.

In [1]:
library(tidyverse)
library(lubridate)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date



In [2]:
# Comment this out if you haven't installed arrow yet
library(arrow)

“package ‘arrow’ was built under R version 3.6.3”
Attaching package: ‘arrow’

The following object is masked from ‘package:utils’:

    timestamp



# Simple data operations

## Loading data from CSVs

[read_csv](https://readr.tidyverse.org/reference/read_delim.html)

In [3]:
atp_players <- read_csv('../data/atp_players.csv', col_names=c('player_id', 'first_name', 'last_name', 'hand', 'birth_date', 'country_code'))
head(atp_players)

Parsed with column specification:
cols(
  player_id = col_double(),
  first_name = col_character(),
  last_name = col_character(),
  hand = col_character(),
  birth_date = col_double(),
  country_code = col_character()
)


player_id,first_name,last_name,hand,birth_date,country_code
100001,Gardnar,Mulloy,R,19131122,USA
100002,Pancho,Segura,R,19210620,ECU
100003,Frank,Sedgman,R,19271002,AUS
100004,Giuseppe,Merlo,R,19271011,ITA
100005,Richard Pancho,Gonzales,R,19280509,USA
100006,Grant,Golden,R,19290821,USA


In [4]:
str(atp_players)

Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	54938 obs. of  6 variables:
 $ player_id   : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
 $ first_name  : chr  "Gardnar" "Pancho" "Frank" "Giuseppe" ...
 $ last_name   : chr  "Mulloy" "Segura" "Sedgman" "Merlo" ...
 $ hand        : chr  "R" "R" "R" "R" ...
 $ birth_date  : num  19131122 19210620 19271002 19271011 19280509 ...
 $ country_code: chr  "USA" "ECU" "AUS" "ITA" ...
 - attr(*, "spec")=
  .. cols(
  ..   player_id = col_double(),
  ..   first_name = col_character(),
  ..   last_name = col_character(),
  ..   hand = col_character(),
  ..   birth_date = col_double(),
  ..   country_code = col_character()
  .. )


In [5]:
head(atp_players)

player_id,first_name,last_name,hand,birth_date,country_code
100001,Gardnar,Mulloy,R,19131122,USA
100002,Pancho,Segura,R,19210620,ECU
100003,Frank,Sedgman,R,19271002,AUS
100004,Giuseppe,Merlo,R,19271011,ITA
100005,Richard Pancho,Gonzales,R,19280509,USA
100006,Grant,Golden,R,19290821,USA


## Creating and removing columns


[Tidyverse mutate](https://dplyr.tidyverse.org/reference/mutate.html)

[Lubridate parse_date_time](https://lubridate.tidyverse.org/reference/parse_date_time.html)

In [6]:
atp_players <- atp_players %>%
    mutate(birth_date=parse_date_time(birth_date, order='%a %Y%m%d'))
str(atp_players)

“All formats failed to parse. No formats found.”

Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	54938 obs. of  6 variables:
 $ player_id   : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
 $ first_name  : chr  "Gardnar" "Pancho" "Frank" "Giuseppe" ...
 $ last_name   : chr  "Mulloy" "Segura" "Sedgman" "Merlo" ...
 $ hand        : chr  "R" "R" "R" "R" ...
 $ birth_date  : POSIXct, format: NA NA ...
 $ country_code: chr  "USA" "ECU" "AUS" "ITA" ...


[unite](https://tidyr.tidyverse.org/reference/unite.html)

In [7]:
atp_players <- atp_players %>%
    unite(name, last_name, first_name, sep=', ', remove=FALSE)

head(atp_players)

player_id,name,first_name,last_name,hand,birth_date,country_code
100001,"Mulloy, Gardnar",Gardnar,Mulloy,R,,USA
100002,"Segura, Pancho",Pancho,Segura,R,,ECU
100003,"Sedgman, Frank",Frank,Sedgman,R,,AUS
100004,"Merlo, Giuseppe",Giuseppe,Merlo,R,,ITA
100005,"Gonzales, Richard Pancho",Richard Pancho,Gonzales,R,,USA
100006,"Golden, Grant",Grant,Golden,R,,USA


[select](https://dplyr.tidyverse.org/reference/select.html)

In [8]:
atp_players <- atp_players %>%
    select(-first_name, -last_name)

str(atp_players)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	54938 obs. of  5 variables:
 $ player_id   : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
 $ name        : chr  "Mulloy, Gardnar" "Segura, Pancho" "Sedgman, Frank" "Merlo, Giuseppe" ...
 $ hand        : chr  "R" "R" "R" "R" ...
 $ birth_date  : POSIXct, format: NA NA ...
 $ country_code: chr  "USA" "ECU" "AUS" "ITA" ...


## Turning input processing tasks into functions

In [9]:
load_atp_players <- function(atp_players_file){
    atp_players <- read_csv(atp_players_file, col_names=c('player_id', 'first_name', 'last_name', 'hand', 'birth_date', 'country_code'), col_types=cols()) %>%
        mutate(birth_date=parse_date_time(birth_date, order='%Y%m%d')) %>%
        unite(name, last_name, first_name, sep=', ', remove=TRUE)
    return(atp_players)
}

atp_players <- load_atp_players('../data/atp_players.csv')
head(atp_players)

“ 125 failed to parse.”

player_id,name,hand,birth_date,country_code
100001,"Mulloy, Gardnar",R,1913-11-22,USA
100002,"Segura, Pancho",R,1921-06-20,ECU
100003,"Sedgman, Frank",R,1927-10-02,AUS
100004,"Merlo, Giuseppe",R,1927-10-11,ITA
100005,"Gonzales, Richard Pancho",R,1928-05-09,USA
100006,"Golden, Grant",R,1929-08-21,USA


## Categorical data format

[mutate_at](https://dplyr.tidyverse.org/reference/mutate_all.html)

In [10]:
object.size(atp_players[['hand']])
atp_players_categorized <- atp_players %>%
    mutate_at(c('country_code', 'hand'), as.factor)
object.size(atp_players_categorized[['hand']])
print(levels(atp_players_categorized[['hand']]))
str(atp_players_categorized)

439776 bytes

220440 bytes

[1] "A" "L" "R" "U"
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	54938 obs. of  5 variables:
 $ player_id   : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
 $ name        : chr  "Mulloy, Gardnar" "Segura, Pancho" "Sedgman, Frank" "Merlo, Giuseppe" ...
 $ hand        : Factor w/ 4 levels "A","L","R","U": 3 3 3 3 3 3 2 3 3 3 ...
 $ birth_date  : POSIXct, format: "1913-11-22" "1921-06-20" ...
 $ country_code: Factor w/ 210 levels "AFG","AHO","ALB",..: 200 62 13 97 200 200 160 58 88 43 ...


In [11]:
categorize_players <- function(players) {
    players <- players %>%
        mutate_at(c('country_code', 'hand'), as.factor)
    return(players)
}
str(atp_players)
atp_players <- categorize_players(atp_players)
str(atp_players)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	54938 obs. of  5 variables:
 $ player_id   : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
 $ name        : chr  "Mulloy, Gardnar" "Segura, Pancho" "Sedgman, Frank" "Merlo, Giuseppe" ...
 $ hand        : chr  "R" "R" "R" "R" ...
 $ birth_date  : POSIXct, format: "1913-11-22" "1921-06-20" ...
 $ country_code: chr  "USA" "ECU" "AUS" "ITA" ...
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	54938 obs. of  5 variables:
 $ player_id   : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
 $ name        : chr  "Mulloy, Gardnar" "Segura, Pancho" "Sedgman, Frank" "Merlo, Giuseppe" ...
 $ hand        : Factor w/ 4 levels "A","L","R","U": 3 3 3 3 3 3 2 3 3 3 ...
 $ birth_date  : POSIXct, format: "1913-11-22" "1921-06-20" ...
 $ country_code: Factor w/ 210 levels "AFG","AHO","ALB",..: 200 62 13 97 200 200 160 58 88 43 ...


## Joining datasets together

[parse_date_time](https://lubridate.tidyverse.org/reference/parse_date_time.html)

In [12]:
load_atp_rankings <- function(atp_rankings_file){
    atp_rankings <- read_csv(atp_rankings_file, col_types=cols()) %>%
        mutate(ranking_date=parse_date_time(ranking_date, order='%Y%m%d'))
    return(atp_rankings)
}

atp_rankings00 <- load_atp_rankings('../data/atp_rankings_00s.csv')
atp_rankings10 <- load_atp_rankings('../data/atp_rankings_10s.csv')

head(atp_rankings00)
head(atp_rankings10)

ranking_date,rank,player,points
2000-01-10,1,101736,4135
2000-01-10,2,102338,2915
2000-01-10,3,101948,2419
2000-01-10,4,103017,2184
2000-01-10,5,102856,2169
2000-01-10,6,102358,2107


ranking_date,rank,player,points
2010-01-04,1,103819,10550
2010-01-04,2,104745,9205
2010-01-04,3,104925,8310
2010-01-04,4,104918,7030
2010-01-04,5,105223,6785
2010-01-04,6,103786,4930


[bind_rows](https://dplyr.tidyverse.org/reference/bind.html)

In [13]:
print(nrow(atp_rankings00))
print(nrow(atp_rankings10))
atp_rankings <- bind_rows(atp_rankings00, atp_rankings10)
print(nrow(atp_rankings))
print(head(atp_rankings))

[1] 920907
[1] 916296
[1] 1837203
# A tibble: 6 x 4
  ranking_date         rank player points
  <dttm>              <dbl>  <dbl>  <dbl>
1 2000-01-10 00:00:00     1 101736   4135
2 2000-01-10 00:00:00     2 102338   2915
3 2000-01-10 00:00:00     3 101948   2419
4 2000-01-10 00:00:00     4 103017   2184
5 2000-01-10 00:00:00     5 102856   2169
6 2000-01-10 00:00:00     6 102358   2107


[rename](https://dplyr.tidyverse.org/reference/rename.html)

In [14]:
atp_rankings <- atp_rankings %>%
    rename(player_id=player)
head(atp_rankings)

ranking_date,rank,player_id,points
2000-01-10,1,101736,4135
2000-01-10,2,102338,2915
2000-01-10,3,101948,2419
2000-01-10,4,103017,2184
2000-01-10,5,102856,2169
2000-01-10,6,102358,2107


In [15]:
load_multiple_atp_rankings <- function(atp_rankings_files){
    datasets <- list()
    for (atp_ranking_file in atp_rankings_files) {
        dataset <- load_atp_rankings(atp_ranking_file)
        datasets <- append(datasets, list(dataset))
    }
    atp_rankings <- bind_rows(datasets) %>%
        rename(player_id=player)
    return(atp_rankings)
}

atp_rankings <- load_multiple_atp_rankings(c('../data/atp_rankings_00s.csv','../data/atp_rankings_10s.csv'))
print(nrow(atp_rankings))
head(atp_rankings)

[1] 1837203


ranking_date,rank,player_id,points
2000-01-10,1,101736,4135
2000-01-10,2,102338,2915
2000-01-10,3,101948,2419
2000-01-10,4,103017,2184
2000-01-10,5,102856,2169
2000-01-10,6,102358,2107


[left_join](https://dplyr.tidyverse.org/reference/join.html)

In [16]:
atp_data <- atp_rankings %>%
    left_join(atp_players, by='player_id')
str(atp_data)
head(atp_data)

Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	1837203 obs. of  8 variables:
 $ ranking_date: POSIXct, format: "2000-01-10" "2000-01-10" ...
 $ rank        : num  1 2 3 4 5 6 7 8 9 10 ...
 $ player_id   : num  101736 102338 101948 103017 102856 ...
 $ points      : num  4135 2915 2419 2184 2169 ...
 $ name        : chr  "Agassi, Andre" "Kafelnikov, Yevgeny" "Sampras, Pete" "Kiefer, Nicolas" ...
 $ hand        : Factor w/ 4 levels "A","L","R","U": 3 3 3 3 3 3 3 3 2 3 ...
 $ birth_date  : POSIXct, format: "1970-04-29" "1974-02-18" ...
 $ country_code: Factor w/ 210 levels "AFG","AHO","ALB",..: 200 161 200 76 28 179 62 200 43 137 ...


ranking_date,rank,player_id,points,name,hand,birth_date,country_code
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-10,2,102338,2915,"Kafelnikov, Yevgeny",R,1974-02-18,RUS
2000-01-10,3,101948,2419,"Sampras, Pete",R,1971-08-12,USA
2000-01-10,4,103017,2184,"Kiefer, Nicolas",R,1977-07-05,GER
2000-01-10,5,102856,2169,"Kuerten, Gustavo",R,1976-09-10,BRA
2000-01-10,6,102358,2107,"Enqvist, Thomas",R,1974-03-13,SWE


## Demonstrating ATP dataset: Longest reign at rank 1

[filter](https://dplyr.tidyverse.org/reference/filter.html)

In [17]:
atp_top1 <- atp_data %>%
    filter(rank == 1)

# or

atp_top1 <- atp_data[atp_data['rank'] == 1,]

head(atp_top1)

ranking_date,rank,player_id,points,name,hand,birth_date,country_code
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-17,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-24,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-31,1,101736,5045,"Agassi, Andre",R,1970-04-29,USA
2000-02-07,1,101736,5045,"Agassi, Andre",R,1970-04-29,USA
2000-02-14,1,101736,5045,"Agassi, Andre",R,1970-04-29,USA


[lag](https://dplyr.tidyverse.org/reference/lead-lag.html)

In [18]:
atp_top1 <- atp_top1 %>%
    mutate(previous_top=lag(player_id))

head(atp_top1)

ranking_date,rank,player_id,points,name,hand,birth_date,country_code,previous_top
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA,
2000-01-17,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA,101736.0
2000-01-24,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA,101736.0
2000-01-31,1,101736,5045,"Agassi, Andre",R,1970-04-29,USA,101736.0
2000-02-07,1,101736,5045,"Agassi, Andre",R,1970-04-29,USA,101736.0
2000-02-14,1,101736,5045,"Agassi, Andre",R,1970-04-29,USA,101736.0


In [19]:
# Better when we want to drop rows
atp_top1_reigns <- atp_top1 %>%
    filter(player_id != previous_top)

# Logical indexing is more useful when we want to edit certain rows
atp_top1_reigns <- drop_na(atp_top1[atp_top1['player_id'] != atp_top1['previous_top'],])
head(atp_top1_reigns)

ranking_date,rank,player_id,points,name,hand,birth_date,country_code,previous_top
2000-09-11,1,101948,3739,"Sampras, Pete",R,1971-08-12,USA,101736
2000-11-20,1,103498,3920,"Safin, Marat",R,1980-01-27,RUS,101948
2000-12-04,1,102856,4195,"Kuerten, Gustavo",R,1976-09-10,BRA,103498
2001-01-29,1,103498,4265,"Safin, Marat",R,1980-01-27,RUS,102856
2001-02-26,1,102856,4365,"Kuerten, Gustavo",R,1976-09-10,BRA,103498
2001-04-02,1,103498,4270,"Safin, Marat",R,1980-01-27,RUS,102856


[lead](https://dplyr.tidyverse.org/reference/lead-lag.html)

[difftime](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/difftime)

In [20]:
atp_top1_reigns <- atp_top1_reigns %>%
    mutate(reign_length=difftime(lead(ranking_date), ranking_date))
head(atp_top1_reigns)

ranking_date,rank,player_id,points,name,hand,birth_date,country_code,previous_top,reign_length
2000-09-11,1,101948,3739,"Sampras, Pete",R,1971-08-12,USA,101736,70 days
2000-11-20,1,103498,3920,"Safin, Marat",R,1980-01-27,RUS,101948,14 days
2000-12-04,1,102856,4195,"Kuerten, Gustavo",R,1976-09-10,BRA,103498,56 days
2001-01-29,1,103498,4265,"Safin, Marat",R,1980-01-27,RUS,102856,28 days
2001-02-26,1,102856,4365,"Kuerten, Gustavo",R,1976-09-10,BRA,103498,35 days
2001-04-02,1,103498,4270,"Safin, Marat",R,1980-01-27,RUS,102856,21 days


[top_n](https://dplyr.tidyverse.org/reference/top_n.html)

[arrange](https://dplyr.tidyverse.org/reference/arrange.html)

[desc](https://dplyr.tidyverse.org/reference/desc.html)

In [21]:
atp_top1_reigns %>%
    top_n(5, reign_length) %>%
    arrange(desc(reign_length))

ranking_date,rank,player_id,points,name,hand,birth_date,country_code,previous_top,reign_length
2004-02-02,1,103819,5225,"Federer, Roger",R,1981-08-08,SUI,104053,1659 days
2014-07-07,1,104925,13130,"Djokovic, Novak",R,1987-05-22,SRB,104745,854 days
2001-11-19,1,103720,4365,"Hewitt, Lleyton",R,1981-02-24,AUS,102856,525 days
2010-06-07,1,104745,8700,"Nadal, Rafael",L,1986-06-03,ESP,103819,392 days
2011-07-04,1,104925,13285,"Djokovic, Novak",R,1987-05-22,SRB,104745,371 days


# Using binary data formats to improve your pipeline

## CSVs and challenges with them

In [22]:
write_csv(atp_data, '../data/atp_data_r.csv')
head(read_csv('../data/atp_data_r.csv'))

Parsed with column specification:
cols(
  ranking_date = col_datetime(format = ""),
  rank = col_double(),
  player_id = col_double(),
  points = col_double(),
  name = col_character(),
  hand = col_character(),
  birth_date = col_datetime(format = ""),
  country_code = col_character()
)


ranking_date,rank,player_id,points,name,hand,birth_date,country_code
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-10,2,102338,2915,"Kafelnikov, Yevgeny",R,1974-02-18,RUS
2000-01-10,3,101948,2419,"Sampras, Pete",R,1971-08-12,USA
2000-01-10,4,103017,2184,"Kiefer, Nicolas",R,1977-07-05,GER
2000-01-10,5,102856,2169,"Kuerten, Gustavo",R,1976-09-10,BRA
2000-01-10,6,102358,2107,"Enqvist, Thomas",R,1974-03-13,SWE


## Serialized objects

In [23]:
save(atp_data, file='../data/atp_data.Rdata')
rm(atp_data)

In [24]:
load('../data/atp_data.Rdata')
head(atp_data)

ranking_date,rank,player_id,points,name,hand,birth_date,country_code
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-10,2,102338,2915,"Kafelnikov, Yevgeny",R,1974-02-18,RUS
2000-01-10,3,101948,2419,"Sampras, Pete",R,1971-08-12,USA
2000-01-10,4,103017,2184,"Kiefer, Nicolas",R,1977-07-05,GER
2000-01-10,5,102856,2169,"Kuerten, Gustavo",R,1976-09-10,BRA
2000-01-10,6,102358,2107,"Enqvist, Thomas",R,1974-03-13,SWE


## Feather

In [25]:
write_feather(atp_data ,'../data/atp_data_r.feather')
head(read_feather('../data/atp_data_r.feather'))

ranking_date,rank,player_id,points,name,hand,birth_date,country_code
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-10,2,102338,2915,"Kafelnikov, Yevgeny",R,1974-02-18,RUS
2000-01-10,3,101948,2419,"Sampras, Pete",R,1971-08-12,USA
2000-01-10,4,103017,2184,"Kiefer, Nicolas",R,1977-07-05,GER
2000-01-10,5,102856,2169,"Kuerten, Gustavo",R,1976-09-10,BRA
2000-01-10,6,102358,2107,"Enqvist, Thomas",R,1974-03-13,SWE


## Parquet

In [26]:
library(arrow)
write_parquet(atp_data ,'../data/atp_data_r.parquet')
head(read_parquet('../data/atp_data_r.parquet'))

ranking_date,rank,player_id,points,name,hand,birth_date,country_code
2000-01-10,1,101736,4135,"Agassi, Andre",R,1970-04-29,USA
2000-01-10,2,102338,2915,"Kafelnikov, Yevgeny",R,1974-02-18,RUS
2000-01-10,3,101948,2419,"Sampras, Pete",R,1971-08-12,USA
2000-01-10,4,103017,2184,"Kiefer, Nicolas",R,1977-07-05,GER
2000-01-10,5,102856,2169,"Kuerten, Gustavo",R,1976-09-10,BRA
2000-01-10,6,102358,2107,"Enqvist, Thomas",R,1974-03-13,SWE


## HDF5

In [27]:
library(hdf5r)
h5_file <- H5File$new('../data/atp_data_r.h5', mode = 'w')
h5_group <- h5_file$create_group('atp_data')
for (column in colnames(atp_data)) {
    h5_group[[column]] <- atp_data[[column]]
}
print(h5_group)
print(h5_file)

h5_file$close_all()

“package ‘hdf5r’ was built under R version 3.6.3”
Attaching package: ‘hdf5r’

The following object is masked from ‘package:purrr’:

    flatten_df

“During conversion, the following issues occured: H5T_CONV_EXCEPT_RANGE_LOW”

Class: H5Group
Filename: /u/59/tuomiss1/unix/dataanalysis/data-analysis-workflows-course/data/atp_data_r.h5
Group: /atp_data
Listing:
         name    obj_type dataset.dims dataset.type_class
   birth_date H5I_DATASET      1837203          H5T_FLOAT
 country_code H5I_DATASET      1837203           H5T_ENUM
         hand H5I_DATASET      1837203           H5T_ENUM
         name H5I_DATASET      1837203         H5T_STRING
    player_id H5I_DATASET      1837203          H5T_FLOAT
       points H5I_DATASET      1837203          H5T_FLOAT
         rank H5I_DATASET      1837203          H5T_FLOAT
 ranking_date H5I_DATASET      1837203          H5T_FLOAT
Class: H5File
Filename: /u/59/tuomiss1/unix/dataanalysis/data-analysis-workflows-course/data/atp_data_r.h5
Access type: H5F_ACC_RDWR
Listing:
     name  obj_type dataset.dims dataset.type_class
 atp_data H5I_GROUP         <NA>               <NA>


## Other data formats

### Excel spreadsheets

In [28]:
library(readxl)
efw <- read_excel('../data/efw.xlsx', col_names=TRUE, range='B5:BU4055')
head(efw)

New names:
* data -> data...8
* data -> data...10
* data -> data...12
* data -> data...14
* data -> data...16
* … and 6 more problems


Year,ISO_Code,Countries,Economic Freedom Summary Index,Rank,Quartile,Government consumption,data...8,Transfers and subsidies,data...10,...,Conscription,Labor market regulations,Administrative requirements,Regulatory Burden,Starting a business,Impartial Public Administration,Licensing restrictions,Tax compliance,Business regulations,Regulation
2018,ALB,Albania,7.8,26,1,8.155882,12.27,6.73842,12.47,...,10,6.717929,5.651538,6.666667,9.742477,5.396,5.62194,7.17525,6.708979,7.721734
2018,DZA,Algeria,4.97,157,4,3.220588,29.05,7.817129,8.511137,...,3,5.645397,4.215154,2.444444,9.305002,3.906,8.771111,7.029528,5.945207,5.563704
2018,AGO,Angola,4.75,159,4,7.698695,13.82444,9.623978,1.88,...,0,5.338186,2.937894,2.444444,8.730805,5.044,7.916416,6.782923,5.642747,5.3862
2018,ARG,Argentina,5.78,144,4,5.938235,19.81,6.307902,14.05,...,10,5.119549,2.714233,6.666667,9.579288,7.202,5.726521,6.508295,6.3995,5.757401
2018,ARM,Armenia,7.92,18,1,7.717647,13.76,7.711172,8.9,...,0,6.461113,5.170406,6.0,9.86353,6.298,9.302574,7.040738,7.279208,7.762321
2018,AUS,Australia,8.23,5,1,4.45,24.87,6.867958,11.994595,...,10,7.803349,3.981758,8.888889,9.928614,10.0,8.953087,8.823021,8.429228,8.726281


### SQL databases

In [29]:
library(DBI)
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = "../data/atp_players_r.sqlite")
copy_to(con, atp_players, overwrite=TRUE, temporary=FALSE)
dbDisconnect(con)

con <- DBI::dbConnect(RSQLite::SQLite(), dbname = "../data/atp_players_r.sqlite")
print(tbl(con,'atp_players'))
dbDisconnect(con)

# Source:   table<atp_players> [?? x 5]
# Database: sqlite 3.22.0
#   [/u/59/tuomiss1/unix/dataanalysis/data-analysis-workflows-course/data/atp_players_r.sqlite]
   player_id name                     hand   birth_date country_code
       <dbl> <chr>                    <chr>       <dbl> <chr>       
 1    100001 Mulloy, Gardnar          R     -1770681600 USA         
 2    100002 Segura, Pancho           R     -1531612800 ECU         
 3    100003 Sedgman, Frank           R     -1333324800 AUS         
 4    100004 Merlo, Giuseppe          R     -1332547200 ITA         
 5    100005 Gonzales, Richard Pancho R     -1314316800 USA         
 6    100006 Golden, Grant            R     -1273795200 USA         
 7    100007 Segal, Abe               L     -1236816000 RSA         
 8    100008 Nielsen, Kurt            R     -1234483200 DEN         
 9    100009 Gulyas, Istvan           R     -1206057600 HUN         
10    100010 Ayala, Luis              R     -1176681600 CHI         
# … with m