# 1. Set-up the working directory and input folder

In [None]:
# Start from an empty workspace: removes every object that ls() lists in the current environment.
rm(list=ls()) # clean up your environment

In [None]:
# set a folder where you want the output of your analyses to be
# (the relative file names given to write.csv()/read.csv() further down resolve against this directory)
setwd("~/git/RcourseSpring2019/analyses/")

In [None]:
# set the path to the folder where your data are
# (this folder is scanned below for the *expInfo.csv, *choices.csv and *events.csv files)
data_folder <- "~/git/RcourseSpring2019/data/case_study/"

# 2. Get the data from multiple subjects, separately by file type

In [None]:
# define lists of files per type
# - `pattern` is a regular expression, not a shell glob: escape the dot and anchor the suffix with `$`
# - `full.names = TRUE` returns the complete paths and, unlike pasting the folder in front of the
#   result, yields an empty vector when nothing matches, so the length checks really report 0
list_expInfo <- list.files(path = data_folder, pattern = "expInfo\\.csv$", full.names = TRUE)
list_choices <- list.files(path = data_folder, pattern = "choices\\.csv$", full.names = TRUE)
list_events <- list.files(path = data_folder, pattern = "events\\.csv$", full.names = TRUE)

In [None]:
# check if files are missing
# (presumably every participant contributes one file of each type, so the three counts should match — confirm)
length(list_expInfo)
length(list_choices)
length(list_events)

In [None]:
head(list_expInfo)

In [None]:
# Read the first file of each type on its own, to see what the three kinds of file contain:
expInfo_participant_1 <- read.csv(file = list_expInfo[1])
choices_participant_1 <- read.csv(file = list_choices[1])
events_participant_1 <- read.csv(file = list_events[1])

In [None]:
head(expInfo_participant_1)
head(choices_participant_1)
head(events_participant_1)

#### Write a function to import the data

Because we have several participants and files, we want to somehow automate the importing process. There are several ways to do it, but when you have a list containing all the file paths of the data to import (in this case we have a different list for each file type), an easy approach is to write a function that imports the data of one participant, and then apply it to the lists containing the file paths.

In [None]:
# First define a function to load one participant's data given the filepath
# Import the data of one participant from a delimited text file.
#
# Arguments:
#   file_path    path of the file to read
#   header, sep  passed on to read.table()
#   deleteIndex  if TRUE, drop the column named "X" (the name read.table() gives to an
#                unnamed first column, e.g. saved row indices); nothing happens if it is absent
#   transpose    if TRUE, the file is treated as two columns of (name, value) pairs read with
#                header=FALSE: the result is a named character vector of the values.
#                If FALSE, the participant number is taken from the part of the file name
#                before the first "_" and stored in a `participant` column.
#
# Returns a data frame (transpose=FALSE) or a named character vector (transpose=TRUE).
importDataFrame <- function(file_path, header=TRUE, sep=',', deleteIndex=TRUE, transpose=FALSE){

    dataFrame_temp <- read.table(file_path, header = header, sep = sep) # read the dataframe from file

    if (deleteIndex) {
        # assigning NULL removes the column and is a no-op when there is no column X
        dataFrame_temp[["X"]] <- NULL
    }

    if (transpose) { # as in the expInfo files: participant numbers are already there
        dataFrame_temp <- t(dataFrame_temp)
        colnames(dataFrame_temp) <- dataFrame_temp["V1",] # first file column holds the names
        rownames(dataFrame_temp) <- c()
        dataFrame_temp <- dataFrame_temp[2,] # second file column holds the values

    } else { # as in the choices and events: we have to infer the participant number from the file name
        file_name <- basename(file_path) # obtain the file name from the file path
        subject_number <- strsplit(file_name, '_')[[1]][1] # obtain the participant number from the file name
        dataFrame_temp$participant <- as.numeric(subject_number) # add the participant number to the dataframe
    }

    dataFrame_temp
}

In [None]:
# Let's check if it works using the first participant in each list
# (expInfo files are read without a header and transposed; choices and events use the defaults)
head(importDataFrame(list_choices[1]))
head(importDataFrame(list_expInfo[1], header=FALSE, deleteIndex=FALSE, transpose=TRUE))
head(importDataFrame(list_events[1]))

#### Merging the data directly from the list of files:

We are now going to use two new functions, `lapply` and `do.call`, to apply our new function to load the data to the list of files that we defined earlier:

-`lapply` is pretty much like `apply` but for lists and vectors (https://www.rdocumentation.org/packages/base/versions/3.6.0/topics/lapply)

-`do.call` takes a function as input and splatters its other arguments to the function. It is widely used, for example, to assemble lists into simpler structures (often with rbind or cbind) (https://www.stat.berkeley.edu/~s133/Docall.html)

For example:

In [None]:
# lapply: average each of the two petal columns, giving a list with one element per column
x <- lapply(X = iris[c('Petal.Length', 'Petal.Width')], FUN = mean)
# do.call: hand the list elements to c() as separate arguments, giving a named numeric vector
do.call(what = c, args = x)

In [None]:
# merge data frames of all participants for choices
dataFrame_choices <- as.data.frame(do.call(rbind, lapply(list_choices, importDataFrame))) # load and merge
dataFrame_choices <- dataFrame_choices[order(dataFrame_choices$participant, dataFrame_choices$trial),] # re-order based on participants and trials
# optional: renumber the rows 1..n after sorting; NULL resets to the automatic row names and,
# unlike 1:nrow(...), also works when the data frame has no rows
row.names(dataFrame_choices) <- NULL

# inspect the result
head(dataFrame_choices)
tail(dataFrame_choices)
dim(dataFrame_choices)

In [None]:
# Events: import every file, stack the per-participant data frames, then sort by participant
dataFrame_events <- as.data.frame(
    do.call(what = rbind, args = lapply(X = list_events, FUN = importDataFrame))
)
dataFrame_events <- dataFrame_events[with(dataFrame_events, order(participant)),]

# first rows, last rows and overall size of the merged data
head(dataFrame_events)
tail(dataFrame_events)
dim(dataFrame_events)

In [None]:
# expInfo: each file yields one named vector (no header, no index column, transposed);
# stacking them gives one row per participant
dataFrame_expInfo <- as.data.frame(
    do.call(
        what = rbind,
        args = lapply(X = list_expInfo, FUN = importDataFrame, header=FALSE, deleteIndex=FALSE, transpose=TRUE)
    )
)
# the values arrive as text: convert the participant number before sorting on it
dataFrame_expInfo[["participant"]] <- as.numeric(as.character(dataFrame_expInfo[["participant"]]))
dataFrame_expInfo <- dataFrame_expInfo[order(dataFrame_expInfo[["participant"]]),]
dataFrame_expInfo

Careful with factors!!! https://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-integer-numeric-without-loss-of-information

In [None]:
# Convert reward to numeric by going through the text representation. This works whether the
# column is a factor or a character vector; indexing levels() by the factor codes only works
# for factors and turns a character column into NAs.
dataFrame_expInfo[,'reward'] <- as.numeric(as.character(dataFrame_expInfo[,'reward']))

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5) # this is not necessary in RStudio.

# distribution of the reward across participants
hist(
    x = dataFrame_expInfo[,'reward'],
    breaks = 10,
    main = 'Reward distribution',
    xlab = 'Reward',
    col = 'grey80',
    border = 'grey60'
)

In [None]:
# save merged data frames to file
# (relative file names: written to the current working directory; row names are not written)
write.csv(dataFrame_choices, file = "dataFrame_choices.csv", row.names=FALSE)
write.csv(dataFrame_events, file = "dataFrame_events.csv", row.names=FALSE)
write.csv(dataFrame_expInfo, file = "dataFrame_expInfo.csv", row.names=FALSE)

# 3. Cleaning-up the merged data

In [None]:
# Empty the workspace again; the merged data frames are reloaded from the CSV files in the next cell.
rm(list=ls()) # clean up your environment

In [None]:
# load merged data frames from file
dataFrame_choices <- read.csv(file = "dataFrame_choices.csv")
dataFrame_events <- read.csv(file = "dataFrame_events.csv")
dataFrame_expInfo <- read.csv(file = "dataFrame_expInfo.csv")

In [None]:
library('dplyr')

#### 3.1) check if all participants are OK, by looking at the average performance:

In [None]:
# one group per participant
grouped_data <- dataFrame_choices %>% group_by(participant)

# per-participant averages of accuracy and RT, ignoring missing values
mean_performance <- grouped_data %>%
    summarise(
        mean_accuracy = mean(accuracy, na.rm=TRUE),
        mean_rt = mean(rt, na.rm=TRUE)
    )

head(mean_performance)

Plot the scatterplot of mean_accuracy (on the x axis) and mean_rt (on the y_axis) using the dataframe `mean_performance` that I just created:
- add a vertical line at .5 (chance level)
- add a horizontal line at 3 (RT deadline)

In [None]:
options(repr.plot.width=5, repr.plot.height=5) # this is not necessary in RStudio.

# one semi-transparent point per participant: mean accuracy against mean RT
plot(x = mean_performance$mean_accuracy, 
     y = mean_performance$mean_rt, 
     col = rgb(.1, .1, .1, .5), 
     pch = 20,
     xlab = 'Mean accuracy',
     ylab = 'Mean RT',
     bty = 'l'
    )

# Reference lines. The line width is spelled out as `lwd`; the abbreviated `lw` only works
# through partial argument matching.
# vertical line at chance level
abline(
    v = .5,
    col = 'maroon4',
    lwd = 2,
    lty = 3
)

# horizontal line at the RT deadline
abline(
    h = 3,
    col = 'maroon4',
    lwd = 2,
    lty = 3
)

3.2) based on the results, we can for example decide to exclude the participants that took more than 1.5 sec to reply on average, as well as those that had less than 60% accuracy (i.e., a participant is excluded if either criterion is met).
- Who are these participants?
- Save these participants in a variable called `participants_to_exclude`

In [None]:
# a participant is flagged when accuracy is below .6 OR the mean RT is above 1.5
participants_to_exclude <- mean_performance$participant[
    mean_performance$mean_accuracy < .6 | mean_performance$mean_rt > 1.5
]
participants_to_exclude

3.3) create a new dataFrame called `dataFrame_choices_cut` that does NOT include the participants in the `participants_to_exclude` vector (that you defined in 3.2)

In [None]:
# keep only the rows whose participant is not in the exclusion vector
# (a single vectorized filter; an empty exclusion vector keeps every row)
dataFrame_choices_cut <- dataFrame_choices[!(dataFrame_choices$participant %in% participants_to_exclude),]

# check how many participants are left:
length(unique(dataFrame_choices_cut$participant))
length(unique(dataFrame_choices$participant))

# good! so now we have a clean dataset and we can start analysing the data :)

# 4. Plot the mean performance by condition

#### Write a plotting function

Because I will need to plot several barplots with error bars, let's write down a function that does this automatically to avoid copy-pasting code throughout our script (thus decreasing the probability of making mistakes, forgetting something, ...).

In [None]:
# Draw a barplot of group means with error bars.
#
# Arguments:
#   data              data frame holding the variables
#   DV                name (string) of the column to average
#   groupingA         name (string) of the grouping column shown on the x axis
#   groupingB         name (string) of an optional second grouping column; "None" (default)
#                     means a single grouping variable. With two grouping variables the bars
#                     are drawn side by side within each level of groupingA (needs reshape2).
#   bars              'se' for standard errors, 'ci' for confidence intervals based on the
#                     t distribution
#   col               bar color(s)
#   alpha_ci          alpha level of the confidence interval (.05 gives 95% intervals)
#   xlab, ylab        axis labels
#   legend_arguments  list passed to barplot() as `args.legend` (two grouping variables only)
#
# Called for its side effect (the plot); returns NULL invisibly.
mean_bars_with_errors <- function (data, DV, groupingA, groupingB="None", bars='se', col='lightblue3', alpha_ci=.05, xlab='', ylab='', legend_arguments=NULL) {
    bars <- match.arg(bars, c('se', 'ci')) # fail early on anything else

    # copy the selected columns under fixed names so the formulas below can refer to them
    data[,'DV'] <- data[,DV]
    data[,'groupingA'] <- data[,groupingA]

    # t quantile for a two-sided (1 - alpha_ci) interval; tends to 1.96 for large n when alpha_ci = .05
    t_quantile <- function(n) qt((1 - alpha_ci)/2 + .5, n - 1)

    if (groupingB == "None") { # if there is only 1 grouping variable
        # NOTE: the formula is passed unnamed on purpose. The first argument of aggregate()'s
        # formula method is called `formula` before R 4.2 and `x` from R 4.2 on, so naming it
        # breaks on one side or the other.
        grouped_mean <- aggregate(DV ~ groupingA, data = data, FUN = mean, na.action = na.omit)
        grouped_sd <- aggregate(DV ~ groupingA, data = data, FUN = sd, na.action = na.omit)
        grouped_n <- aggregate(DV ~ groupingA, data = data, FUN = length, na.action = na.omit)

        grouped_bars <- grouped_sd$DV / sqrt(grouped_n$DV) # standard errors
        if (bars == 'ci') {
            grouped_bars <- t_quantile(grouped_n$DV) * grouped_bars
        }

        b <- barplot(
            height = grouped_mean$DV,
            names.arg = grouped_mean$groupingA,
            col = col,
            border = col,
            # na.rm: a group with a single observation has no standard deviation
            ylim = c(0, max(grouped_mean$DV + grouped_bars, na.rm = TRUE)),
            xlab = xlab,
            ylab = ylab
        )
        segments(
            x0 = b,
            x1 = b,
            y0 = grouped_mean$DV - grouped_bars,
            y1 = grouped_mean$DV + grouped_bars,
            lwd = 2
        )

    } else { # if there are 2 grouping variables
        data[,'groupingB'] <- data[,groupingB]
        grouped_mean <- aggregate(DV ~ groupingA + groupingB, data = data, FUN = mean, na.action = na.omit)
        grouped_sd <- aggregate(DV ~ groupingA + groupingB, data = data, FUN = sd, na.action = na.omit)
        grouped_n <- aggregate(DV ~ groupingA + groupingB, data = data, FUN = length, na.action = na.omit)

        # long -> wide: one row per level of groupingB, one column per level of groupingA
        to_wide <- function(long) {
            reshape2::acast(data = long, formula = groupingB ~ groupingA, value.var = 'DV')
        }
        grouped_mean_wide <- to_wide(grouped_mean)
        grouped_sd_wide <- to_wide(grouped_sd)
        grouped_n_wide <- to_wide(grouped_n)

        grouped_se_wide <- grouped_sd_wide / sqrt(grouped_n_wide)

        # Drop every level of groupingA for which some standard error is missing. The same
        # columns are removed from the means, the standard errors AND the counts, so that the
        # three matrices keep matching dimensions; drop = FALSE keeps them matrices even when
        # a single column is left.
        complete_columns <- colSums(is.na(grouped_se_wide)) == 0
        grouped_mean_wide <- grouped_mean_wide[, complete_columns, drop = FALSE]
        grouped_se_wide <- grouped_se_wide[, complete_columns, drop = FALSE]
        grouped_n_wide <- grouped_n_wide[, complete_columns, drop = FALSE]

        grouped_bars <- grouped_se_wide
        if (bars == 'ci') {
            grouped_bars <- t_quantile(grouped_n_wide) * grouped_se_wide
        }

        b <- barplot(
            height = grouped_mean_wide,
            ylab = ylab,
            xlab = xlab,
            col = col,
            beside = TRUE,
            ylim = c(0, max(grouped_mean_wide + grouped_bars)),
            legend.text = TRUE,
            args.legend = legend_arguments
        )
        segments(
            x0 = b,
            x1 = b,
            y0 = grouped_mean_wide - grouped_bars,
            y1 = grouped_mean_wide + grouped_bars,
            lwd = 2
        )
    }

    invisible(NULL)
}

In [None]:
options(repr.plot.width=9, repr.plot.height=5) # this is not necessary in RStudio.

In [None]:
# mean RT per block; bars = 'ci' with the default alpha_ci = .05 draws 95% confidence intervals
mean_bars_with_errors(
    dataFrame_choices_cut, 
    DV='rt', 
    groupingA = 'block_number', 
    bars='ci',
    xlab = 'Block number',
    ylab = 'Mean RT'
)

In [None]:
# mean accuracy per block; bars = 'ci' with the default alpha_ci = .05 draws 95% confidence intervals
mean_bars_with_errors(
    dataFrame_choices_cut, 
    DV='accuracy', 
    groupingA = 'block_number', 
    bars='ci',
    xlab = 'Block number',
    ylab = 'Mean accuracy'
)

In [None]:
library("wesanderson")
# See all palettes
names(wes_palettes)

In [None]:
# We can also reorder our factor variables before plotting:
# (levels() is printed before and after to show the effect; it is NULL when the column is not a factor)
levels(dataFrame_choices_cut$reference_IV)

# factor(..., levels = ...) fixes the level order; values not listed in `levels` become NA
dataFrame_choices_cut$reference_IV <- factor(dataFrame_choices_cut$reference_IV, levels = c("--", "-", "+", "++"))
dataFrame_choices_cut$reward_IV <- factor(dataFrame_choices_cut$reward_IV, levels = c("--", "-", "+", "++"))

levels(dataFrame_choices_cut$reference_IV)

In [None]:
# mean RT by reference (x axis) and reward (side-by-side bars, one palette color each),
# with 95% confidence intervals; legend_arguments is forwarded to the legend of the barplot
mean_bars_with_errors(
    dataFrame_choices_cut, 
    DV='rt', 
    groupingA = 'reference_IV', 
    groupingB = 'reward_IV', 
    bars='ci',
    xlab = 'Reference',
    ylab = 'Mean RT',
    col = wes_palette(n=4, name="Zissou1"),
    legend_arguments = list(y = .2, bg = 'white', title = 'Reward')
)

In [None]:
# mean accuracy by reference (x axis) and reward (side-by-side bars, one palette color each),
# with 95% confidence intervals; legend_arguments is forwarded to the legend of the barplot
mean_bars_with_errors(
    dataFrame_choices_cut, 
    DV='accuracy', 
    groupingA = 'reference_IV', 
    groupingB = 'reward_IV', 
    bars='ci',
    xlab = 'Reference',
    ylab = 'Mean accuracy',
    col = wes_palette(n=4, name="Moonrise3"),
    legend_arguments = list(y = .3, bg = 'white', title = 'Reward')
)

Let's  have a closer look at the RT distribution, and also at the continuous versions of **reference** and **reward** manipulations, that are stored as `reference_block` and `reward_trial_better_option` variables, respectively.

In [None]:
# natural log of the response times, stored as a new column
dataFrame_choices_cut[["log_rt"]] <- log(dataFrame_choices_cut[["rt"]])

# drop every row that has a missing value in any column
dataFrame_choices_cut <- na.omit(object = dataFrame_choices_cut)

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5) # this is not necessary in RStudio.

# distribution of log RTs, with a vertical marker at log(.1)
hist(
    x = dataFrame_choices_cut$log_rt,
    breaks = 40,
    main = 'Log RTs',
    xlab = 'log(RT)',
    col = 'grey80',
    border = 'grey60'
)
abline(v = log(.1), col = 'maroon3')

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5) # this is not necessary in RStudio.

# raw RTs of the trials that fall below the log(.1) marker
hist(
    x = dataFrame_choices_cut$rt[dataFrame_choices_cut$log_rt < log(.1)],
    breaks = 20,
    main = 'RT distribution',
    xlab = 'RT',
    col = 'grey80',
    border = 'grey60'
)

Let's put a more realistic lower bound on these RTs. RTs lower than 100 ms (0.1 s) are usually hard to interpret.

In [None]:
# keep only the trials with an RT above .1 (presumably seconds, i.e. 100 ms — confirm against the task settings)
dataFrame_choices_cut <- dataFrame_choices_cut[dataFrame_choices_cut$rt > .1,]

In [None]:
options(repr.plot.width=9, repr.plot.height=5) # this is not necessary in RStudio.

In [None]:
# log RT against the continuous reference variable (semi-transparent points).
# The y axis shows log_rt, so it is labelled 'log RT'.
plot(dataFrame_choices_cut$reference_block,
     dataFrame_choices_cut$log_rt,
     col = rgb(0.1, 0.1, 0.1, .1), 
     pch = 20,
     xlab = 'Reference',
     ylab = 'log RT')

In [None]:
# log RT against the continuous reward variable (semi-transparent points).
# The y axis shows log_rt, so it is labelled 'log RT'.
plot(dataFrame_choices_cut$reward_trial_better_option,
     dataFrame_choices_cut$log_rt,
     col = rgb(0.1, 0.1, 0.1, .1), 
     pch = 20,
     xlab = 'Reward',
     ylab = 'log RT')

In [None]:
# Map each reference level to an integer code: '--' = 1, '-' = 2, '+' = 3, '++' = 4.
# Converting through a factor with explicit levels works for character and factor columns
# alike; assigning numbers into a factor would produce NAs ("invalid factor level").
colors <- as.integer(factor(dataFrame_choices_cut$reference_IV, levels = c("--", "-", "+", "++")))

In [None]:
# Let's first build a palette with as many colors as there are levels in the reference_IV variable.
# The palette follows the order of the factor levels, so that color i belongs to level i
# (this is the order used when the palette is indexed by the factor below).
reference_factor <- as.factor(dataFrame_choices_cut$reference_IV)
reference_levels <- levels(reference_factor)
# evenly spaced intensities from 0 to .75 (0, .25, .5, .75 for four levels); always within [0, 1]
gradient <- seq(0, .75, length.out = length(reference_levels))
colors <- rgb(gradient, .1, gradient, .3)

# And show the palette (just for fun):
image(seq_along(colors), 
      1, 
      as.matrix(seq_along(colors)), 
      col=colors,
      main = 'Reference IV',
      xlab="", ylab = "", xaxt = "n", yaxt = "n", bty = "n")

# write each level name on top of its color (x position = index of the level)
text(seq_along(reference_levels), 1, reference_levels)

# Now we assign a color to each data point based on the palette we created:
zcolor <- colors[as.integer(reference_factor)]

In [None]:
# log RT against the continuous reward variable; each point is colored by its reference level.
# The x axis shows reward_trial_better_option, so it is labelled 'Reward'.
plot(dataFrame_choices_cut$reward_trial_better_option,
     dataFrame_choices_cut$log_rt,
     col = zcolor, 
     pch = 20,
     xlab = 'Reward',
     ylab = 'log RT')

# 5. Fit a multilevel model

In [None]:
library(lme4)

In [None]:
# Standardize a numeric vector (z-scores): subtract the mean, then divide by the standard
# deviation, so that the result has mean 0 and standard deviation 1.
# The parentheses matter: without them only mean(x) is divided by sd(x).
standardize <- function(x) {
    (x - mean(x)) / sd(x)
}

In [None]:
# copy of the cleaned data in which the three continuous predictors are standardized
dataFrame_choices_cut_z <- dataFrame_choices_cut
dataFrame_choices_cut_z[c("reward_trial_better_option", "reference_block", "block_number")] <- lapply(
    X = dataFrame_choices_cut[c("reward_trial_better_option", "reference_block", "block_number")],
    FUN = standardize
)

In [None]:
# linear mixed model of log RT: reward x reference interaction plus block number,
# with a random intercept per participant
rts_lm <- lmer(
    formula = log_rt ~ reward_trial_better_option*reference_block + block_number + (1 | participant),
    data = dataFrame_choices_cut_z
)
summary(rts_lm)

In [None]:
# confidence intervals of the RT model parameters, as returned by confint(), stored as a data frame
rts_lm_table_results <- as.data.frame(confint(rts_lm))
rts_lm_table_results

In [None]:
# same predictors as the RT model, fitted to accuracy with a binomial family
# and a random intercept per participant
acc_lm <- glmer(
    formula = accuracy ~ reward_trial_better_option*reference_block + block_number + (1 | participant),
    data = dataFrame_choices_cut_z,
    family = binomial
)
summary(acc_lm)