## Load Packages

In [1]:
# Packages this notebook depends on
list.of.packages <- c("tidyverse", "entropy", "philentropy")

# Install whichever of them are not yet present in the library
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  install.packages(new.packages)
}

# Attach the packages. library() stops with an error when a package cannot be
# loaded, whereas require() only returns FALSE and lets a later cell fail with
# a less helpful "could not find function" message.
invisible(lapply(list.of.packages, library, character.only = TRUE))

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘poorman’


Loading required package: tidyverse

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loadin

## Load Data

In [2]:
# Location and file names of the competition keystroke logs
input.dir <- "/kaggle/input/linking-writing-processes-to-writing-quality"

input.train <- "train_logs.csv"
input.test <- "test_logs.csv"

# Read both logs into data frames (one row per logged event)
train <- read.csv(file = file.path(input.dir, input.train))
test <- read.csv(file = file.path(input.dir, input.test))

In [3]:
# Preview the first five rows of the training log
head(train, 5)

Unnamed: 0_level_0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
3,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
4,001519c8,4,106686,106777,91,Input,q,q,q,1,1
5,001519c8,5,107196,107323,127,Input,q,q,q,2,1


## Helper Functions

In [4]:
#' Label each logged event with the running number of the word it belongs to.
#'
#' Events are classified by `down_event` into word content ("q" and the listed
#' punctuation keys), deletions, separators, modifier keys and everything else.
#' Word-content and modifier events receive the current word number. Separator
#' and unrecognised events receive `NA` and advance the word number. When the
#' deletions typed since the last reset catch up with the word-content keys
#' typed, the whole attempt is blanked to `NA`.
#'
#' @param word_count Per-event vector; only its length (the number of events)
#'   is used.
#' @param down_event Character vector of key names, one per event.
#' @param activity Unused; kept so existing callers keep working.
#' @return Numeric vector with one entry per event: a word number or `NA`.
#'   Word numbers are increasing but not necessarily consecutive.
get_words <- function(word_count, down_event, activity) {
  rows <- length(word_count)

  # Preallocate instead of growing the vector one element at a time
  result <- rep(NA_real_, rows)

  backspace <- down_event %in% c("Backspace", "Delete")
  shift <- down_event %in% c("Shift", "Capslock", "Control")
  space <- down_event %in% c("Space", "Enter", "Tab", "-", "(", ")", "[", "]", "&")
  letter <- down_event %in% c("q", ".", ",", "'", ";", "!", "?", "/", "\\", "\"", ":", "$")

  n <- 1
  back_streak <- 0
  letter_streak <- 0

  # seq_len() yields an empty loop for empty input; 1:rows would iterate over
  # c(1, 0) and fail on a missing condition
  for (i in seq_len(rows)) {

    if (letter[i]) {
      # word content: extend the current word
      letter_streak <- letter_streak + 1
      back_streak <- 0
      result[i] <- n

    } else if (backspace[i]) {
      back_streak <- back_streak + 1

      if ((back_streak >= letter_streak) || (letter_streak == 0)) {
        # deletions have caught up with the typed keys: blank the whole
        # attempt (typed keys plus the deletions) and start a new word
        result[(i - back_streak - letter_streak + 1):i] <- NA
        letter_streak <- 0
        back_streak <- 0
        n <- n + 1
      } else {
        # part of the word survives, so the deletion stays inside the word
        result[i] <- n
      }

    } else if (space[i]) {
      # a single key followed by a separator is not counted as a word
      if (letter_streak == 1) {
        result[i - 1] <- NA
      }
      letter_streak <- 0
      back_streak <- 0
      result[i] <- NA
      n <- n + 1

    } else if (shift[i]) {
      # modifier keys belong to the current word and leave the streaks alone
      result[i] <- n

    } else {
      # any other event ends the current word
      result[i] <- NA
      n <- n + 1
    }
  }

  result
}

In [5]:
#' Label each run of non-word events (the gaps between words).
#'
#' An event counts as part of a gap when its `each_word` entry is `NA`. A gap's
#' label is one more than the number of word events that precede it, so
#' consecutive gap events share a label and every word event starts a new one.
#'
#' @param each_word Vector of word labels with `NA` for non-word events.
#' @return Character vector of the same length: the gap label for gap events,
#'   `NA` for word events. Empty input gives an empty result.
get_spaces <- function(each_word) {
  space <- is.na(each_word)

  # Vectorised form of "start at 1 and add one for every word event seen".
  # The counter is kept as a double so labels are formatted as before.
  label <- as.character(cumsum(!space) + 1)
  label[!space] <- NA

  label
}

In [6]:
#' Label each logged event with a running sentence number.
#'
#' A word event whose key is ".", "?" or "!" closes the current sentence.
#' After that, non-word events are labelled `NA` until the next word event,
#' which opens the following sentence. Further closing punctuation typed before
#' that next word is attached to the sentence just closed, and the event
#' immediately before it is relabelled to that sentence as well.
#'
#' @param down_event Character vector of key names, one per event.
#' @param is_word Logical vector, `TRUE` where the event belongs to a word.
#' @return Numeric vector with one entry per event: a sentence number or `NA`.
get_sentences <- function(down_event, is_word) {
  rows <- length(is_word)

  # Preallocate instead of growing the vector one element at a time
  result <- rep(NA_real_, rows)

  breaks <- down_event %in% c(".", "?", "!")
  valid <- breaks & is_word

  n <- 1
  # TRUE while we are between a sentence end and the next word
  check <- FALSE

  # seq_len() makes empty input return an empty result instead of failing
  for (i in seq_len(rows)) {
    if (check) {
      if (valid[i]) {
        # repeated end punctuation: keep it, and the event before it, in the
        # sentence that was just closed
        result[i - 1] <- n - 1
        result[i] <- n - 1
      } else if (is_word[i]) {
        # first word event after the break opens the next sentence
        check <- FALSE
        result[i] <- n
      } else {
        result[i] <- NA
      }
    } else if (valid[i]) {
      # sentence-ending punctuation closes sentence n
      result[i] <- n
      n <- n + 1
      check <- TRUE
    } else {
      result[i] <- n
    }
  }

  result
}

In [7]:
#' Label each run of events that fall between sentences.
#'
#' An event counts as part of a break when its `each_sentence` entry is `NA`.
#' A break's label is one more than the number of in-sentence events that
#' precede it, so consecutive break events share a label.
#'
#' @param each_sentence Vector of sentence labels with `NA` between sentences.
#' @return Numeric vector of the same length: the break label for break
#'   events, `NA` for in-sentence events. Empty input gives an empty result.
get_breaks <- function(each_sentence) {
  valid <- is.na(each_sentence)

  # Vectorised form of "start at 1 and add one for every in-sentence event"
  label <- cumsum(!valid) + 1
  label[!valid] <- NA

  label
}

In [8]:
#' Label each run of consecutive deletion events.
#'
#' An event is a deletion when its activity is exactly "Remove/Cut". A run's
#' label is one more than the number of non-deletion events that precede it,
#' so consecutive deletions share a label.
#'
#' @param activity Character vector of activity names, one per event.
#' @return Numeric vector of the same length: the run label for deletion
#'   events, `NA` otherwise. Empty input gives an empty result.
get_deletions <- function(activity) {
  # %in% never returns NA, so a missing activity is simply "not a deletion"
  # (comparing with == would produce NA and break the labelling)
  backspace <- activity %in% "Remove/Cut"

  # Vectorised form of "start at 1 and add one for every non-deletion event"
  label <- cumsum(!backspace) + 1
  label[!backspace] <- NA

  label
}

In [9]:
#' Split a sequence of events into bursts separated by long pauses.
#'
#' A pause is "long" when it exceeds the mean of `iki` plus two standard
#' deviations. The event carrying a long pause is the last event of its burst;
#' the next event starts a new burst.
#'
#' @param iki Numeric vector of pause lengths, one per event.
#' @return Numeric vector of burst numbers (starting at 1), one per event.
#'   Empty input gives an empty result.
get_bursts <- function(iki) {
  rows <- length(iki)
  if (rows == 0) {
    return(numeric(0))
  }

  threshold <- mean(iki, na.rm = TRUE) + 2 * sd(iki, na.rm = TRUE)
  breaks <- iki > threshold

  # The comparison is NA for missing pauses and whenever the threshold is
  # undefined (sd() of fewer than two values); treat those as "no break"
  breaks[is.na(breaks)] <- FALSE

  # Each event's burst number is 1 plus the number of breaks before it
  1 + cumsum(c(FALSE, breaks[-rows]))
}

In [10]:
#' Label each run of navigation / editing events.
#'
#' An event qualifies when its key is a left click or an arrow key, or when
#' the first word of its activity is "Replace", "Move" or "Paste". A run's
#' label is one more than the number of non-qualifying events that precede it,
#' so consecutive qualifying events share a label.
#'
#' @param down_event Character vector of key names, one per event.
#' @param activity Character vector of activity names, one per event.
#' @return Numeric vector with one entry per event: the run label for
#'   qualifying events, `NA` otherwise. Empty input gives an empty result.
get_other <- function(down_event, activity) {
  if (length(down_event) == 0) {
    return(numeric(0))
  }

  events <- down_event %in% c("Leftclick", "ArrowLeft", "ArrowRight", "ArrowUp", "ArrowDown")
  # only the first word is compared, so longer activity strings that start
  # with one of these words also match
  activities <- word(activity, 1) %in% c("Replace", "Move", "Paste")
  valid <- events | activities

  # Vectorised form of "start at 1 and add one for every non-qualifying event"
  label <- cumsum(!valid) + 1
  label[!valid] <- NA

  label
}

In [11]:
#' Linear trend of keystroke counts across time windows.
#'
#' Builds a series with one slot per window (at least 60 slots), fills in the
#' observed counts, leaves unobserved windows at zero, and returns the slope of
#' a least-squares line through that series.
#'
#' @param key_strokes Keystroke count for each observed window.
#' @param interval Window number for each count; numeric, or a factor /
#'   character vector whose labels are the window numbers.
#' @return Named numeric of length one (name "x"): the fitted slope.
get_slope <- function(key_strokes, interval) {
  # as.numeric() on a factor returns the internal level codes, not the
  # labels, so decode factors through their labels first
  if (is.factor(interval)) {
    interval <- as.character(interval)
  }
  interval <- as.numeric(interval)

  n_windows <- max(interval, 60)
  x <- seq_len(n_windows)

  # Index by window number so each count lands in its own window regardless
  # of the order in which the windows are supplied
  y <- rep(0, n_windows)
  y[interval] <- key_strokes

  coef(lm(y ~ x))[2]
}


In [12]:
#' Entropy of the keystroke distribution over time windows, scaled by volume.
#'
#' Pads the observed counts with zeros up to the number of windows (at least
#' 60), computes `entropy::entropy()` of the padded counts, and divides by the
#' total number of keystrokes.
#'
#' @param key_strokes Keystroke count for each observed window.
#' @param interval Window number for each count; numeric, or a factor /
#'   character vector whose labels are the window numbers.
#' @return Numeric of length one.
get_entropy <- function(key_strokes, interval) {
  # as.numeric() on a factor returns the internal level codes, not the
  # labels, so decode factors through their labels first
  if (is.factor(interval)) {
    interval <- as.character(interval)
  }
  interval <- as.numeric(interval)

  n_windows <- max(60, interval)
  padding <- rep(0, n_windows - length(key_strokes))

  entropy::entropy(c(key_strokes, padding)) / sum(key_strokes)
}

In [13]:
#' Divergence between the observed keystroke distribution and a flat one.
#'
#' Keeps only the first 60 windows, spreads the retained keystroke total evenly
#' over 60 slots as the reference, and compares it with the observed per-window
#' counts using `philentropy::JSD()` (log base 2).
#'
#' @param key_strokes Keystroke count for each observed window.
#' @param interval Window number for each count; numeric, or a factor /
#'   character vector whose labels are the window numbers.
#' @return The value returned by `philentropy::JSD()` for the two rows.
get_uniformity <- function(key_strokes, interval) {
  # as.numeric() on a factor returns the internal level codes, not the
  # labels, so decode factors through their labels first
  if (is.factor(interval)) {
    interval <- as.character(interval)
  }
  interval <- as.numeric(interval)

  keep <- interval <= 60
  key_strokes <- key_strokes[keep]
  interval <- interval[keep]

  total <- max(cumsum(key_strokes))
  a <- rep(total / 60, 60)

  # Index by window number so each count lands in its own window regardless
  # of the order in which the windows are supplied
  b <- rep(0, 60)
  b[interval] <- key_strokes

  # NOTE(review): both rows hold counts rather than probabilities and no
  # est.prob is passed - confirm this is what philentropy::JSD() expects
  philentropy::JSD(rbind(a, b), unit = "log2")
}

## Function for Adding Features

In [14]:
#' Add event-level features to a keystroke log.
#'
#' All features are computed separately for each `id`. The result keeps one
#' row per event and is returned grouped by `id` and `each_burst`.
#'
#' @param df Data frame with the columns used below: id, up_time, down_time,
#'   action_time, activity, down_event, cursor_position and word_count.
#' @return `df` with the added feature columns, grouped by id and each_burst.
add_features <- function(df) {
  df %>%
    group_by(id) %>%
    

    mutate(
      # time intervals
      # 1-based index of the 30000-unit window that up_time falls in
      # (presumably 30-second windows of millisecond timestamps - TODO confirm)
      # NOTE(review): stored as a factor, so as.numeric() on this column yields
      # level codes rather than window numbers - confirm consumers decode it
      interval = as.factor((up_time %/% 30000) + 1),
      
      # within word
      # run labels produced by the helper functions; NA marks events that are
      # outside the respective kind of run
      each_word = as.factor(get_words(word_count, down_event, activity)),
      each_space = as.factor(get_spaces(each_word)),
      is_word = ifelse(is.na(each_word), FALSE, TRUE),
      each_sentence = as.factor(get_sentences(down_event, is_word)),
      each_break = as.factor(get_breaks(each_sentence)),
      is_sentence = ifelse(is.na(each_sentence), FALSE, TRUE),
      each_deletion = as.factor(get_deletions(activity)),
      each_other = as.factor(get_other(down_event, activity)),
    
      # one-hot encode activity
      is_nonproduction = ifelse(activity == "Nonproduction", 1, 0),
      is_input = ifelse(activity == "Input", 1, 0),
      is_cut = ifelse(activity == "Remove/Cut", 1, 0),
      is_paste = ifelse(activity == "Paste", 1, 0),
      is_replace = ifelse(activity == "Replace", 1, 0),
      # matched on the first four characters because "Move" activities carry
      # extra text after the word
      is_move = ifelse(str_sub(activity, 1, 4) == "Move", 1, 0),
    
      # cumulative number of each action
      cum_nonproduction = cumsum(is_nonproduction),
      cum_input = cumsum(is_input),
      cum_cut = cumsum(is_cut),
      cum_paste = cumsum(is_paste),
      cum_replace = cumsum(is_replace),
      cum_move = cumsum(is_move),
    
      # find net and total characters
      # change in cursor position since the previous event, ignored for
      # nonproduction and move events
      net_chars = c(0, diff(cursor_position)),
      net_chars = ifelse((is_nonproduction + is_move == 1), 0, net_chars),
      char_count = cumsum(net_chars),
      # distance between the running character count and the cursor
      end_dist = char_count - cursor_position,
      # 1 when the cursor is within two positions of the start or of the
      # running character count
      leading_edge = ifelse(cursor_position <= 2 | end_dist <= 2, 1, 0),
           
      # iki
      # time from this event's key press to the next event's key press, minus
      # this event's own action_time; the last event and negative values are
      # set to 0
      iki = c(diff(down_time), 0) - action_time,
      iki = ifelse(iki > 0, iki, 0),
      
      # iki level - this is now done in the aggregation function
      #iki_level = as.factor(sapply(iki, get_level)),
      
      # bursts
      each_burst = as.factor(get_bursts(iki)),
    ) %>%
    
    # per-burst flags
    group_by(id, each_burst) %>%
    filter(!is.na(each_burst)) %>%
    # burst_r: the burst ends with a cut at the leading edge
    # burst_i: evaluated per row - the event is away from the leading edge
    # burst_p: the burst both starts and ends with an input event
    mutate(burst_r = ifelse((last(is_cut) == 1) & (last(leading_edge) == 1), 1, 0),
           burst_i = ifelse((leading_edge == 0), 1, 0),
           burst_p = ifelse((first(is_input) == 1) & (last(is_input) == 1), 1, 0))
  
}

## Function for Aggregating Across Features

In [15]:
#' Collapse the event-level feature table into one row of features per essay.
#'
#' Each section below computes one family of summary features per `id`; the
#' sections are then left-joined onto the basic counts, so every `id` present
#' in `df` appears exactly once and features that could not be computed for an
#' essay are `NA`.
#'
#' @param df Event-level data frame carrying the original log columns plus the
#'   columns added by `add_features()`.
#' @return A tibble with one row per `id`.
aggregate_features <- function(df) {

  # basic counts and iki
  basic <- df %>%
    group_by(id) %>%
    summarize(word_count = tail(word_count, 1),
              char_count = tail(char_count, 1),
              initial_pause_time = head(down_time, 1),
              total_time = tail(up_time, 1),
              total_keystrokes = length(event_id),
              num_del = sum(is_cut, na.rm = TRUE),
              mean_iki = mean(iki, na.rm = TRUE),
              median_iki = median(iki, na.rm = TRUE),
              sd_iki = sd(iki, na.rm = TRUE),
              max_iki = max(iki, na.rm = TRUE))


  # word iki metrics
  word.iki <- df %>%
    group_by(id) %>%
    filter(is_word) %>%
    summarize(mean_word_iki = mean(iki, na.rm = TRUE),
              sd_word_iki = sd(iki, na.rm = TRUE))


  # space iki metrics
  space.iki <- df %>%
    group_by(id) %>%
    filter(!is_word) %>%
    summarize(mean_space_iki = mean(iki, na.rm = TRUE),
              sd_space_iki = sd(iki, na.rm = TRUE))


  # between words time
  space.times <- df %>%
    group_by(id, each_space) %>%
    filter(!is.na(each_space)) %>%
    summarize(space_time = max(up_time, na.rm = TRUE) - min(down_time, na.rm = TRUE),
              .groups = "drop") %>%
    group_by(id) %>%
    summarize(mean_space_time = mean(space_time, na.rm = TRUE),
              sd_space_time = sd(space_time, na.rm = TRUE))


  # between sentence times
  break.times <- df %>%
    group_by(id, each_break) %>%
    filter(!is.na(each_break)) %>%
    summarize(break_time = max(up_time, na.rm = TRUE) - min(down_time, na.rm = TRUE),
              .groups = "drop") %>%
    group_by(id) %>%
    summarize(mean_break_time = mean(break_time, na.rm = TRUE),
              sd_break_time = sd(break_time, na.rm = TRUE))


  # specific length iki
  # the ranges are inclusive at both ends, so a pause of exactly 1000, 1500 or
  # 2000 is counted in two neighbouring bins
  specific.iki <- df %>%
    group_by(id) %>%
    summarize(iki_05_10 = sum(iki %in%  500:1000),
              iki_10_15 = sum(iki %in% 1000:1500),
              iki_15_20 = sum(iki %in% 1500:2000),
              iki_20_30 = sum(iki %in% 2000:3000),
              iki_30 = sum(iki > 3000, na.rm = TRUE))


  # long pauses
  # a between-word gap is "long" when it exceeds mean + 2 sd of the essay's iki
  long.pause <- df %>%
    group_by(id) %>%
    mutate(mean_word_iki = mean(iki, na.rm = TRUE),
           sd_word_iki = sd(iki, na.rm = TRUE),
           long_pause = mean_word_iki + 2 * sd_word_iki) %>%
    group_by(id, each_space) %>%
    filter(!is.na(each_space)) %>%
    summarize(space_time = max(up_time, na.rm = TRUE) - min(down_time, na.rm = TRUE),
              long_pause = first(long_pause),
              .groups = "drop") %>%
    group_by(id) %>%
    summarize(prop_long_pauses = mean(space_time > long_pause, na.rm = TRUE))


  # revisions
  revisions <- df %>%
    group_by(id) %>%
    mutate(mean_word_iki = mean(iki, na.rm = TRUE),
           sd_word_iki = sd(iki, na.rm = TRUE),
           long_pause = mean_word_iki + 2 * sd_word_iki,
           num_internal_insert = sum(is_input == 1 & leading_edge == 0)) %>%
    group_by(id, each_deletion) %>%
    filter(!is.na(each_deletion)) %>%
    summarize(max_iki = max(iki, na.rm = TRUE),
              long_pause = first(long_pause),
              num_internal_insert = first(num_internal_insert),
              leading_edge = first(leading_edge),
              .groups = "drop") %>%
    group_by(id) %>%
    summarize(num_revisions_leading = sum(leading_edge),
              num_revisions_internal = sum((max_iki <= long_pause) & (leading_edge == 0)) +
                first(num_internal_insert)) %>%
    mutate(num_revisions = num_revisions_leading + num_revisions_internal)

  # backspacing
  backspace <- df %>%
    group_by(id, each_deletion) %>%
    filter(!is.na(each_deletion)) %>%
    summarize(num_keys = n(),
              del_time = max(up_time, na.rm = TRUE) - min(down_time, na.rm = TRUE),
              .groups = "drop")

  backspace.single <- backspace %>%
    group_by(id) %>%
    filter(num_keys == 1) %>%
    summarize(mean_del_time_single = mean(del_time, na.rm = TRUE),
              sd_del_time_single = sd(del_time, na.rm = TRUE))

  backspace.multiple <- backspace %>%
    group_by(id) %>%
    filter(num_keys > 1) %>%
    summarize(mean_del_time_multiple = mean(del_time, na.rm = TRUE),
              sd_del_time_multiple = sd(del_time, na.rm = TRUE))


  # character proportions
  char.props <- df %>%
    group_by(id) %>%
    summarize(prop.final = last(char_count) / n(),
              prop.leading = mean(leading_edge))


  # bursts
  bursts <- df %>%
    group_by(id, each_burst) %>%
    filter(!is.na(each_burst)) %>%
    summarize(chars = sum(is_input),
              is_r = first(burst_r),
              is_i = first(burst_i),
              is_p = first(burst_p),
              words = last(word_count) - first(word_count),
              total_words = last(word_count),
              .groups = "drop") %>%
    mutate(words_p = ifelse((words > 0) & (is_p == 1), words, 0)) %>%
    group_by(id) %>%
    summarize(mean_burst_len = mean(chars, na.rm = TRUE),
              sd_burst_len = sd(chars, na.rm = TRUE),
              num_bursts = n(),
              prop_r = mean(is_r),
              prop_i = mean(is_i),
              prop_p = mean(is_p),
              total_words = max(total_words),
              words_p = sum(words_p)) %>%
    mutate(prop_words_p = words_p / total_words) %>%
    select(id, mean_burst_len, sd_burst_len, num_bursts,
           prop_r, prop_i, prop_p, prop_words_p)


  # transitions
  # share of gaps that consist only of modifier / whitespace keys.
  # n_distinct(na.rm = TRUE) is used because unique() has no na.rm argument
  # (it is silently ignored), which let NA count as one extra gap.
  transitions.word <- df %>%
    group_by(id) %>%
    mutate(total_transitions = n_distinct(each_space, na.rm = TRUE)) %>%
    group_by(id, each_space) %>%
    filter(all(down_event %in%  c("Shift", "Space", "Capslock", "Control", "Tab", "Enter"))) %>%
    group_by(id) %>%
    summarize(prop_trans_word = n_distinct(each_space, na.rm = TRUE) / first(total_transitions))

  transitions.sentence <- df %>%
    group_by(id) %>%
    mutate(total_transitions = n_distinct(each_break, na.rm = TRUE)) %>%
    group_by(id, each_break) %>%
    filter(all(down_event %in%  c("Shift", "Space", "Capslock", "Control", "Tab", "Enter"))) %>%
    group_by(id) %>%
    summarize(prop_trans_sentence = n_distinct(each_break, na.rm = TRUE) / first(total_transitions))

  # time intervals
  # `interval` may arrive as a factor. as.numeric() on a factor returns level
  # codes and grouping sorts by level order, neither of which has to match the
  # window numbers, so decode the labels to numbers once up front. Going
  # through as.character() is also a no-op for columns that are already numeric.
  df.interval <- df %>%
    ungroup() %>%
    mutate(interval = as.numeric(as.character(interval)))

  intervals <- df.interval %>%
    group_by(id, interval) %>%
    summarize(key_strokes = n(),
              .groups = "drop") %>%
    group_by(id) %>%
    # windows without any event are missing from the table; pad with zeros up
    # to the number of windows (at least 60) before taking the sd
    summarize(sd_interval_keys = sd(c(key_strokes, rep(0, max(60, interval) - length(key_strokes)))),
              slope_interval_keys = get_slope(key_strokes, interval),
              entropy_interval_keys = get_entropy(key_strokes, interval),
              # this is not compatible with some essays
              #uniformity_interval_keys = get_uniformity(key_strokes, interval),
              extremes_interval_keys = sum(sign(c(NA, diff(key_strokes))) != 0, na.rm = TRUE))

  intervals.dist <- df.interval %>%
    group_by(id, interval) %>%
    summarize(key_strokes = n(),
              row_number = last(event_id),
              .groups = "drop") %>%
    group_by(id) %>%
    mutate(dist_windows = c(NA, diff(row_number))) %>%
    summarise(mean_interval_dist = mean(dist_windows, na.rm = TRUE),
              sd_interval_dist = sd(dist_windows, na.rm = TRUE))


  # other events
  other.events <- df %>%
    group_by(id) %>%
    mutate(total_time = last(down_time)) %>%
    group_by(id, each_other) %>%
    filter(!is.na(each_other)) %>%
    summarize(other_time = max(up_time, na.rm = TRUE) - min(down_time, na.rm = TRUE),
              total_time = first(total_time),
              .groups = "drop") %>%
    group_by(id) %>%
    summarize(mean_other_time = mean(other_time, na.rm = TRUE),
              sd_other_time = sd(other_time, na.rm = TRUE),
              prop_other_time = sum(other_time, na.rm = TRUE) / first(total_time))


  # one row per id: start from the basic counts and attach every feature family
  result <- basic %>%
    left_join(word.iki, by = "id") %>%
    left_join(space.iki, by = "id") %>%
    left_join(space.times, by = "id") %>%
    left_join(break.times, by = "id") %>%
    left_join(specific.iki, by = "id") %>%
    left_join(long.pause, by = "id") %>%
    left_join(revisions, by = "id") %>%
    left_join(backspace.single, by = "id") %>%
    left_join(backspace.multiple, by = "id") %>%
    left_join(char.props, by = "id") %>%
    left_join(bursts, by = "id") %>%
    left_join(transitions.word, by = "id") %>%
    left_join(transitions.sentence, by = "id") %>%
    left_join(intervals, by = "id") %>%
    left_join(intervals.dist, by = "id") %>%
    left_join(other.events, by = "id")

  result
}

## Extract Features

In [16]:
#' Run the full feature pipeline on a keystroke log.
#'
#' @param df Keystroke log in the format accepted by `add_features()`.
#' @return The table produced by `aggregate_features()`.
extract <- function(df) {
  # event-level features first, then the per-id summary
  aggregate_features(add_features(df))
}

In [17]:
# Build the aggregated feature tables for the training and test logs
train_aggregate <- extract(train)
test_aggregate <- extract(test)

[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.
[1m[22m[36mℹ[39m

In [18]:
# Number of missing values in every feature column
na_train <- colSums(is.na(train_aggregate))
na_test <- colSums(is.na(test_aggregate))

# The same counts as a percentage of rows, rounded to one decimal place
prop_train <- round(na_train * 100 / nrow(train_aggregate), digits = 1)
prop_test <- round(na_test * 100 / nrow(test_aggregate), digits = 1)

# One row per feature, train and test side by side
na_table <- matrix(
  c(na_train, prop_train, na_test, prop_test),
  ncol = 4,
  dimnames = list(names(na_train), c("Train NAs", "%", "Test NAs", "%"))
)
na_table

Unnamed: 0,Train NAs,%,Test NAs,%.1
id,0,0.0,0,0.0
word_count,0,0.0,0,0.0
char_count,0,0.0,0,0.0
initial_pause_time,0,0.0,0,0.0
total_time,0,0.0,0,0.0
total_keystrokes,0,0.0,0,0.0
num_del,0,0.0,0,0.0
mean_iki,0,0.0,0,0.0
median_iki,0,0.0,0,0.0
sd_iki,0,0.0,0,0.0


## Save Data

In [19]:
# Destination directory and file names for the aggregated feature tables
output.dir <- "/kaggle/working"
output.train <- "aggregate_train_logs.csv"
output.test <- "aggregate_test_logs.csv"

# Write both tables as CSV without a row-name column
write.csv(train_aggregate, file = file.path(output.dir, output.train), row.names = FALSE)
write.csv(test_aggregate, file = file.path(output.dir, output.test), row.names = FALSE)