brendano / dlanalysis
- Source
- Commits
- Network (0)
- Issues (0)
- Downloads (0)
- Wiki (1)
- Graphs
-
Branch:
master
dlanalysis / util.R
| d60c16b1 » | brendano | 2008-06-25 | 1 | # util.R: | |
| 4a6ecb46 » | brendano | 2009-09-01 | 2 | # Utilities to make R a happier place | |
| d60c16b1 » | brendano | 2008-06-25 | 3 | # Brendan O'Connor, brenocon@gmail.com | |
| 4 | |||||
| 5 | |||||
| 73fe52a2 » | brendano | 2008-05-29 | 6 | options(showWarnCalls=T, showErrorCalls=T) | |
| 7164ce94 » | brendano | 2009-10-02 | 7 | ||
| 8 | if ( (numcol <-Sys.getenv("COLUMNS")) != "") { | ||||
| 9 | numcol = as.integer(numcol) | ||||
| 10 | options(width= numcol - 1) | ||||
| 11 | } else if (system("stty -a &>/dev/null") == 0) { | ||||
| 12 | # mac specific? probably bad in the R GUI too. | ||||
| 4a6ecb46 » | brendano | 2009-09-01 | 13 | numcol = as.integer(sub(".* ([0-9]+) column.*", "\\1", system("stty -a", intern=T)[1])) | |
| 14 | if (numcol > 0) | ||||
| 15 | options(width= numcol - 1 ) | ||||
| 16 | } | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 17 | ||
| 18 | |||||
| 19 | util = new.env() | ||||
| 20 | |||||
| 492df59a » | brendano | 2009-02-11 | 21 | ###### | |
| 22 | # | ||||
| 23 | # dataframe-outputting apply and aggregation functions. | ||||
| 24 | # dfagg, dfapply, df2matrix, matrix2df | ||||
| 25 | # i'm often confused whether proper R style should emphasize matrices or dataframes. | ||||
| 26 | # so these support for a dataframe-centric lifestyle. | ||||
| 27 | |||||
| 4a6ecb46 » | brendano | 2009-09-01 | 28 | reframe <- function(.data, ...) { | |
| 29 | eval(substitute(data.frame(...)), .data, parent.frame()) | ||||
| 30 | } | ||||
| 31 | |||||
| 32 | # reframe = function(.data, ...) { | ||||
| 33 | # e = eval(substitute(list(...)), .data, parent.frame()) | ||||
| 34 | # data.frame(e) | ||||
| 35 | # } | ||||
| 36 | |||||
| 37 | |||||
| 492df59a » | brendano | 2009-02-11 | 38 | ||
| 39 | util$dfagg <- function(d, byvals, fn, trim=TRUE) { | ||||
| 40 | # like by() but usually returns dataframes: | ||||
| 41 | # if fn() returns a list, a data frame is returned. | ||||
| 42 | # -> byvals are the row names. | ||||
| 43 | # -> each list is coerced into a row. | ||||
| 44 | # if fn() returns a nonlist, a vector is returned. | ||||
| 45 | # -> byvals are the names. | ||||
| 46 | # We attempt to be tolerant for slight inconsistencies in fn()'s return values. | ||||
| 47 | # | ||||
| 48 | # Goal is to be like SQL GROUP BY: dataframes in, aggregated dataframes out. | ||||
| 49 | # | ||||
| 50 | # If you have a multidimensional matrix (R calls "array"), apply() lets you | ||||
| 51 | # select the margin for rollup in a similar way. | ||||
| 97d19e55 » | brendano | 2009-03-24 | 52 | # | |
| 53 | # ALTERNATIVE: ddply() from hadley wickham's plyr: http://had.co.nz/plyr/ | ||||
| 492df59a » | brendano | 2009-02-11 | 54 | ||
| 55 | if (class(byvals) == 'function') | ||||
| 56 | byvals = byvals(d) | ||||
| 57 | if (trim && is.factor(byvals) && !setequal( present_levels(byvals), levels(byvals)) ) { | ||||
| 58 | # change to "stop" to find if necessary | ||||
| 59 | warning("Uhoh, byvals is a factor but only using only a subset of its levels. Trimming them. Hopefully this is what you want.") | ||||
| 60 | byvals = trim_levels(byvals) | ||||
| 61 | } | ||||
| 62 | |||||
| 63 | b = by(d, byvals, fn) | ||||
| 64 | list2df(b) | ||||
| 65 | } | ||||
| 66 | |||||
| 67 | util$list2df <- function(ls) { | ||||
| 68 | # Wants a list of lists, each of which has the same set named indexes. | ||||
| 69 | # Outputs a dataframe where said indexes are the column names. | ||||
| 70 | # Is tolerant for slight inconsistencies of present indexes. | ||||
| 71 | # Transfers list index names to dataframe rownames. | ||||
| 97d19e55 » | brendano | 2009-03-24 | 72 | # | |
| 73 | # ALTERNATIVE: ldply() -- i think -- from http://had.co.nz/plyr/ | ||||
| 492df59a » | brendano | 2009-02-11 | 74 | ||
| 75 | b=ls | ||||
| 76 | cols = NULL | ||||
| 77 | for (i in 1:min(100,length(b))) { | ||||
| 78 | cols = c(cols, try(names(b[[i]]))) | ||||
| 79 | } | ||||
| 80 | cols = unique(cols) | ||||
| 81 | |||||
| 50e282fb » | brendano | 2009-02-14 | 82 | dynamic_returns = ( | |
| 83 | if (class(b[[1]]) == "data.frame") TRUE | ||||
| 84 | else if (class(b[[1]]) == "list") FALSE | ||||
| 85 | else FALSE | ||||
| 86 | # else stop(paste("don't know how to aggregate returns of type",class(b[[1]]))) | ||||
| 87 | ) | ||||
| 88 | |||||
| 89 | ret = NULL | ||||
| 90 | if ( dynamic_returns ) { | ||||
| 91 | ret = as.list(rep(0, length(cols))) | ||||
| 92 | names(ret) = cols | ||||
| 93 | ret = data.frame(ret)[0,] | ||||
| 94 | |||||
| 95 | for (i in 1:length(b)) { | ||||
| 96 | ret = rbind(ret, b[[i]]) | ||||
| 97 | } | ||||
| 98 | } else { | ||||
| 99 | ret = data.frame(row.names=names(b)) | ||||
| 100 | for (col in cols) { | ||||
| 101 | # print(col) | ||||
| 102 | ret[,col] = sapply(names(b), function(k) { | ||||
| 103 | if (is.null(b[[k]])) { | ||||
| 104 | NA | ||||
| 105 | } else if (!is.null(names(b[[k]]))) { | ||||
| 106 | b[[k]][[col]] | ||||
| 107 | } else if (length(b[[k]])==1 && is.na(b[[k]])) { | ||||
| 108 | NA | ||||
| 109 | } else stop("dont know what to do with value ",b[[k]]) | ||||
| 110 | }) | ||||
| 111 | } | ||||
| 112 | if (length(cols) == 0) { | ||||
| 113 | return(sapply(names(b), function(k) b[[k]])) | ||||
| 114 | } | ||||
| 492df59a » | brendano | 2009-02-11 | 115 | } | |
| 116 | ret | ||||
| 117 | } | ||||
| 118 | |||||
| 119 | util$matrix2df <- function(x) { | ||||
| 120 | # sapply() with fn() yielding lists returns a matrix with named rows/cols ... | ||||
| 121 | # and whenever you name-index into this thing it return a list ... yuck | ||||
| 122 | # make that shit more normal. | ||||
| 123 | if (class(x) != 'matrix') stop("why is class ",class(x)) | ||||
| 124 | colnames = dimnames(x)[[2]] | ||||
| 125 | if (nrow(x) > 1) | ||||
| 126 | data.frame( | ||||
| 127 | sapply(colnames, function(n) unlist(x[,n])), | ||||
| 128 | row.names=row.names(x)) | ||||
| 129 | else | ||||
| 130 | # because sapply returns a named vector in this case... | ||||
| 131 | data.frame( | ||||
| 132 | t(sapply(colnames, function(n) unlist(x[,n]))), | ||||
| 133 | row.names=row.names(x)) | ||||
| 134 | } | ||||
| 135 | |||||
| 136 | util$dfapply <- function(collection, fn) { | ||||
| 137 | # like sapply/lapply except it expects fn() to yield lists. | ||||
| 138 | # each list gets coerced into a single row of a returned dataframe. | ||||
| 97d19e55 » | brendano | 2009-03-24 | 139 | # ALTERNATIVE: adply() -- i think -- from http://had.co.nz/plyr/ | |
| 140 | |||||
| 492df59a » | brendano | 2009-02-11 | 141 | r = sapply(collection, fn) | |
| 142 | r = base::t(r) | ||||
| 143 | # sapply gives real f'd up stuff for singleton list return values. compare replicate(10,list(a=unif(1))) vs replicate(10,list(a=runif(1),b=runif(1)). and the transposes are weirder | ||||
| 144 | if (length(unique(dimnames(r)[[2]])) == 1) { | ||||
| 145 | r = base::t(r) | ||||
| 146 | dimnames(r) = list(NULL, unique(dimnames(r)[[1]])) | ||||
| 147 | } | ||||
| 148 | r = matrix2df(r) | ||||
| 149 | row.names(r) = collection | ||||
| 150 | r | ||||
| 151 | } | ||||
| 152 | |||||
| 153 | |||||
| 154 | util$df2matrix <- function(d, bycols, targetcol, | ||||
| 155 | targetfn = if (is.numeric(d[,targetcol])) mean else most_common) | ||||
| 156 | { | ||||
| 157 | # for df's that essentially store sparse matrices. make a real matrix via | ||||
| 158 | # by()-like conditioning on multiple columns ... a contingency table. | ||||
| 159 | # Design goal: inspired by table(), which does the same thing, except cells are always counts. | ||||
| 160 | # | ||||
| 161 | # This is *NOT* the inverse of matrix2df ! would be good to change naming. | ||||
| 162 | # | ||||
| 163 | # e.g. you want to know the effects of "ps" and "t" on "acc", marginalizing out "size": | ||||
| 164 | # > head(d) | ||||
| 165 | # size ps t acc | ||||
| 166 | # 1 2 0.0009765625 -1 668 | ||||
| 167 | # 2 2 0.0009765625 0 668 | ||||
| 168 | # 3 2 0.0009765625 20 670 | ||||
| 169 | # 4 2 0.0009765625 50 664 | ||||
| 170 | # | ||||
| 171 | # you do: | ||||
| 172 | # > df2matrix(head(d), c('ps','t'), 'acc', mean) | ||||
| 173 | # -1 0 20 50 | ||||
| 174 | # 0.0009765625 668 668 670 664 | ||||
| 175 | # 0.5 668 668 NA NA | ||||
| 176 | # | ||||
| 177 | # then heatmap(.Last.value, Rowv=NA,Colv=NA,scale='none') or whatever else your heart desires | ||||
| 97d19e55 » | brendano | 2009-03-24 | 178 | # | |
| 179 | # ALTERNATIVE: daply() from hadley wickham's plyr: http://had.co.nz/plyr/ | ||||
| 492df59a » | brendano | 2009-02-11 | 180 | ||
| 181 | for (j in 1:length(bycols)) | ||||
| 182 | d[,bycols[j]] = factor(d[,bycols[j]]) | ||||
| 183 | |||||
| 184 | the_dimnames = lapply(1:length(bycols), function(j) levels((d[,bycols[j]])) ) | ||||
| 185 | |||||
| 186 | # the by() cascade: | ||||
| 187 | # we want, for bycols=c('ps','t') and targetcol='acc', finalfn=mean: | ||||
| 188 | # by(d,d$ps, function(x) by(x,x$t, function(x) mean(x$acc))) | ||||
| 189 | # so recursively build that linked list of closures, from right to left. | ||||
| 190 | by_cascade = list() | ||||
| 191 | by_cascade[[length(bycols)+1]] = function(x) targetfn(x[,targetcol]) | ||||
| 192 | |||||
| 193 | for (j in length(bycols):1) { | ||||
| 194 | by_cascade[[j]] = with(list(j=j), | ||||
| 195 | function(x) { | ||||
| 196 | by(x, x[,bycols[j]], by_cascade[[j+1]]) | ||||
| 197 | } | ||||
| 198 | ) | ||||
| 199 | } | ||||
| 200 | |||||
| 201 | b = by_cascade[[1]](d) | ||||
| 202 | m = array(NA, dim=sapply(the_dimnames,length), dimnames=the_dimnames) | ||||
| 203 | |||||
| 204 | # simplest and slowest: dont use any margins for assignments. | ||||
| 205 | # yes, this would be extremely speedy in c++ | ||||
| 206 | all_spots = multi_xprod(lapply(1:length(bycols), function(j) 1:length(the_dimnames[[j]]))) | ||||
| 207 | for (i in 1:length(all_spots)) { | ||||
| 208 | inds = all_spots[[i]] | ||||
| 209 | m[t(inds)] = b[[inds]] | ||||
| 210 | } | ||||
| 211 | m | ||||
| 212 | } | ||||
| 213 | |||||
| 214 | util$kill_df_lists <- function(d) { | ||||
| 215 | # if you have internal lists inside your dataframe. if you always use | ||||
| 216 | # matrix2df this should never happen. but sometimes it does. yikes! | ||||
| 217 | for(n in names(d)) | ||||
| 218 | if (is.list(d[,n])) | ||||
| 219 | d[,n] = list2v(d[,n]) | ||||
| 220 | d | ||||
| 221 | } | ||||
| 222 | |||||
| 223 | util$flipleft <- function(x, named_vec, by) { | ||||
| 224 | # Kinda dangerous but sometimes convenient: | ||||
| 225 | # Left join data frame `x` against named_vec, matching named_vec's names | ||||
| 226 | # against a column in x. | ||||
| 227 | # Returns the new column as a bare vector, same height as x. | ||||
| 228 | if (is.null(names(named_vec))) { | ||||
| 229 | stopifnot(length(named_vec) == nlevels(x[,by])) | ||||
| 230 | names(named_vec) = levels(x[,by]) | ||||
| 231 | } | ||||
| 232 | y = data.frame(row.names=names(named_vec), ze_y_value=named_vec) | ||||
| 233 | x$ze_orig_order = 1:nrow(x) | ||||
| 234 | merged = merge(x,y, by.x=by, by.y=0, all.x=T, all.y=F) | ||||
| 235 | |||||
| 236 | merged$ze_y_value[order( merged$ze_orig_order )] | ||||
| 237 | } | ||||
| 238 | |||||
| 239 | ####### | ||||
| 240 | |||||
| 7b5506d3 » | brendano | 2009-03-24 | 241 | util$read.tsv <- function(...) read.delim(..., quote='') # honest-to-goodness vanilla tsv with header | |
| 242 | |||||
| 73fe52a2 » | brendano | 2008-05-29 | 243 | util$msg <- function(...) cat(..., "\n", file=stderr()) | |
| 244 | |||||
| 245 | util$strlen <- function(s) length(strsplit(s,"")[[1]]) | ||||
| 246 | |||||
| 247 | util$strmatch <- function(pat,s) length(grep(pat,s)) > 0 | ||||
| 248 | |||||
| 249 | util$strstrip <- function(s) gsub("^\\s*|\\s*$", "", s) | ||||
| 250 | |||||
| d60c16b1 » | brendano | 2008-06-25 | 251 | util$as.c <- as.character | |
| 252 | |||||
| 3d974a50 » | brendano | 2008-05-30 | 253 | util$is_empty <- function(collection) length(collection) == 0 | |
| 254 | |||||
| d4d16257 » | brendano | 2008-06-29 | 255 | util$first <- function(x) head(x, 1) | |
| 256 | |||||
| 1d4b8217 » | brendano | 2008-06-19 | 257 | util$last <- function(x) tail(x, 1) | |
| 258 | |||||
| 73fe52a2 » | brendano | 2008-05-29 | 259 | util$unwhich <- function(indices, len=length(indices)) { | |
| 260 | ret = rep(F,len) | ||||
| 261 | ret[indices] = T | ||||
| 262 | ret | ||||
| 263 | } | ||||
| 264 | |||||
| 4aedf99c » | brendano | 2008-07-11 | 265 | util$nna <- function(...) !is.na(...) # i type this a lot, i think its worth 3 characters + shift key | |
| 266 | |||||
| 7b5506d3 » | brendano | 2009-03-24 | 267 | util$kna <- function(x) x[nna(x)] # kill NA's (from vector) | |
| 268 | |||||
| 4aedf99c » | brendano | 2008-07-11 | 269 | # hm: unitnorm and rescale are both subsumed in reshape:rescaler | |
| 270 | |||||
| b2ff0e9e » | brendano | 2008-07-03 | 271 | util$unitnorm <- function(x, na.rm=FALSE, ...) (x - mean(x,na.rm=na.rm,...)) / sd(x,na.rm=na.rm) | |
| 272 | |||||
| 273 | util$renorm <- function(x, mean=0, sd=1, ...) (unitnorm(x,...) * sd) + mean | ||||
| d60c16b1 » | brendano | 2008-06-25 | 274 | ||
| 4aedf99c » | brendano | 2008-07-11 | 275 | util$rescale <- function(x, bounds=range(x)) (x-bounds[1]) / (bounds[2]-bounds[1]) | |
| 276 | |||||
| d60c16b1 » | brendano | 2008-06-25 | 277 | util$rbern <- function(n, p=0.5) rbinom(n, size=1, prob=p) | |
| 278 | |||||
| 7b5506d3 » | brendano | 2009-03-24 | 279 | # util$my_rmultinom <- function(n, w=c(1,1,8)) { | |
| 280 | # # because rmultinom() doesn't make sense to me | ||||
| 281 | # # w are per-class weights (unnorm probs) | ||||
| 282 | # p = w / sum(w) | ||||
| 283 | # cutoffs = cumsum(p) | ||||
| 284 | # unif = runif(n) | ||||
| 285 | # mat = sapply(unif, function(x) x < cutoffs) | ||||
| 286 | # apply(mat, 2, function(x) min(which(x)) - 1) | ||||
| 287 | # } | ||||
| 288 | |||||
| 289 | util$boot_binom <- function(n, p) rbinom(1,n,p)/n | ||||
| 290 | |||||
| d60c16b1 » | brendano | 2008-06-25 | 291 | util$shuffle <- function(...) UseMethod("shuffle") | |
| 292 | |||||
| 293 | util$shuffle.default <- function(x) x[order(runif(length(x)))] | ||||
| 294 | |||||
| 295 | util$shuffle.data.frame <- function(x) x[order(runif(nrow(x))),] | ||||
| 42fbe53d » | brendano | 2008-06-20 | 296 | ||
| 50e282fb » | brendano | 2009-02-14 | 297 | util$sample_df <- function(d, size=10, ...) { | |
| 298 | samp = sample(1:nrow(d), size=size, ...) | ||||
| 299 | d[samp,] | ||||
| 300 | } | ||||
| 301 | |||||
| 42fbe53d » | brendano | 2008-06-20 | 302 | util$present_levels <- function(x) intersect(levels(x), x) | |
| 303 | |||||
| d4d16257 » | brendano | 2008-06-29 | 304 | util$trim_levels <- function(...) UseMethod("trim_levels") | |
| 305 | |||||
| 306 | util$trim_levels.factor <- function(x) factor(x, levels=present_levels(x)) | ||||
| 307 | |||||
| 308 | util$trim_levels.data.frame <- function(x) { | ||||
| 309 | for (n in names(x)) | ||||
| 310 | if (is.factor(x[,n])) | ||||
| 311 | x[,n] = trim_levels(x[,n]) | ||||
| 312 | x | ||||
| 313 | } | ||||
| 314 | |||||
| 315 | util$kill_names <- function(x) { names(x) = NULL; x } | ||||
| 42fbe53d » | brendano | 2008-06-20 | 316 | ||
| 492df59a » | brendano | 2009-02-11 | 317 | ||
| 318 | # Variants on table() | ||||
| 319 | |||||
| 320 | util$table.freq <- function(...) table(...) / sum(table(...)) | ||||
| 321 | |||||
| 322 | util$table.square <- function(x,y, ..., values=unique(c(as.c(x),as.c(y)))) { | ||||
| 323 | # intended for factor data | ||||
| 324 | x=as.c(x); y=as.c(y) | ||||
| 325 | for (i in 1:length(values)) for (j in 1:length(values)) { | ||||
| 326 | x = c(x, values[i]); y = c(y, values[j]) | ||||
| 327 | } | ||||
| 328 | legit_inds = as.c(x) %in% values & as.c(y) %in% values | ||||
| 329 | t = table(x[legit_inds], y[legit_inds], ...) | ||||
| 330 | # print(t) | ||||
| 331 | t = t - 1 | ||||
| 332 | t | ||||
| 333 | } | ||||
| 334 | |||||
| 335 | util$table.cond <- function(...) { | ||||
| 50e282fb » | brendano | 2009-02-14 | 336 | # for two args x,y: x on rows, y on cols, cells are P(y|x) | |
| 492df59a » | brendano | 2009-02-11 | 337 | t = table(...) | |
| 338 | for (x1 in 1:nrow(t)) | ||||
| 339 | t[x1,] = t[x1,] / sum(t[x1,]) | ||||
| 340 | t | ||||
| 341 | } | ||||
| 342 | |||||
| 343 | util$table.range <- function(x, min=NULL, max=NULL) { | ||||
| 344 | # like table(), but only for integers, and forces a contiguous range of bins | ||||
| 345 | # so counts of 0 can appear. Useful if you want to compare tables between | ||||
| 346 | # different datasets. | ||||
| 347 | if (is.null(min)) min = min(x) | ||||
| 348 | if (is.null(max)) max = max(x) | ||||
| 349 | x2 = rep(0, max-min+1) | ||||
| 350 | t = table(x) | ||||
| 351 | t_inds = as.integer(names(t)) | ||||
| 352 | t_mask = t_inds >= min & t_inds <= max | ||||
| 353 | t_inds = t_inds[t_mask] | ||||
| 354 | mask = t_inds - min + 1 | ||||
| 355 | x2[mask] = x2[mask] + t[t_mask] | ||||
| 356 | names(x2) = min:max | ||||
| 357 | x2 | ||||
| 358 | } | ||||
| 359 | |||||
| 360 | # Tie breakers | ||||
| 361 | |||||
| 1d4b8217 » | brendano | 2008-06-19 | 362 | util$fair_gt <- function(x,y) { | |
| 363 | # breaks ties arbitrarily. # of TRUE's should be halfway between > and >=. | ||||
| 364 | ret = rep(NA, length(x)) | ||||
| 365 | ret[x > y] = TRUE | ||||
| 366 | ret[x < y] = FALSE | ||||
| 367 | # which filler order? should randomly chooise either c(T,F) vs c(F,T) as the | ||||
| 368 | # seed (or a random permutation of 50/50 distribution on the whole length), | ||||
| 369 | # but not clear how to stably but arbitrarily choose one... hash the bitmap | ||||
| 370 | # of the concatenation of x and y perhaps. don't know how to do in highlevel R. | ||||
| 371 | filler_length = length(which(x==y)) | ||||
| 372 | filler = rep(c(TRUE,FALSE), ceiling(filler_length/2) )[1:filler_length] | ||||
| 373 | ret[which(x == y)] = filler | ||||
| 374 | ret | ||||
| 375 | } | ||||
| 376 | |||||
| 377 | util$fair_lt <- function(x,y) ! fair_gt(x,y) | ||||
| 378 | |||||
| 379 | util$rand_gt <- function(x,y) { | ||||
| 380 | # breaks ties randomly. | ||||
| 381 | ret = rep(NA, length(x)) | ||||
| 382 | ret[x > y] = TRUE | ||||
| 383 | ret[x < y] = FALSE | ||||
| 384 | filler_length = length(which(x==y)) | ||||
| 385 | filler = as.logical(rbern(filler_length)) | ||||
| 386 | ret[which(x == y)] = filler | ||||
| 387 | ret | ||||
| 388 | } | ||||
| 389 | |||||
| 390 | util$rand_lt <- function(x,y) ! rand_gt(x,y) | ||||
| 391 | |||||
| 37beba92 » | brendano | 2008-06-16 | 392 | util$most_common <- function(x) names(which.max(table(x, exclude=NULL))) | |
| 393 | |||||
| 5909eefd » | brendano | 2008-06-16 | 394 | util$p2o <- function(p) p / (1-p) # probability -> odds ratio | |
| 395 | util$o2p <- function(o) o / (1+o) # odds ratio -> probability | ||||
| 1d4b8217 » | brendano | 2008-06-19 | 396 | util$lo2p <-function(lo) o2p(2^lo) | |
| d60c16b1 » | brendano | 2008-06-25 | 397 | util$p2lo <-function(p) log2(p2o(p)) # i like base-2 logits best. | |
| c7cc1fd4 » | dolores | 2008-08-18 | 398 | util$lno2p <-function(lo) o2p(exp(lo)) | |
| 399 | util$p2lno <-function(p) log(p2o(p)) | ||||
| 5909eefd » | brendano | 2008-06-16 | 400 | ||
| 37beba92 » | brendano | 2008-06-16 | 401 | util$merge.list <- function(x,y,only.new.y=FALSE,append=FALSE,...) { | |
| 402 | # http://tolstoy.newcastle.edu.au/R/devel/04/11/1469.html | ||||
| 403 | out=x | ||||
| 404 | |||||
| 405 | ystructure = names(c(y,recursive=TRUE)) | ||||
| 406 | xstructure = names(c(x,recursive=TRUE)) | ||||
| 407 | yunique = ystructure[! ystructure %in% xstructure] | ||||
| 408 | |||||
| 409 | ystructure = sapply(ystructure,FUN=function(element) strsplit(element,"\\.")) | ||||
| 410 | xstructure = sapply(xstructure,FUN=function(element) strsplit(element,"\\.")) | ||||
| 411 | yunique = sapply(yunique,FUN=function(element) strsplit(element,"\\.")) | ||||
| 412 | |||||
| 413 | if (only.new.y) | ||||
| 414 | lapply(yunique, FUN=function(index) out[[index]]<<-y[[index]]) | ||||
| 415 | else { | ||||
| 416 | if (!append) { | ||||
| 417 | lapply(ystructure, FUN=function(index) out[[index]]<<-y[[index]]) | ||||
| 418 | } | ||||
| 419 | else lapply(ystructure, FUN=function(index) out[[index]]<<-c(out[[index]],y[[index]])) | ||||
| 420 | } | ||||
| 421 | return(out) | ||||
| 422 | } | ||||
| 423 | |||||
| 73fe52a2 » | brendano | 2008-05-29 | 424 | util$lax_rbind <- function(...) { | |
| 425 | inputs = list(...) | ||||
| 426 | each_names = sapply(inputs, names) | ||||
| 427 | all_names = unique(c(each_names, recursive=TRUE)) | ||||
| 2355b0c5 » | brendano | 2009-10-02 | 428 | if (is.data.frame(inputs[[1]]) && all(dim(inputs[[1]])==c(0,0))) | |
| 429 | inputs[[1]] = NULL | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 430 | for (k in 1:length(inputs)) { | |
| 431 | if (is.null(inputs[[k]])) next | ||||
| 432 | more = setdiff(all_names, names(inputs[[k]])) | ||||
| 2355b0c5 » | brendano | 2009-10-02 | 433 | names(inputs[[k]]) = c(names(inputs[[k]]), more) | |
| 434 | # inputs[[k]][,more] = NA | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 435 | } | |
| 436 | do.call(rbind, inputs) | ||||
| 437 | } | ||||
| 438 | |||||
| 439 | util$fill_bool <- function(bool, true='yes', false='no') { | ||||
| 440 | ret = rep(NA,length(bool)) | ||||
| 441 | names(ret) = names(bool) | ||||
| 442 | ret[bool] = true | ||||
| 443 | ret[!bool] = false | ||||
| 444 | ret | ||||
| 445 | } | ||||
| 446 | |||||
| 3d974a50 » | brendano | 2008-05-30 | 447 | util$trmap <- function(vec, translation_table) { | |
| 4aedf99c » | brendano | 2008-07-11 | 448 | # pre-obsoleted by chartr() ? like unix "tr" | |
| 3d974a50 » | brendano | 2008-05-30 | 449 | ret = rep(NA, length(vec)) | |
| 450 | for (x in names(translation_table)) | ||||
| 451 | ret[as.c(vec)==x] = translation_table[[x]] | ||||
| 452 | ret | ||||
| 453 | } | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 454 | ||
| 455 | util$bgrep <- function(pat,x, ...) { | ||||
| 3d974a50 » | brendano | 2008-05-30 | 456 | # "boolean" grep: return a logical vector ready for &, | etc ops. | |
| 457 | # so bgrep works in the world of vector ops like ==, %in%, etc. | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 458 | unwhich(grep(pat,x,...), length(x)) | |
| 459 | } | ||||
| 460 | |||||
| d60c16b1 » | brendano | 2008-06-25 | 461 | util$ngrep <- function(pat,x, ...) | |
| 462 | # "normal" grep: return values, not indices | ||||
| 463 | x[grep(pat,x,...)] | ||||
| 37beba92 » | brendano | 2008-06-16 | 464 | ||
| 73fe52a2 » | brendano | 2008-05-29 | 465 | util$tapply2 <- function(x, ...) { | |
| 4aedf99c » | brendano | 2008-07-11 | 466 | # like tapply but preserves factors | |
| 73fe52a2 » | brendano | 2008-05-29 | 467 | if (is.factor(x)) { | |
| 468 | r = factor(tapply(as.character(x), ...), levels=levels(x)) | ||||
| 469 | } else { | ||||
| 470 | r = tapply(x, ...) | ||||
| 471 | } | ||||
| 472 | r | ||||
| 473 | } | ||||
| 474 | |||||
| 475 | util$inject <- function(collection, start, fn) { | ||||
| d60c16b1 » | brendano | 2008-06-25 | 476 | # like lisp reduce. (named after ruby) | |
| 73fe52a2 » | brendano | 2008-05-29 | 477 | acc = start | |
| 478 | for (x in collection) | ||||
| 479 | acc = fn(acc, x) | ||||
| 480 | acc | ||||
| 481 | } | ||||
| 482 | |||||
| 483 | util$select <- function(collection, fn) { | ||||
| 4aedf99c » | brendano | 2008-07-11 | 484 | # like lisp filter. (named after ruby) | |
| 485 | # nice for lists. not useful for vectors, use boolean vector indexing instead. | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 486 | r = c() | |
| 487 | for (x in collection) | ||||
| 488 | if (fn(x)) | ||||
| 489 | r = c(r, x) | ||||
| 490 | r | ||||
| 491 | } | ||||
| 492 | |||||
| 493 | util$xprod <- function(xs,ys) { | ||||
| 494 | ret = list() | ||||
| 495 | i=0 | ||||
| 496 | for (x in xs) for (y in ys) { | ||||
| 497 | i = i+1 | ||||
| 498 | ret[[i]] = list(x=x,y=y) | ||||
| 499 | } | ||||
| 500 | ret | ||||
| 501 | } | ||||
| 502 | |||||
| d60c16b1 » | brendano | 2008-06-25 | 503 | util$multi_xprod <- function(args) { | |
| 504 | # args = list(...) | ||||
| 505 | pair_xprod <- function(xs,ys) { | ||||
| 506 | ret = list() | ||||
| 507 | i=0 | ||||
| 508 | for (x in xs) for (y in ys) { | ||||
| 509 | i = i+1 | ||||
| 510 | ret[[i]] = c(x,y) | ||||
| 511 | } | ||||
| 512 | ret | ||||
| 513 | } | ||||
| 514 | ret = list(NA) | ||||
| 515 | for (i in 1:length(args)) { | ||||
| 516 | ret = pair_xprod(ret, args[[i]]) | ||||
| 517 | } | ||||
| 518 | lapply(ret, function(x) x[2:length(x)]) | ||||
| 519 | } | ||||
| 520 | |||||
| 7b5506d3 » | brendano | 2009-03-24 | 521 | util$printf <- function(...) cat(sprintf(...)) | |
| 522 | |||||
| 523 | util$listprint <- function(x) { | ||||
| 524 | s = paste(sapply(names(x), function(n) sprintf("%s=%s", n,x[[n]])), collapse=' ') | ||||
| 525 | printf("%s\n", s) | ||||
| 526 | } | ||||
| 527 | |||||
| 528 | |||||
| 4aedf99c » | brendano | 2008-07-11 | 529 | # routines to help manage longrunning jobs and optimize performance. | |
| d60c16b1 » | brendano | 2008-06-25 | 530 | # so much more potential here... | |
| 531 | |||||
| 532 | util$timeit <- function(expr, name=NULL) { | ||||
| 533 | # print how long the expression takes, and return its value too. | ||||
| 4aedf99c » | brendano | 2008-07-11 | 534 | # So you can interpose timeit({ blabla }) around any chunk of code "blabla". | |
| 73fe52a2 » | brendano | 2008-05-29 | 535 | start = Sys.time() | |
| d60c16b1 » | brendano | 2008-06-25 | 536 | ret = eval(expr) | |
| 73fe52a2 » | brendano | 2008-05-29 | 537 | finish = Sys.time() | |
| d60c16b1 » | brendano | 2008-06-25 | 538 | if (!is.null(name)) cat(name,": ") | |
| 2355b0c5 » | brendano | 2009-10-02 | 539 | print(finish-start) | |
| 73fe52a2 » | brendano | 2008-05-29 | 540 | invisible(ret) | |
| 541 | } | ||||
| 542 | |||||
| b2ff0e9e » | brendano | 2008-07-03 | 543 | util$dotprogress <- function(callback, interval=10) { | |
| d60c16b1 » | brendano | 2008-06-25 | 544 | # intended to wrap the anonymous callback for sapply() or somesuch. | |
| 73fe52a2 » | brendano | 2008-05-29 | 545 | count = 0 | |
| 546 | return(function(...) { | ||||
| 547 | if ((count <<- count+1) %% interval == 0) | ||||
| 548 | cat(".") | ||||
| 549 | callback(...) | ||||
| 550 | }) | ||||
| 551 | } | ||||
| 552 | |||||
| 553 | |||||
| 492df59a » | brendano | 2009-02-11 | 554 | ######## | |
| 73fe52a2 » | brendano | 2008-05-29 | 555 | ||
| 7b5506d3 » | brendano | 2009-03-24 | 556 | util$hintonplot <- function(mat, max_value=max(abs(mat)), mid_value=0, ...) { | |
| 557 | # example for counts: | ||||
| 558 | # > t=table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3)) | ||||
| 559 | # > t | ||||
| 560 | # mpg | ||||
| 561 | # cyl (10.4,18.2] (18.2,26.1] (26.1,33.9] | ||||
| 562 | # 4 0 6 5 | ||||
| 563 | # 6 2 5 0 | ||||
| 564 | # 8 12 2 0 | ||||
| 565 | # > hintonplot(t) # same thing but graphically | ||||
| 566 | |||||
| 567 | plot.new() | ||||
| 568 | plot.window(xlim=c(0.5,ncol(mat)+0.5), ylim=c(0.5,nrow(mat)+0.5)) | ||||
| 569 | |||||
| 570 | x_mid = 1:ncol(mat) | ||||
| 571 | y_mid = 1:nrow(mat) | ||||
| 572 | |||||
| 573 | area = abs(mat) / max_value | ||||
| 574 | side = sqrt(area) | ||||
| 575 | |||||
| 576 | |||||
| 577 | for (x in 1:ncol(mat)) { | ||||
| 578 | for (y in nrow(mat):1) { | ||||
| 579 | # ym = (nrow(mat):1)[y] | ||||
| 580 | ym = y | ||||
| 581 | d = side[ym,x] / 2 | ||||
| 582 | rect(x-d, y-d, x+d, y+d, col=if (mat[ym,x]>0) 'darkblue' else 'darkred') | ||||
| 583 | } | ||||
| 584 | } | ||||
| 585 | |||||
| 586 | axis(1, 1:ncol(mat), labels=colnames(mat)) | ||||
| 587 | # axis(2, nrow(mat):1, labels=row.names(mat)) | ||||
| 588 | axis(2, 1:nrow(mat), labels=row.names(mat)) | ||||
| 589 | title(xlab=names(dimnames(mat))[2], ylab=names(dimnames(mat))[1], ...) | ||||
| 590 | } | ||||
| 591 | |||||
| 4a6ecb46 » | brendano | 2009-09-01 | 592 | util$linelight <- function(x,y, lty='dashed', col='lightgray', ...) { | |
| 593 | # highlight a point with lines running to the axes. | ||||
| 594 | left = par('usr')[1] | ||||
| 595 | bot = par('usr')[3] | ||||
| 596 | segments(left,y, x,y, lty=lty, col=col, ...) | ||||
| 597 | segments(x,bot, x,y, lty=lty, col=col, ...) | ||||
| 598 | } | ||||
| 599 | |||||
| 600 | util$binary_eval <- function(pred,labels, cutoff='naive', repar=TRUE, ...) { | ||||
| 7559a34b » | brendano | 2009-03-31 | 601 | library(ROCR) | |
| 602 | # plot(performance(prediction(pred,y),'acc')) | ||||
| 603 | rocr_pred = prediction(pred,labels) | ||||
| 604 | acc = performance(rocr_pred,'acc') | ||||
| 605 | f1 = performance(rocr_pred,'f') | ||||
| 606 | auc = performance(rocr_pred,'auc')@y.values[[1]] | ||||
| 607 | roc = performance(rocr_pred,'rec','spec') | ||||
| 608 | # sensspec = performance(rocr_pred,'rec','spec') | ||||
| 609 | pr_curve = performance(rocr_pred,'prec','rec') | ||||
| 4a6ecb46 » | brendano | 2009-09-01 | 610 | rp_curve = performance(rocr_pred,'rec','prec') | |
| 7559a34b » | brendano | 2009-03-31 | 611 | ||
| 612 | printf("AUC = %.3f\n", auc) | ||||
| 613 | |||||
| 614 | if (cutoff=='naive') { | ||||
| 615 | if (all(pred>=0) & all(pred<=1)) { | ||||
| 616 | printf("Predictions seem to be probabilities, so ") | ||||
| 617 | cutoff = 0.5 | ||||
| 618 | } else if (any(pred<0) & any(pred>0)) { | ||||
| 619 | printf("Predictions seem to be real-valued scores, so ") | ||||
| 620 | cutoff = 0 | ||||
| 621 | } else { | ||||
| 622 | warning("cant tell what naive cutoff should be") | ||||
| 623 | cutoff = NULL | ||||
| 624 | } | ||||
| 625 | printf("using naive cutoff %s:\n", cutoff) | ||||
| 626 | } else if (class(cutoff)=='character') { | ||||
| 627 | printf("Using %s-best cutoff ", cutoff) | ||||
| 4a6ecb46 » | brendano | 2009-09-01 | 628 | perf = performance(rocr_pred, cutoff, ...) | |
| 7559a34b » | brendano | 2009-03-31 | 629 | cutoff_ind = which.max(perf@y.values[[1]]) | |
| 4a6ecb46 » | brendano | 2009-09-01 | 630 | cutoff = if (cutoff=='prbe') perf@x.values[[1]][1] else rocr_pred@cutoffs[[1]][cutoff_ind] | |
| 7559a34b » | brendano | 2009-03-31 | 631 | printf("%f:\n", cutoff) | |
| 632 | } else { | ||||
| 633 | printf("For cutoff %s:\n", cutoff) | ||||
| 634 | } | ||||
| 635 | cutoff_ind = last(which(rocr_pred@cutoffs[[1]] >= cutoff)) | ||||
| 636 | |||||
| 637 | |||||
| 4a6ecb46 » | brendano | 2009-09-01 | 638 | if (repar) par(mfrow=c(2,2)) | |
| 639 | |||||
| 640 | pp = function(perf) { | ||||
| 641 | if (is.finite(cutoff_ind)) { | ||||
| 642 | x=perf@x.values[[1]][cutoff_ind] | ||||
| 643 | y=perf@y.values[[1]][cutoff_ind] | ||||
| 644 | points(x,y, col='blue') | ||||
| 645 | linelight(x,y, col='lightblue') | ||||
| 646 | } | ||||
| 647 | } | ||||
| 7559a34b » | brendano | 2009-03-31 | 648 | plot(acc); pp(acc) | |
| 649 | plot(f1); pp(f1) | ||||
| 650 | plot(roc); pp(roc) | ||||
| 651 | abline(a=1,b=-1,lty='dashed',col='gray') | ||||
| 652 | legend('bottomleft',legend=sprintf("AUC = %.3f",auc)) | ||||
| 4a6ecb46 » | brendano | 2009-09-01 | 653 | plot(rp_curve); pp(rp_curve) | |
| 654 | pp = function(ind,...) points(rp_curve@x.values[[1]][ind], rp_curve@y.values[[1]][ind], ...) | ||||
| 7559a34b » | brendano | 2009-03-31 | 655 | best_f1 = which.max(f1@y.values[[1]]) | |
| 4a6ecb46 » | brendano | 2009-09-01 | 656 | pp(best_f1, pch=2,col='green') | |
| 657 | f05 = performance(rocr_pred,'f',beta=0.5) | ||||
| 658 | best_f05 = which.max(f05@y.values[[1]]) | ||||
| 659 | pp(best_f05,pch=2,col='green') | ||||
| 660 | f2 = performance(rocr_pred,'f',beta=2) | ||||
| 661 | best_f2 = which.max(f2@y.values[[1]]) | ||||
| 662 | pp(best_f2,pch=2,col='green') | ||||
| 663 | |||||
| 664 | prbe = performance(rocr_pred,'prbe')@y.values[[1]] | ||||
| 665 | linelight(prbe,prbe,col='lightgray') | ||||
| 7559a34b » | brendano | 2009-03-31 | 666 | ||
| 667 | # printf("Acc = %.3f\n", mean((pred >= cutoff) == (labels > 0))) | ||||
| 668 | printf("Acc = %.3f\n", acc@y.values[[1]][cutoff_ind]) | ||||
| 669 | |||||
| 670 | printf(" F = %.3f\n", f1@y.values[[1]][cutoff_ind]) | ||||
| 671 | printf(" Prec = %.3f\n", pr_curve@y.values[[1]][cutoff_ind]) | ||||
| 672 | printf(" Rec = %.3f\n", pr_curve@x.values[[1]][cutoff_ind]) | ||||
| 673 | printf(" Spec = %.3f\n", roc@x.values[[1]][cutoff_ind]) | ||||
| 674 | if (rocr_pred@n.pos[[1]] != rocr_pred@n.neg[[1]]) | ||||
| 675 | printf("Balanced Acc = %.3f\n", mean(c(roc@x.values[[1]][cutoff_ind], roc@y.values[[1]][cutoff_ind]))) | ||||
| 676 | |||||
| 677 | |||||
| 678 | invisible(rocr_pred) | ||||
| 679 | } | ||||
| 680 | |||||
| 7b5506d3 » | brendano | 2009-03-24 | 681 | ||
| 682 | |||||
| 683 | |||||
| 684 | ######## | ||||
| 685 | |||||
| 492df59a » | brendano | 2009-02-11 | 686 | # for interactivity... | |
| d60c16b1 » | brendano | 2008-06-25 | 687 | ||
| 492df59a » | brendano | 2009-02-11 | 688 | util$excel <- function(d) { | |
| 7b5506d3 » | brendano | 2009-03-24 | 689 | f = paste("/tmp/tmp.", round(runif(1)*1000),".csv", sep='') | |
| 690 | # con = file(f, "w", encoding="MACROMAN") | ||||
| 691 | con = file(f, "w") | ||||
| 692 | write.csv(d, con, row.names=FALSE) | ||||
| 693 | close(con) | ||||
| 492df59a » | brendano | 2009-02-11 | 694 | # system(paste("open -a 'Microsoft Excel' ",f, sep='')) | |
| 695 | system(paste("open -a '/Applications/Microsoft Office 2008/Microsoft Excel.app' ",f, sep='')) | ||||
| 696 | } | ||||
| d60c16b1 » | brendano | 2008-06-25 | 697 | ||
| 492df59a » | brendano | 2009-02-11 | 698 | util$mate <- function(...) { | |
| 699 | system(paste("mate", ...)) | ||||
| 700 | } | ||||
| d60c16b1 » | brendano | 2008-06-25 | 701 | ||
| 7559a34b » | brendano | 2009-03-31 | 702 | util$vim <- function(...) { | |
| 703 | system(paste("vim",...)) | ||||
| 704 | } | ||||
| 705 | |||||
| 492df59a » | brendano | 2009-02-11 | 706 | util$ppy <- function(x, column.major=FALSE, ...) { | |
| 707 | # pretty-print as yaml. intended for rows with big textual cells. | ||||
| 708 | # a la mysql's \G operator | ||||
| 709 | # same usecase as ppy() in dotfiles.org/~brendano/.irbrc | ||||
| d60c16b1 » | brendano | 2008-06-25 | 710 | ||
| 492df59a » | brendano | 2009-02-11 | 711 | library(yaml) | |
| 712 | cat(as.yaml(x, column.major=column.major), ...) | ||||
| 713 | cat("\n", ...) | ||||
| 714 | } | ||||
| d60c16b1 » | brendano | 2008-06-25 | 715 | ||
| 492df59a » | brendano | 2009-02-11 | 716 | util$newwin <- function(x) { | |
| 717 | f = paste("/tmp/tmp.", round(runif(1)*100),".txt", sep='') | ||||
| 718 | capture.output(print(x),file=f) | ||||
| 719 | # system("FILE_TO_VIEW=/tmp/tmp.txt /Applications/Utilities/Terminal.app/Contents/MacOS/Terminal /users/brendano/sw/bin/lame_viewer.sh") | ||||
| 720 | # system("DISPLAY=:0 /usr/X11R6/bin/xterm -geometry 80x60 -e less /tmp/tmp.txt &") | ||||
| 721 | system(paste("mate ",f," &", sep='')) | ||||
| d60c16b1 » | brendano | 2008-06-25 | 722 | } | |
| 73fe52a2 » | brendano | 2008-05-29 | 723 | ||
| d60c16b1 » | brendano | 2008-06-25 | 724 | ||
| 492df59a » | brendano | 2009-02-11 | 725 | ########## | |
| 37beba92 » | brendano | 2008-06-16 | 726 | ||
| 492df59a » | brendano | 2009-02-11 | 727 | while("util" %in% search()) | |
| 728 | detach("util") | ||||
| 729 | attach(util) | ||||
| 73fe52a2 » | brendano | 2008-05-29 | 730 | ||
| 492df59a » | brendano | 2009-02-11 | 731 | ########## | |
| 73fe52a2 » | brendano | 2008-05-29 | 732 | ||
| 733 | |||||
| 734 | |||||
| 4aedf99c » | brendano | 2008-07-11 | 735 | ||
| 736 | |||||
| 492df59a » | brendano | 2009-02-11 | 737 | ||
| 738 | ## deprecated ## | ||||
| 4aedf99c » | brendano | 2008-07-11 | 739 | ||
| d4d16257 » | brendano | 2008-06-29 | 740 | util$mymerge <- function(x,y, row.x=F,row.y=F, keep.y=NULL, by=NULL, ...) { | |
| d60c16b1 » | brendano | 2008-06-25 | 741 | # Wrapper around merge(). turns out this is not needed because i didnt | |
| 742 | # read merge()'s manual page carefully enough: it has a facility for | ||||
| 743 | # joining on rownames. merge() is great. | ||||
| 744 | |||||
| 3d974a50 » | brendano | 2008-05-30 | 745 | if (row.x) x[,by] = row.names(x) | |
| 746 | if (row.y) y[,by] = row.names(y) | ||||
| 747 | |||||
| d4d16257 » | brendano | 2008-06-29 | 748 | ret = merge(x,y,by=by, suffixes=c('','.y'), ...) | |
| 5909eefd » | brendano | 2008-06-16 | 749 | if (row.x && nrow(ret)==nrow(x)) row.names(ret) = row.names(x) | |
| 750 | if (row.y && nrow(ret)==nrow(y)) row.names(ret) = row.names(y) | ||||
| 42fbe53d » | brendano | 2008-06-20 | 751 | ||
| 752 | if (!is.null(keep.y)) | ||||
| 753 | ret = ret[ ,c(names(x),keep.y) ] | ||||
| 3d974a50 » | brendano | 2008-05-30 | 754 | ret | |
| 73fe52a2 » | brendano | 2008-05-29 | 755 | } | |
| 756 | |||||
| 757 | util$read.xmlss <- function(f) { | ||||
| 7b5506d3 » | brendano | 2009-03-24 | 758 | # ALTERNATIVE: read.tsv(pipe("xlsx2tsv ...")) with github.com/brendano/tsvutils | |
| 759 | # xlsx is DIFFERENT from xmlss. on mac, need excel 2008 to get it | ||||
| 760 | |||||
| d60c16b1 » | brendano | 2008-06-25 | 761 | ## BAD BUG: the xml skips cells sometimes. tricky to parse, argh. | |
| 73fe52a2 » | brendano | 2008-05-29 | 762 | # Mac Excel 2004 calls this "XML Spreadsheet". It's nice because it's UTF-8. | |
| 763 | # [ mac .xls seems to be macroman, but xls2csv (perl converter) f's it up,. | ||||
| 764 | # and then iconv can't recover. boo! ] | ||||
| 7b5506d3 » | brendano | 2009-03-24 | 765 | ||
| 766 | |||||
| 73fe52a2 » | brendano | 2008-05-29 | 767 | csv_pipe = pipe(paste('ruby <<EOF | |
| 768 | require "rubygems" | ||||
| 769 | require "hpricot" | ||||
| 770 | require "fastercsv" | ||||
| 771 | h = Hpricot(File.read("',f,'")) | ||||
| 772 | mat = (h.at("worksheet")/"row").map{|row| (row/"cell").map{|data| data.inner_text}} | ||||
| 773 | mat.each{|row| puts row.to_csv} | ||||
| 774 | ', sep='')) | ||||
| 775 | df = read.csv(csv_pipe) | ||||
| 776 | # close(csv_pipe) | ||||
| 777 | df | ||||
| 778 | } | ||||
| 779 | |||||
| 492df59a » | brendano | 2009-02-11 | 780 | util$list2v <- function(x) sapply(x, I) # turns list's values into a vector. index names are dropped. pre-obsoleted by unlist() ? | |
| d60c16b1 » | brendano | 2008-06-25 | 781 | ||
