brendano / dlanalysis

a bunch of R code for various statistical analyses

dlanalysis / util.R
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 1 # util.R:
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 2 # Utilities to make R a happier place
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 3 # Brendan O'Connor, brenocon@gmail.com
4
5
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 6 options(showWarnCalls=T, showErrorCalls=T)
7164ce94 » brendano 2009-10-02 linux friendlier 7
8 if ( (numcol <-Sys.getenv("COLUMNS")) != "") {
9 numcol = as.integer(numcol)
10 options(width= numcol - 1)
11 } else if (system("stty -a &>/dev/null") == 0) {
12 # mac specific? probably bad in the R GUI too.
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 13 numcol = as.integer(sub(".* ([0-9]+) column.*", "\\1", system("stty -a", intern=T)[1]))
14 if (numcol > 0)
15 options(width= numcol - 1 )
16 }
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 17
18
19 util = new.env()
20
492df59a » brendano 2009-02-11 util reorg 21 ######
22 #
23 # dataframe-outputting apply and aggregation functions.
24 # dfagg, dfapply, df2matrix, matrix2df
25 # i'm often confused whether proper R style should emphasize matrices or dataframes.
26 # so these support for a dataframe-centric lifestyle.
27
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 28 reframe <- function(.data, ...) {
29 eval(substitute(data.frame(...)), .data, parent.frame())
30 }
31
32 # reframe = function(.data, ...) {
33 # e = eval(substitute(list(...)), .data, parent.frame())
34 # data.frame(e)
35 # }
36
37
492df59a » brendano 2009-02-11 util reorg 38
39 util$dfagg <- function(d, byvals, fn, trim=TRUE) {
40 # like by() but usually returns dataframes:
41 # if fn() returns a list, a data frame is returned.
42 # -> byvals are the row names.
43 # -> each list is coerced into a row.
44 # if fn() returns a nonlist, a vector is returned.
45 # -> byvals are the names.
46 # We attempt to be tolerant for slight inconsistencies in fn()'s return values.
47 #
48 # Goal is to be like SQL GROUP BY: dataframes in, aggregated dataframes out.
49 #
50 # If you have a multidimensional matrix (R calls "array"), apply() lets you
51 # select the margin for rollup in a similar way.
97d19e55 » brendano 2009-03-24 point out plyr equivalents 52 #
53 # ALTERNATIVE: ddply() from hadley wickham's plyr: http://had.co.nz/plyr/
492df59a » brendano 2009-02-11 util reorg 54
55 if (class(byvals) == 'function')
56 byvals = byvals(d)
57 if (trim && is.factor(byvals) && !setequal( present_levels(byvals), levels(byvals)) ) {
58 # change to "stop" to find if necessary
59 warning("Uhoh, byvals is a factor but only using only a subset of its levels. Trimming them. Hopefully this is what you want.")
60 byvals = trim_levels(byvals)
61 }
62
63 b = by(d, byvals, fn)
64 list2df(b)
65 }
66
67 util$list2df <- function(ls) {
68 # Wants a list of lists, each of which has the same set named indexes.
69 # Outputs a dataframe where said indexes are the column names.
70 # Is tolerant for slight inconsistencies of present indexes.
71 # Transfers list index names to dataframe rownames.
97d19e55 » brendano 2009-03-24 point out plyr equivalents 72 #
73 # ALTERNATIVE: ldply() -- i think -- from http://had.co.nz/plyr/
492df59a » brendano 2009-02-11 util reorg 74
75 b=ls
76 cols = NULL
77 for (i in 1:min(100,length(b))) {
78 cols = c(cols, try(names(b[[i]])))
79 }
80 cols = unique(cols)
81
50e282fb » brendano 2009-02-14 dfagg that can concatenate ... 82 dynamic_returns = (
83 if (class(b[[1]]) == "data.frame") TRUE
84 else if (class(b[[1]]) == "list") FALSE
85 else FALSE
86 # else stop(paste("don't know how to aggregate returns of type",class(b[[1]])))
87 )
88
89 ret = NULL
90 if ( dynamic_returns ) {
91 ret = as.list(rep(0, length(cols)))
92 names(ret) = cols
93 ret = data.frame(ret)[0,]
94
95 for (i in 1:length(b)) {
96 ret = rbind(ret, b[[i]])
97 }
98 } else {
99 ret = data.frame(row.names=names(b))
100 for (col in cols) {
101 # print(col)
102 ret[,col] = sapply(names(b), function(k) {
103 if (is.null(b[[k]])) {
104 NA
105 } else if (!is.null(names(b[[k]]))) {
106 b[[k]][[col]]
107 } else if (length(b[[k]])==1 && is.na(b[[k]])) {
108 NA
109 } else stop("dont know what to do with value ",b[[k]])
110 })
111 }
112 if (length(cols) == 0) {
113 return(sapply(names(b), function(k) b[[k]]))
114 }
492df59a » brendano 2009-02-11 util reorg 115 }
116 ret
117 }
118
119 util$matrix2df <- function(x) {
120 # sapply() with fn() yielding lists returns a matrix with named rows/cols ...
121 # and whenever you name-index into this thing it return a list ... yuck
122 # make that shit more normal.
123 if (class(x) != 'matrix') stop("why is class ",class(x))
124 colnames = dimnames(x)[[2]]
125 if (nrow(x) > 1)
126 data.frame(
127 sapply(colnames, function(n) unlist(x[,n])),
128 row.names=row.names(x))
129 else
130 # because sapply returns a named vector in this case...
131 data.frame(
132 t(sapply(colnames, function(n) unlist(x[,n]))),
133 row.names=row.names(x))
134 }
135
136 util$dfapply <- function(collection, fn) {
137 # like sapply/lapply except it expects fn() to yield lists.
138 # each list gets coerced into a single row of a returned dataframe.
97d19e55 » brendano 2009-03-24 point out plyr equivalents 139 # ALTERNATIVE: adply() -- i think -- from http://had.co.nz/plyr/
140
492df59a » brendano 2009-02-11 util reorg 141 r = sapply(collection, fn)
142 r = base::t(r)
143 # sapply gives real f'd up stuff for singleton list return values. compare replicate(10,list(a=unif(1))) vs replicate(10,list(a=runif(1),b=runif(1)). and the transposes are weirder
144 if (length(unique(dimnames(r)[[2]])) == 1) {
145 r = base::t(r)
146 dimnames(r) = list(NULL, unique(dimnames(r)[[1]]))
147 }
148 r = matrix2df(r)
149 row.names(r) = collection
150 r
151 }
152
153
154 util$df2matrix <- function(d, bycols, targetcol,
155 targetfn = if (is.numeric(d[,targetcol])) mean else most_common)
156 {
157 # for df's that essentially store sparse matrices. make a real matrix via
158 # by()-like conditioning on multiple columns ... a contingency table.
159 # Design goal: inspired by table(), which does the same thing, except cells are always counts.
160 #
161 # This is *NOT* the inverse of matrix2df ! would be good to change naming.
162 #
163 # e.g. you want to know the effects of "ps" and "t" on "acc", marginalizing out "size":
164 # > head(d)
165 # size ps t acc
166 # 1 2 0.0009765625 -1 668
167 # 2 2 0.0009765625 0 668
168 # 3 2 0.0009765625 20 670
169 # 4 2 0.0009765625 50 664
170 #
171 # you do:
172 # > df2matrix(head(d), c('ps','t'), 'acc', mean)
173 # -1 0 20 50
174 # 0.0009765625 668 668 670 664
175 # 0.5 668 668 NA NA
176 #
177 # then heatmap(.Last.value, Rowv=NA,Colv=NA,scale='none') or whatever else your heart desires
97d19e55 » brendano 2009-03-24 point out plyr equivalents 178 #
179 # ALTERNATIVE: daply() from hadley wickham's plyr: http://had.co.nz/plyr/
492df59a » brendano 2009-02-11 util reorg 180
181 for (j in 1:length(bycols))
182 d[,bycols[j]] = factor(d[,bycols[j]])
183
184 the_dimnames = lapply(1:length(bycols), function(j) levels((d[,bycols[j]])) )
185
186 # the by() cascade:
187 # we want, for bycols=c('ps','t') and targetcol='acc', finalfn=mean:
188 # by(d,d$ps, function(x) by(x,x$t, function(x) mean(x$acc)))
189 # so recursively build that linked list of closures, from right to left.
190 by_cascade = list()
191 by_cascade[[length(bycols)+1]] = function(x) targetfn(x[,targetcol])
192
193 for (j in length(bycols):1) {
194 by_cascade[[j]] = with(list(j=j),
195 function(x) {
196 by(x, x[,bycols[j]], by_cascade[[j+1]])
197 }
198 )
199 }
200
201 b = by_cascade[[1]](d)
202 m = array(NA, dim=sapply(the_dimnames,length), dimnames=the_dimnames)
203
204 # simplest and slowest: dont use any margins for assignments.
205 # yes, this would be extremely speedy in c++
206 all_spots = multi_xprod(lapply(1:length(bycols), function(j) 1:length(the_dimnames[[j]])))
207 for (i in 1:length(all_spots)) {
208 inds = all_spots[[i]]
209 m[t(inds)] = b[[inds]]
210 }
211 m
212 }
213
214 util$kill_df_lists <- function(d) {
215 # if you have internal lists inside your dataframe. if you always use
216 # matrix2df this should never happen. but sometimes it does. yikes!
217 for(n in names(d))
218 if (is.list(d[,n]))
219 d[,n] = list2v(d[,n])
220 d
221 }
222
223 util$flipleft <- function(x, named_vec, by) {
224 # Kinda dangerous but sometimes convenient:
225 # Left join data frame `x` against named_vec, matching named_vec's names
226 # against a column in x.
227 # Returns the new column as a bare vector, same height as x.
228 if (is.null(names(named_vec))) {
229 stopifnot(length(named_vec) == nlevels(x[,by]))
230 names(named_vec) = levels(x[,by])
231 }
232 y = data.frame(row.names=names(named_vec), ze_y_value=named_vec)
233 x$ze_orig_order = 1:nrow(x)
234 merged = merge(x,y, by.x=by, by.y=0, all.x=T, all.y=F)
235
236 merged$ze_y_value[order( merged$ze_orig_order )]
237 }
238
239 #######
240
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 241 util$read.tsv <- function(...) read.delim(..., quote='') # honest-to-goodness vanilla tsv with header
242
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 243 util$msg <- function(...) cat(..., "\n", file=stderr())
244
245 util$strlen <- function(s) length(strsplit(s,"")[[1]])
246
247 util$strmatch <- function(pat,s) length(grep(pat,s)) > 0
248
249 util$strstrip <- function(s) gsub("^\\s*|\\s*$", "", s)
250
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 251 util$as.c <- as.character
252
3d974a50 » brendano 2008-05-30 accuracy, prec/recall analysis 253 util$is_empty <- function(collection) length(collection) == 0
254
d4d16257 » brendano 2008-06-29 gold calibration for contin... 255 util$first <- function(x) head(x, 1)
256
1d4b8217 » brendano 2008-06-19 EM works! a little bit at ... 257 util$last <- function(x) tail(x, 1)
258
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 259 util$unwhich <- function(indices, len=length(indices)) {
260 ret = rep(F,len)
261 ret[indices] = T
262 ret
263 }
264
4aedf99c » brendano 2008-07-11 util cleanups 265 util$nna <- function(...) !is.na(...) # i type this a lot, i think its worth 3 characters + shift key
266
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 267 util$kna <- function(x) x[nna(x)] # kill NA's (from vector)
268
4aedf99c » brendano 2008-07-11 util cleanups 269 # hm: unitnorm and rescale are both subsumed in reshape:rescaler
270
b2ff0e9e » brendano 2008-07-03 checkpoint for stuff 271 util$unitnorm <- function(x, na.rm=FALSE, ...) (x - mean(x,na.rm=na.rm,...)) / sd(x,na.rm=na.rm)
272
273 util$renorm <- function(x, mean=0, sd=1, ...) (unitnorm(x,...) * sd) + mean
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 274
4aedf99c » brendano 2008-07-11 util cleanups 275 util$rescale <- function(x, bounds=range(x)) (x-bounds[1]) / (bounds[2]-bounds[1])
276
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 277 util$rbern <- function(n, p=0.5) rbinom(n, size=1, prob=p)
278
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 279 # util$my_rmultinom <- function(n, w=c(1,1,8)) {
280 # # because rmultinom() doesn't make sense to me
281 # # w are per-class weights (unnorm probs)
282 # p = w / sum(w)
283 # cutoffs = cumsum(p)
284 # unif = runif(n)
285 # mat = sapply(unif, function(x) x < cutoffs)
286 # apply(mat, 2, function(x) min(which(x)) - 1)
287 # }
288
289 util$boot_binom <- function(n, p) rbinom(1,n,p)/n
290
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 291 util$shuffle <- function(...) UseMethod("shuffle")
292
293 util$shuffle.default <- function(x) x[order(runif(length(x)))]
294
295 util$shuffle.data.frame <- function(x) x[order(runif(nrow(x))),]
42fbe53d » brendano 2008-06-20 more EM stuff 296
50e282fb » brendano 2009-02-14 dfagg that can concatenate ... 297 util$sample_df <- function(d, size=10, ...) {
298 samp = sample(1:nrow(d), size=size, ...)
299 d[samp,]
300 }
301
42fbe53d » brendano 2008-06-20 more EM stuff 302 util$present_levels <- function(x) intersect(levels(x), x)
303
d4d16257 » brendano 2008-06-29 gold calibration for contin... 304 util$trim_levels <- function(...) UseMethod("trim_levels")
305
306 util$trim_levels.factor <- function(x) factor(x, levels=present_levels(x))
307
308 util$trim_levels.data.frame <- function(x) {
309 for (n in names(x))
310 if (is.factor(x[,n]))
311 x[,n] = trim_levels(x[,n])
312 x
313 }
314
315 util$kill_names <- function(x) { names(x) = NULL; x }
42fbe53d » brendano 2008-06-20 more EM stuff 316
492df59a » brendano 2009-02-11 util reorg 317
318 # Variants on table()
319
320 util$table.freq <- function(...) table(...) / sum(table(...))
321
322 util$table.square <- function(x,y, ..., values=unique(c(as.c(x),as.c(y)))) {
323 # intended for factor data
324 x=as.c(x); y=as.c(y)
325 for (i in 1:length(values)) for (j in 1:length(values)) {
326 x = c(x, values[i]); y = c(y, values[j])
327 }
328 legit_inds = as.c(x) %in% values & as.c(y) %in% values
329 t = table(x[legit_inds], y[legit_inds], ...)
330 # print(t)
331 t = t - 1
332 t
333 }
334
335 util$table.cond <- function(...) {
50e282fb » brendano 2009-02-14 dfagg that can concatenate ... 336 # for two args x,y: x on rows, y on cols, cells are P(y|x)
492df59a » brendano 2009-02-11 util reorg 337 t = table(...)
338 for (x1 in 1:nrow(t))
339 t[x1,] = t[x1,] / sum(t[x1,])
340 t
341 }
342
343 util$table.range <- function(x, min=NULL, max=NULL) {
344 # like table(), but only for integers, and forces a contiguous range of bins
345 # so counts of 0 can appear. Useful if you want to compare tables between
346 # different datasets.
347 if (is.null(min)) min = min(x)
348 if (is.null(max)) max = max(x)
349 x2 = rep(0, max-min+1)
350 t = table(x)
351 t_inds = as.integer(names(t))
352 t_mask = t_inds >= min & t_inds <= max
353 t_inds = t_inds[t_mask]
354 mask = t_inds - min + 1
355 x2[mask] = x2[mask] + t[t_mask]
356 names(x2) = min:max
357 x2
358 }
359
360 # Tie breakers
361
1d4b8217 » brendano 2008-06-19 EM works! a little bit at ... 362 util$fair_gt <- function(x,y) {
363 # breaks ties arbitrarily. # of TRUE's should be halfway between > and >=.
364 ret = rep(NA, length(x))
365 ret[x > y] = TRUE
366 ret[x < y] = FALSE
367 # which filler order? should randomly chooise either c(T,F) vs c(F,T) as the
368 # seed (or a random permutation of 50/50 distribution on the whole length),
369 # but not clear how to stably but arbitrarily choose one... hash the bitmap
370 # of the concatenation of x and y perhaps. don't know how to do in highlevel R.
371 filler_length = length(which(x==y))
372 filler = rep(c(TRUE,FALSE), ceiling(filler_length/2) )[1:filler_length]
373 ret[which(x == y)] = filler
374 ret
375 }
376
377 util$fair_lt <- function(x,y) ! fair_gt(x,y)
378
379 util$rand_gt <- function(x,y) {
380 # breaks ties randomly.
381 ret = rep(NA, length(x))
382 ret[x > y] = TRUE
383 ret[x < y] = FALSE
384 filler_length = length(which(x==y))
385 filler = as.logical(rbern(filler_length))
386 ret[which(x == y)] = filler
387 ret
388 }
389
390 util$rand_lt <- function(x,y) ! rand_gt(x,y)
391
37beba92 » brendano 2008-06-16 code used for blog.dl.c/?p=... 392 util$most_common <- function(x) names(which.max(table(x, exclude=NULL)))
393
5909eefd » brendano 2008-06-16 worker modeling off the gol... 394 util$p2o <- function(p) p / (1-p) # probability -> odds ratio
395 util$o2p <- function(o) o / (1+o) # odds ratio -> probability
1d4b8217 » brendano 2008-06-19 EM works! a little bit at ... 396 util$lo2p <-function(lo) o2p(2^lo)
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 397 util$p2lo <-function(p) log2(p2o(p)) # i like base-2 logits best.
c7cc1fd4 » dolores 2008-08-18 new amt changes + other stu... 398 util$lno2p <-function(lo) o2p(exp(lo))
399 util$p2lno <-function(p) log(p2o(p))
5909eefd » brendano 2008-06-16 worker modeling off the gol... 400
37beba92 » brendano 2008-06-16 code used for blog.dl.c/?p=... 401 util$merge.list <- function(x,y,only.new.y=FALSE,append=FALSE,...) {
402 # http://tolstoy.newcastle.edu.au/R/devel/04/11/1469.html
403 out=x
404
405 ystructure = names(c(y,recursive=TRUE))
406 xstructure = names(c(x,recursive=TRUE))
407 yunique = ystructure[! ystructure %in% xstructure]
408
409 ystructure = sapply(ystructure,FUN=function(element) strsplit(element,"\\."))
410 xstructure = sapply(xstructure,FUN=function(element) strsplit(element,"\\."))
411 yunique = sapply(yunique,FUN=function(element) strsplit(element,"\\."))
412
413 if (only.new.y)
414 lapply(yunique, FUN=function(index) out[[index]]<<-y[[index]])
415 else {
416 if (!append) {
417 lapply(ystructure, FUN=function(index) out[[index]]<<-y[[index]])
418 }
419 else lapply(ystructure, FUN=function(index) out[[index]]<<-c(out[[index]],y[[index]]))
420 }
421 return(out)
422 }
423
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 424 util$lax_rbind <- function(...) {
425 inputs = list(...)
426 each_names = sapply(inputs, names)
427 all_names = unique(c(each_names, recursive=TRUE))
2355b0c5 » brendano 2009-10-02 bugfixes 428 if (is.data.frame(inputs[[1]]) && all(dim(inputs[[1]])==c(0,0)))
429 inputs[[1]] = NULL
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 430 for (k in 1:length(inputs)) {
431 if (is.null(inputs[[k]])) next
432 more = setdiff(all_names, names(inputs[[k]]))
2355b0c5 » brendano 2009-10-02 bugfixes 433 names(inputs[[k]]) = c(names(inputs[[k]]), more)
434 # inputs[[k]][,more] = NA
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 435 }
436 do.call(rbind, inputs)
437 }
438
439 util$fill_bool <- function(bool, true='yes', false='no') {
440 ret = rep(NA,length(bool))
441 names(ret) = names(bool)
442 ret[bool] = true
443 ret[!bool] = false
444 ret
445 }
446
3d974a50 » brendano 2008-05-30 accuracy, prec/recall analysis 447 util$trmap <- function(vec, translation_table) {
4aedf99c » brendano 2008-07-11 util cleanups 448 # pre-obsoleted by chartr() ? like unix "tr"
3d974a50 » brendano 2008-05-30 accuracy, prec/recall analysis 449 ret = rep(NA, length(vec))
450 for (x in names(translation_table))
451 ret[as.c(vec)==x] = translation_table[[x]]
452 ret
453 }
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 454
455 util$bgrep <- function(pat,x, ...) {
3d974a50 » brendano 2008-05-30 accuracy, prec/recall analysis 456 # "boolean" grep: return a logical vector ready for &, | etc ops.
457 # so bgrep works in the world of vector ops like ==, %in%, etc.
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 458 unwhich(grep(pat,x,...), length(x))
459 }
460
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 461 util$ngrep <- function(pat,x, ...)
462 # "normal" grep: return values, not indices
463 x[grep(pat,x,...)]
37beba92 » brendano 2008-06-16 code used for blog.dl.c/?p=... 464
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 465 util$tapply2 <- function(x, ...) {
4aedf99c » brendano 2008-07-11 util cleanups 466 # like tapply but preserves factors
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 467 if (is.factor(x)) {
468 r = factor(tapply(as.character(x), ...), levels=levels(x))
469 } else {
470 r = tapply(x, ...)
471 }
472 r
473 }
474
475 util$inject <- function(collection, start, fn) {
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 476 # like lisp reduce. (named after ruby)
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 477 acc = start
478 for (x in collection)
479 acc = fn(acc, x)
480 acc
481 }
482
483 util$select <- function(collection, fn) {
4aedf99c » brendano 2008-07-11 util cleanups 484 # like lisp filter. (named after ruby)
485 # nice for lists. not useful for vectors, use boolean vector indexing instead.
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 486 r = c()
487 for (x in collection)
488 if (fn(x))
489 r = c(r, x)
490 r
491 }
492
493 util$xprod <- function(xs,ys) {
494 ret = list()
495 i=0
496 for (x in xs) for (y in ys) {
497 i = i+1
498 ret[[i]] = list(x=x,y=y)
499 }
500 ret
501 }
502
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 503 util$multi_xprod <- function(args) {
504 # args = list(...)
505 pair_xprod <- function(xs,ys) {
506 ret = list()
507 i=0
508 for (x in xs) for (y in ys) {
509 i = i+1
510 ret[[i]] = c(x,y)
511 }
512 ret
513 }
514 ret = list(NA)
515 for (i in 1:length(args)) {
516 ret = pair_xprod(ret, args[[i]])
517 }
518 lapply(ret, function(x) x[2:length(x)])
519 }
520
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 521 util$printf <- function(...) cat(sprintf(...))
522
523 util$listprint <- function(x) {
524 s = paste(sapply(names(x), function(n) sprintf("%s=%s", n,x[[n]])), collapse=' ')
525 printf("%s\n", s)
526 }
527
528
4aedf99c » brendano 2008-07-11 util cleanups 529 # routines to help manage longrunning jobs and optimize performance.
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 530 # so much more potential here...
531
532 util$timeit <- function(expr, name=NULL) {
533 # print how long the expression takes, and return its value too.
4aedf99c » brendano 2008-07-11 util cleanups 534 # So you can interpose timeit({ blabla }) around any chunk of code "blabla".
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 535 start = Sys.time()
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 536 ret = eval(expr)
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 537 finish = Sys.time()
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 538 if (!is.null(name)) cat(name,": ")
2355b0c5 » brendano 2009-10-02 bugfixes 539 print(finish-start)
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 540 invisible(ret)
541 }
542
b2ff0e9e » brendano 2008-07-03 checkpoint for stuff 543 util$dotprogress <- function(callback, interval=10) {
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 544 # intended to wrap the anonymous callback for sapply() or somesuch.
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 545 count = 0
546 return(function(...) {
547 if ((count <<- count+1) %% interval == 0)
548 cat(".")
549 callback(...)
550 })
551 }
552
553
492df59a » brendano 2009-02-11 util reorg 554 ########
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 555
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 556 util$hintonplot <- function(mat, max_value=max(abs(mat)), mid_value=0, ...) {
557 # example for counts:
558 # > t=table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3))
559 # > t
560 # mpg
561 # cyl (10.4,18.2] (18.2,26.1] (26.1,33.9]
562 # 4 0 6 5
563 # 6 2 5 0
564 # 8 12 2 0
565 # > hintonplot(t) # same thing but graphically
566
567 plot.new()
568 plot.window(xlim=c(0.5,ncol(mat)+0.5), ylim=c(0.5,nrow(mat)+0.5))
569
570 x_mid = 1:ncol(mat)
571 y_mid = 1:nrow(mat)
572
573 area = abs(mat) / max_value
574 side = sqrt(area)
575
576
577 for (x in 1:ncol(mat)) {
578 for (y in nrow(mat):1) {
579 # ym = (nrow(mat):1)[y]
580 ym = y
581 d = side[ym,x] / 2
582 rect(x-d, y-d, x+d, y+d, col=if (mat[ym,x]>0) 'darkblue' else 'darkred')
583 }
584 }
585
586 axis(1, 1:ncol(mat), labels=colnames(mat))
587 # axis(2, nrow(mat):1, labels=row.names(mat))
588 axis(2, 1:nrow(mat), labels=row.names(mat))
589 title(xlab=names(dimnames(mat))[2], ylab=names(dimnames(mat))[1], ...)
590 }
591
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 592 util$linelight <- function(x,y, lty='dashed', col='lightgray', ...) {
593 # highlight a point with lines running to the axes.
594 left = par('usr')[1]
595 bot = par('usr')[3]
596 segments(left,y, x,y, lty=lty, col=col, ...)
597 segments(x,bot, x,y, lty=lty, col=col, ...)
598 }
599
600 util$binary_eval <- function(pred,labels, cutoff='naive', repar=TRUE, ...) {
7559a34b » brendano 2009-03-31 binary_eval() and vim() 601 library(ROCR)
602 # plot(performance(prediction(pred,y),'acc'))
603 rocr_pred = prediction(pred,labels)
604 acc = performance(rocr_pred,'acc')
605 f1 = performance(rocr_pred,'f')
606 auc = performance(rocr_pred,'auc')@y.values[[1]]
607 roc = performance(rocr_pred,'rec','spec')
608 # sensspec = performance(rocr_pred,'rec','spec')
609 pr_curve = performance(rocr_pred,'prec','rec')
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 610 rp_curve = performance(rocr_pred,'rec','prec')
7559a34b » brendano 2009-03-31 binary_eval() and vim() 611
612 printf("AUC = %.3f\n", auc)
613
614 if (cutoff=='naive') {
615 if (all(pred>=0) & all(pred<=1)) {
616 printf("Predictions seem to be probabilities, so ")
617 cutoff = 0.5
618 } else if (any(pred<0) & any(pred>0)) {
619 printf("Predictions seem to be real-valued scores, so ")
620 cutoff = 0
621 } else {
622 warning("cant tell what naive cutoff should be")
623 cutoff = NULL
624 }
625 printf("using naive cutoff %s:\n", cutoff)
626 } else if (class(cutoff)=='character') {
627 printf("Using %s-best cutoff ", cutoff)
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 628 perf = performance(rocr_pred, cutoff, ...)
7559a34b » brendano 2009-03-31 binary_eval() and vim() 629 cutoff_ind = which.max(perf@y.values[[1]])
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 630 cutoff = if (cutoff=='prbe') perf@x.values[[1]][1] else rocr_pred@cutoffs[[1]][cutoff_ind]
7559a34b » brendano 2009-03-31 binary_eval() and vim() 631 printf("%f:\n", cutoff)
632 } else {
633 printf("For cutoff %s:\n", cutoff)
634 }
635 cutoff_ind = last(which(rocr_pred@cutoffs[[1]] >= cutoff))
636
637
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 638 if (repar) par(mfrow=c(2,2))
639
640 pp = function(perf) {
641 if (is.finite(cutoff_ind)) {
642 x=perf@x.values[[1]][cutoff_ind]
643 y=perf@y.values[[1]][cutoff_ind]
644 points(x,y, col='blue')
645 linelight(x,y, col='lightblue')
646 }
647 }
7559a34b » brendano 2009-03-31 binary_eval() and vim() 648 plot(acc); pp(acc)
649 plot(f1); pp(f1)
650 plot(roc); pp(roc)
651 abline(a=1,b=-1,lty='dashed',col='gray')
652 legend('bottomleft',legend=sprintf("AUC = %.3f",auc))
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 653 plot(rp_curve); pp(rp_curve)
654 pp = function(ind,...) points(rp_curve@x.values[[1]][ind], rp_curve@y.values[[1]][ind], ...)
7559a34b » brendano 2009-03-31 binary_eval() and vim() 655 best_f1 = which.max(f1@y.values[[1]])
4a6ecb46 » brendano 2009-09-01 reframe(), linelight(), and... 656 pp(best_f1, pch=2,col='green')
657 f05 = performance(rocr_pred,'f',beta=0.5)
658 best_f05 = which.max(f05@y.values[[1]])
659 pp(best_f05,pch=2,col='green')
660 f2 = performance(rocr_pred,'f',beta=2)
661 best_f2 = which.max(f2@y.values[[1]])
662 pp(best_f2,pch=2,col='green')
663
664 prbe = performance(rocr_pred,'prbe')@y.values[[1]]
665 linelight(prbe,prbe,col='lightgray')
7559a34b » brendano 2009-03-31 binary_eval() and vim() 666
667 # printf("Acc = %.3f\n", mean((pred >= cutoff) == (labels > 0)))
668 printf("Acc = %.3f\n", acc@y.values[[1]][cutoff_ind])
669
670 printf(" F = %.3f\n", f1@y.values[[1]][cutoff_ind])
671 printf(" Prec = %.3f\n", pr_curve@y.values[[1]][cutoff_ind])
672 printf(" Rec = %.3f\n", pr_curve@x.values[[1]][cutoff_ind])
673 printf(" Spec = %.3f\n", roc@x.values[[1]][cutoff_ind])
674 if (rocr_pred@n.pos[[1]] != rocr_pred@n.neg[[1]])
675 printf("Balanced Acc = %.3f\n", mean(c(roc@x.values[[1]][cutoff_ind], roc@y.values[[1]][cutoff_ind])))
676
677
678 invisible(rocr_pred)
679 }
680
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 681
682
683
684 ########
685
492df59a » brendano 2009-02-11 util reorg 686 # for interactivity...
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 687
492df59a » brendano 2009-02-11 util reorg 688 util$excel <- function(d) {
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 689 f = paste("/tmp/tmp.", round(runif(1)*1000),".csv", sep='')
690 # con = file(f, "w", encoding="MACROMAN")
691 con = file(f, "w")
692 write.csv(d, con, row.names=FALSE)
693 close(con)
492df59a » brendano 2009-02-11 util reorg 694 # system(paste("open -a 'Microsoft Excel' ",f, sep=''))
695 system(paste("open -a '/Applications/Microsoft Office 2008/Microsoft Excel.app' ",f, sep=''))
696 }
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 697
492df59a » brendano 2009-02-11 util reorg 698 util$mate <- function(...) {
699 system(paste("mate", ...))
700 }
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 701
7559a34b » brendano 2009-03-31 binary_eval() and vim() 702 util$vim <- function(...) {
703 system(paste("vim",...))
704 }
705
492df59a » brendano 2009-02-11 util reorg 706 util$ppy <- function(x, column.major=FALSE, ...) {
707 # pretty-print as yaml. intended for rows with big textual cells.
708 # a la mysql's \G operator
709 # same usecase as ppy() in dotfiles.org/~brendano/.irbrc
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 710
492df59a » brendano 2009-02-11 util reorg 711 library(yaml)
712 cat(as.yaml(x, column.major=column.major), ...)
713 cat("\n", ...)
714 }
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 715
492df59a » brendano 2009-02-11 util reorg 716 util$newwin <- function(x) {
717 f = paste("/tmp/tmp.", round(runif(1)*100),".txt", sep='')
718 capture.output(print(x),file=f)
719 # system("FILE_TO_VIEW=/tmp/tmp.txt /Applications/Utilities/Terminal.app/Contents/MacOS/Terminal /users/brendano/sw/bin/lame_viewer.sh")
720 # system("DISPLAY=:0 /usr/X11R6/bin/xterm -geometry 80x60 -e less /tmp/tmp.txt &")
721 system(paste("mate ",f," &", sep=''))
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 722 }
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 723
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 724
492df59a » brendano 2009-02-11 util reorg 725 ##########
37beba92 » brendano 2008-06-16 code used for blog.dl.c/?p=... 726
492df59a » brendano 2009-02-11 util reorg 727 while("util" %in% search())
728 detach("util")
729 attach(util)
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 730
492df59a » brendano 2009-02-11 util reorg 731 ##########
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 732
733
734
4aedf99c » brendano 2008-07-11 util cleanups 735
736
492df59a » brendano 2009-02-11 util reorg 737
738 ## deprecated ##
4aedf99c » brendano 2008-07-11 util cleanups 739
d4d16257 » brendano 2008-06-29 gold calibration for contin... 740 util$mymerge <- function(x,y, row.x=F,row.y=F, keep.y=NULL, by=NULL, ...) {
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 741 # Wrapper around merge(). turns out this is not needed because i didnt
742 # read merge()'s manual page carefully enough: it has a facility for
743 # joining on rownames. merge() is great.
744
3d974a50 » brendano 2008-05-30 accuracy, prec/recall analysis 745 if (row.x) x[,by] = row.names(x)
746 if (row.y) y[,by] = row.names(y)
747
d4d16257 » brendano 2008-06-29 gold calibration for contin... 748 ret = merge(x,y,by=by, suffixes=c('','.y'), ...)
5909eefd » brendano 2008-06-16 worker modeling off the gol... 749 if (row.x && nrow(ret)==nrow(x)) row.names(ret) = row.names(x)
750 if (row.y && nrow(ret)==nrow(y)) row.names(ret) = row.names(y)
42fbe53d » brendano 2008-06-20 more EM stuff 751
752 if (!is.null(keep.y))
753 ret = ret[ ,c(names(x),keep.y) ]
3d974a50 » brendano 2008-05-30 accuracy, prec/recall analysis 754 ret
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 755 }
756
757 util$read.xmlss <- function(f) {
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 758 # ALTERNATIVE: read.tsv(pipe("xlsx2tsv ...")) with github.com/brendano/tsvutils
759 # xlsx is DIFFERENT from xmlss. on mac, need excel 2008 to get it
760
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 761 ## BAD BUG: the xml skips cells sometimes. tricky to parse, argh.
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 762 # Mac Excel 2004 calls this "XML Spreadsheet". It's nice because it's UTF-8.
763 # [ mac .xls seems to be macroman, but xls2csv (perl converter) f's it up,.
764 # and then iconv can't recover. boo! ]
7b5506d3 » brendano 2009-03-24 hintonplot & more fun 765
766
73fe52a2 » brendano 2008-05-29 generic analysis initial co... 767 csv_pipe = pipe(paste('ruby <<EOF
768 require "rubygems"
769 require "hpricot"
770 require "fastercsv"
771 h = Hpricot(File.read("',f,'"))
772 mat = (h.at("worksheet")/"row").map{|row| (row/"cell").map{|data| data.inner_text}}
773 mat.each{|row| puts row.to_csv}
774 ', sep=''))
775 df = read.csv(csv_pipe)
776 # close(csv_pipe)
777 df
778 }
779
492df59a » brendano 2009-02-11 util reorg 780 util$list2v <- function(x) sapply(x, I) # turns list's values into a vector. index names are dropped. pre-obsoleted by unlist() ?
d60c16b1 » brendano 2008-06-25 xval stuff, c++ extension f... 781