Skip to content

Commit

Permalink
Added contribution analysis for WikiSym 2009, measures text live time.
Browse files Browse the repository at this point in the history
  • Loading branch information
lucadealfaro committed Oct 24, 2009
1 parent a0a76d6 commit aa37029
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 2 deletions.
2 changes: 2 additions & 0 deletions analysis/Makefile
Expand Up @@ -79,6 +79,7 @@ OUR_LIB_OBJS = textbuf.cmo text.cmo prioq.cmo filesystem_store.cmo \
revcount_analysis.cmo intertime_analysis.cmo \
trust_origin_analysis.cmo revs_to_files_analysis.cmo \
trust_for_online_analysis.cmo word_frequency.cmo \
contribution_analysis.cmo \
page_factory.cmo do_eval.cmo \
online_revision.cmo db_page.cmo online_page.cmo event_feed.cmo \
online_command_line.cmo updater.cmo
Expand All @@ -93,6 +94,7 @@ OUR_OPT_LIB_OBJS = textbuf.cmx text.cmx prioq.cmx filesystem_store.cmx \
revcount_analysis.cmx intertime_analysis.cmx \
trust_origin_analysis.cmx revs_to_files_analysis.cmx \
trust_for_online_analysis.cmx word_frequency.cmx \
contribution_analysis.cmx \
page_factory.cmx do_eval.cmx \
online_revision.cmx db_page.cmx online_page.cmx event_feed.cmx \
online_command_line.cmx updater.cmx
Expand Down
239 changes: 239 additions & 0 deletions analysis/contribution_analysis.ml
@@ -0,0 +1,239 @@
(*
Copyright (c) 2007-2009 The Regents of the University of California
All rights reserved.
Authors: Luca de Alfaro
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. The names of the contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*)

(** This analysis measures how much "text display time" each author
has contributed. Precisely, for each author, it computes the sum, over
all the words inserted by the author, of the number of seconds for
which each word was present as live text in the Wiki. *)
(** We are not making the claim that this is the best way to measure
author contribution. For one thing, this measure under-estimates the
work of people who revise and reformat revisions. Nevertheless, this
is a possible measure, and could be even more interesting if one then
multiplied the "text display time" for each page, by the average
number of daily page views. See our WikiSym 2008 paper for a more
thorough comparison of user contribution measures. *)

(* A word of page text.  The page class below stores text chunks as
   arrays of words. *)
type word = string
(* NOTE(review): this exception is not raised anywhere in the visible
   part of this file; confirm whether it is still needed. *)
exception Ins_text_in_deleted_chunk
(* Raised by the eval_newest method of the page class below when it is
   called before any new revision has been stored. *)
exception Illegal_call_eval_newest
(* NOTE(review): no unqualified use of a name from Eval_defs is apparent
   in the visible code; confirm whether this open is required. *)
open Eval_defs


(** Page object for the contribution analysis.

    Revisions are fed in one at a time via [add_revision]; once all
    revisions of the page have been added, [eval] credits the text that
    is still live and writes the per-author totals to [out_file].

    - [id]: page id, printed by [print_id_title].
    - [title]: page title, printed by [print_id_title].
    - [out_file]: channel on which all results are written.
    - [end_time]: time at which the wiki dump was produced; text that is
      live in the last revision is credited up to this time. *)
class page
  (id: int)
  (title: string)
  (out_file: out_channel)
  (end_time: float) (* Time at which the wiki dump was produced. *)
  =
  object (self)

    (* These are, respectively, the revision that immediately precedes
       the one just added, and the revision just added. *)
    val mutable old_rev_opt : Revision.revision option = None
    val mutable new_rev_opt : Revision.revision option = None

    (* Arrays of text chunks for the revision.  Chunk 0 is the live text;
       the subsequent chunks are the portions of "dead" text. *)
    val mutable chunks_a : word array array = [| [| |] |]
    (* For each word, we keep track of who is the author of the word
       (we remember the author id).  This array always has the same
       shape as chunks_a. *)
    val mutable author_a : int array array = [| [| |] |]

    (* This is a hashtable associating to each user_id the contribution
       accumulated so far (one unit of time_delta per live word). *)
    val contributions : (int, float) Hashtbl.t = Hashtbl.create 100
    (* This is a hashtable associating to each user_id the user name. *)
    val names : (int, string) Hashtbl.t = Hashtbl.create 100

    (** Prints the page id and title on [out_file], and flushes it.
        This is useful e.g. if we wanted to then multiply the
        contributions of each author to the page, by the page views of
        the page. *)
    method print_id_title =
      Printf.fprintf out_file "Page: %i Title: %S\n" id title;
      flush out_file

    (** Adds [amount] to the contribution recorded for [user_id],
        creating the entry if the user has none yet. *)
    method private inc_contribution (user_id: int) (amount: float) : unit =
      try
        let old_amount = Hashtbl.find contributions user_id in
        Hashtbl.replace contributions user_id (old_amount +. amount)
      with Not_found -> Hashtbl.add contributions user_id amount

    (** Analyzes the newest revision.  It determines which text has been
        newly inserted and which has been carried over from the previous
        revision, records the author of every word, and credits the
        authors of the text that was live between the two revisions.
        @raise Illegal_call_eval_newest if no new revision has been set. *)
    method private eval_newest : unit =
      (* The new revision is mandatory.  The old one may be missing (for
         the first revision of a page); in that case the elapsed time is
         taken to be 0, so no credit is given. *)
      let new_rev = match new_rev_opt with
          None -> raise Illegal_call_eval_newest
        | Some r -> r
      in
      (* Extracts user id and name for the latest revision.
         See revision.ml for a list of all the methods on revisions. *)
      let user_id = new_rev#get_user_id in
      let user_name = new_rev#get_user_name in
      (* Computes the difference in time between the last and previous
         revisions. *)
      let time_delta = match old_rev_opt with
          None -> 0.
        | Some old_rev -> new_rev#get_time -. old_rev#get_time
      in
      (* Stores the username, keeping the first name seen for each id. *)
      if not (Hashtbl.mem names user_id)
      then Hashtbl.add names user_id user_name;
      (* Gets the word list of the new revision. *)
      let new_wl = new_rev#get_words in
      (* Calls the function that tracks the text across revisions. *)
      let (new_chunks_a, medit_l) = Chdiff.text_tracking chunks_a new_wl in
      (* Constructs new_author_a, which contains the authors of each
         word in the new text.  Initially, we just create chunks of the
         right size, filled with 0. *)
      let blank_authors chunk = Array.make (Array.length chunk) 0 in
      let new_author_a = Array.map blank_authors new_chunks_a in
      (* Processes one element of the edit list: fills in new_author_a,
         and gives credit for the text that was live in the old revision. *)
      let apply_edit = function
          Editlist.Mins (word_idx, l) -> begin
            (* This is text added in the current version.  We mark who
               is the author of the text, but since the text has been
               live for 0 time so far, we do not increase user
               contributions. *)
            for i = word_idx to word_idx + l - 1 do
              new_author_a.(0).(i) <- user_id
            done
          end
        | Editlist.Mmov (src_word_idx, src_chunk_idx,
                         dst_word_idx, dst_chunk_idx, l) -> begin
            (* This is moved text: the author information is always
               copied across. *)
            if src_chunk_idx = 0 then begin
              (* The text comes from chunk 0, so it has been live in the
                 meantime: we also add credit to its author. *)
              for i = 0 to l - 1 do
                let a = author_a.(src_chunk_idx).(src_word_idx + i) in
                new_author_a.(dst_chunk_idx).(dst_word_idx + i) <- a;
                self#inc_contribution a time_delta
              done
            end else begin
              (* The text was not live.  Only copies the author
                 information across. *)
              for i = 0 to l - 1 do
                new_author_a.(dst_chunk_idx).(dst_word_idx + i) <-
                  author_a.(src_chunk_idx).(src_word_idx + i)
              done
            end
          end
        | Editlist.Mdel (word_idx, chunk_idx, l) -> begin
            (* If the text was live, we give credit for it.  Each deleted
               word is credited to its own author, hence the index i
               (and not word_idx, which would credit every word to the
               author of the first deleted word). *)
            if chunk_idx = 0 then begin
              for i = word_idx to word_idx + l - 1 do
                let a = author_a.(chunk_idx).(i) in
                self#inc_contribution a time_delta
              done
            end
          end
      in
      (* Does this analysis on the whole edit list. *)
      List.iter apply_edit medit_l;
      (* Replaces the old information with the new one. *)
      chunks_a <- new_chunks_a;
      author_a <- new_author_a

    (** Outputs the results of the analysis: one line per author on
        [out_file], with the contribution expressed in days. *)
    method private output_rep_contributions : unit =
      let print_contribution (id: int) (c: float) =
        (* Gets also the user name.  Defensive: an id without a recorded
           name is printed with an empty name rather than aborting the
           whole report with Not_found. *)
        let name = try Hashtbl.find names id with Not_found -> "" in
        (* Outputs the results, using days for the time measurement. *)
        Printf.fprintf out_file "Contrib by %d %S is %f\n" id name (c /. 86400.)
      in
      Hashtbl.iter print_contribution contributions;
      (* We flush the file, so if we break, we know in which page we break. *)
      flush out_file

    (** This method is called to add a new revision to be evaluated.
        It builds the revision object, analyzes it against the previous
        revision, and then makes it the previous revision. *)
    method add_revision
      (id: int) (* revision id *)
      (page_id: int) (* page id *)
      (timestamp: string) (* timestamp string *)
      (time: float) (* time, as a floating point *)
      (contributor: string) (* name of the contributor *)
      (user_id: int) (* user id *)
      (ip_addr: string)
      (username: string) (* name of the user *)
      (is_minor: bool)
      (comment: string)
      (text_init: string Vec.t) (* Text of the revision, still to be
                                   split into words *)
      : unit =
      (* Creates a new revision object.  This also takes care of parsing
         the text, etc. *)
      let r = new Revision.revision id page_id timestamp time contributor
        user_id ip_addr username is_minor comment text_init true in
      new_rev_opt <- Some r;
      (* Analyzes the text of the new revision and increments the
         contributions. *)
      self#eval_newest;
      (* Copies the new revision into the old one. *)
      old_rev_opt <- new_rev_opt

    (** This method is called once add_revision has been called for all
        the revisions of a page, and is used to do the final processing.
        In our case, we use it to increase the contribution amounts due
        to the text that is still live in the page, and to output the
        results. *)
    method eval : unit =
      match old_rev_opt with
        None -> () (* There were no revisions, nothing to do. *)
      | Some r -> begin
          (* Increments the contributions due to the text that is still
             live in the page, from the last revision to the dump time. *)
          let time_delta = end_time -. r#get_time in
          let credit_live_word (user_id: int) : unit =
            self#inc_contribution user_id time_delta in
          Array.iter credit_live_word author_a.(0);
          (* Outputs the results *)
          self#output_rep_contributions
        end

  end (* Contribution page *)

15 changes: 13 additions & 2 deletions analysis/page_factory.ml
Expand Up @@ -61,6 +61,7 @@ type analysis_t =
| Revisions_to_text
| AuthorText
| WordFequency
| Contribution_analysis

(** This is the class that stores a page, i.e., an article, and
contains the methods to work on it. This is the simplest implementation,
Expand Down Expand Up @@ -178,6 +179,9 @@ class page_factory
val mutable n_sigs = Online_types.n_past_revs
(* Set of robots *)
val mutable robots = Read_robots.empty_robot_set
(* End date for dump, when measuring user contributions.
The default is the current time *)
val mutable dump_end_date = Unix.time()

(* Files for output *)
val mutable out_file : out_channel = stderr (* also used for eval_file *)
Expand All @@ -199,6 +203,7 @@ class page_factory
| Do_nothing -> Printf.fprintf stderr "noop\n"; flush stderr
| AuthorText -> ()
| WordFequency -> ()
| Contribution_analysis -> Printf.fprintf stderr "contrib\n"; flush stderr

(* These methods are used to set the appropriate evaluation *)
method set_reputation () = mode <- Reputation_analysis
Expand All @@ -210,9 +215,11 @@ class page_factory
method set_trust_for_online () = mode <- Trust_for_online
method set_prune () = mode <- Prune_revisions
method set_revs_to_text () = mode <- Revisions_to_text

method set_author_text () = mode <- AuthorText
method set_word_freq () = mode <- WordFequency
method set_contribution_analysis () = mode <- Contribution_analysis
method set_dump_end_date (n: string) =
dump_end_date <- Timeconv.convert_time n

(* This sets various attributes *)
method set_eval_zip_error () = eval_zip_error <- true
Expand Down Expand Up @@ -249,6 +256,8 @@ class page_factory
("-word-freq", Arg.Unit self#set_word_freq, "Counts the frequency of each word");
("-compute_stats", Arg.Unit self#set_reputation, "Produces the reduced stats files used to compute author reputation.");
("-do_text", Arg.Set do_text, "Uses also text longevity to compute reputation increments.");
("-eval_contrib", Arg.Unit self#set_contribution_analysis, "Evaluates the text display contribution given by each user.");
("-dump_end_date", Arg.String self#set_dump_end_date, "Date at which the dump ends (date is in Wiki format, e.g., 2006-11-22T14:25:19Z )");
("-color_trust", Arg.Unit self#set_trust_color, "Outputs text colored by trust.");
("-color_local_trust", Arg.Unit self#set_trust_local_color, "Colors according to the local trust.");
("-trust_and_origin", Arg.Unit self#set_trust_and_origin, "Colors the text according to trust and adds text origin information.");
Expand Down Expand Up @@ -295,6 +304,8 @@ class page_factory
| Reputation_analysis -> new Reputation_analysis.page id title out_file
eval_zip_error be_precise
n_text_judging n_edit_judging !equate_anons !do_text
| Contribution_analysis -> new Contribution_analysis.page id title
out_file dump_end_date
(* Trust_color does not also do the origin *)
| Trust_color -> new Trust_analysis.page id title xml_file rep_histories
trust_coeff_lends_rep trust_coeff_read_all
Expand Down Expand Up @@ -361,7 +372,7 @@ class page_factory
begin
match mode with
Reputation_analysis -> out_file <- open_out stats_name
| Revcount_analysis | Intertime_analysis
| Revcount_analysis | Intertime_analysis | Contribution_analysis
-> out_file <- open_out default_name
| Trust_color | Trust_syntactregion_color | Trust_and_origin
| AuthorText | WordFequency | Prune_revisions | Revisions_to_text
Expand Down
2 changes: 2 additions & 0 deletions analysis/page_factory.mli
Expand Up @@ -99,6 +99,8 @@ class page_factory :
method set_prune : unit -> unit
method set_revs_to_text : unit -> unit
method set_trust_for_online : unit -> unit
method set_contribution_analysis : unit -> unit
method set_dump_end_date : string -> unit

method print_mode : unit

Expand Down

0 comments on commit aa37029

Please sign in to comment.