In [190]:
import pickle
from itertools import groupby
import random
import mwparserfromhell
import re
from nltk import sent_tokenize

In [251]:
keyfile_tsv_path = '/Users/rpryzant/Desktop/Wiki_NPOV/en_npov_edits_2008.tsv'
revision_pkl_path = '/Users/rpryzant/Desktop/Wiki_NPOV/en_npov_edits_2008.revision_text.pkl'

In [252]:
def load_comments(tsv_path):
    out = {}
    for l in open(tsv_path):
        parts = l.strip().split('\t')
        out[parts[0]] = {
            'rev_comment': parts[1],
            'rev_user': parts[2],
            'rev_user_text': parts[3],
            'rev_timestamp': parts[4],
            'rev_minor_edit': parts[5]
        }
    return out

comments = load_comments(keyfile_tsv_path)
revisions = pickle.load(open(revision_pkl_path, 'rb'))

In [257]:
def prep_tokenized_wikitext(token_list):
    if 0 in token_list:
        # multiple edits
        return
    
    x = ' '.join(token_list)
    # fix tags
    x = x.replace('< ', '<')
    x = x.replace('</ ', '</')
    x = x.replace(' >', '>')
    x = x.replace(' />', '/>')
    
    parse = mwparserfromhell.parse(x)
    plaintext = parse.strip_code()
    
    # fix pre-tokenization errors
    # replace links with their name
    m = re.match('\[{2}.*\|(.*)\]{2}', plaintext)
    if m:
        plaintext = re.sub('\[{2}.*\|(.*)\]{2}', m.group(1), plaintext)

    # Othwise get rid of the links (no name)
    plaintext = plaintext.replace('[[', '')
    plaintext = plaintext.replace(']]', '')
    
    # rm [urls] and urls
    plaintext = re.sub('\[.*?\]', '', plaintext)
    # TODO -- tokenized urls 
    
    return plaintext


def diff(prev_str, next_str):
    prev_set = set(prev_str.split())
    next_set = set(next_str.split())
    
    return prev_set.symmetric_difference(next_set)


def get_sents(prev_edit_str, next_edit_str):
    prev_sents = sent_tokenize(prev_edit_str)
    next_sents = sent_tokenize(next_edit_str)
    if len(prev_sents) != len(next_sents):
        return
    
    for i, (prev_sent, next_sent) in enumerate(zip(prev_sents, next_sents)):
        diff_size = len(diff(prev_sent, next_sent))
        if diff_size > 0:
            prev_ctx = prev_sents[i - 1] if i > 0 else ''
            post_ctx = prev_sents[i + 1] if i < len(prev_sents) - 1 else ''
            yield prev_sent, next_sent, prev_ctx + ' || ' + post_ctx


def examples_from_revision(rev_id):
    print(rev_id)
    metadata = comments[rev_id]

    if rev_id not in revisions:
        yield 'ERROR: id mismatch'

    revision = revisions[rev_id]
    prevs, nexts = revision
    
    prev_text = prep_tokenized_wikitext(prevs)
    next_text = prep_tokenized_wikitext(nexts)

    if prev_text is None or next_text is None:
        yield 'MULTIPLE EDITS'

    if not prev_text or not next_text:
        yield 'LACKING TEXT'
    
    for prev_sent, next_sent, context in get_sents(prev_text, next_text):
        ex = (
            rev_id,
            metadata['rev_comment'],
            prev_sent,
            next_sent,
            context
        )
        yield ex
        
def sample_revision(rev_id):
    try:
        ex = next(examples_from_revision(rev_id))
    except:
        ex = 'NO EXAMPLES'
    print('=' * 80)
    if isinstance(ex, str):
        print(ex)
    else:
        print('EDIT: %s \n COMMENT: %s \n\t%s\n\t%s\n\t%s' % ex)
    
#sample_revision('259892922')

In [248]:
for i in range(100):
    sample_revision(random.choice(list(comments)))

NO EXAMPLES
MULTIPLE EDITS
EDIT: 222535314 
 COMMENT: npov 
	Interestingly ,  Bellone ' s goal was the only one of France ' s fourteen goals to be scored by a striker .
	Bellone ' s goal was the only one of France ' s fourteen goals to be scored by a striker .
	Bruno Bellone  doubled France ' s lead in the final minute to give them a 20 victory . || 
NO EXAMPLES
EDIT: 229258160 
 COMMENT: removing Britishism to more neutral wording 
	5 February   2007  saw the album  in   high   street  stores on  CD  and  DVD .
	5 February   2007  saw the album  released   to  stores on  CD  and  DVD .
	It was released on the  Internet  in December  2006 . || The album consists of both live and studio recordings of old and new songs .
MULTIPLE EDITS
MULTIPLE EDITS
MULTIPLE EDITS
MULTIPLE EDITS
EDIT: 242586722 
 COMMENT: /* The melody */ neutral language 
	This is  deservedly  so much prized that even where Reform Judaism has abolished the recital of the Chaldaic text , the air is often preserved , in 

In [228]:
sample_revision('226054404')

EDIT: 226054404  COMMENT: On second thought: "Northern Epirus" is not a neutral geographical term, but I guess as a marker of the minority it does make sense here
	 The population  is indigenous ethnic  Greek   http :// mondediplo .
	 It  is  home to an  indigenous ethnic  Greek   population , part of the  Northern Epirote  Greek minority   http :// mondediplo .
	 It is located less than 1 km north - west of the  Greek  border . || com / maps / albanianmdv1999 The Albanians , a scattered people by Philippe Rekacewicz , Le Monde diplomatique , January 1999  http :// www .



In [261]:
i = 0
for rev_id in list(comments)[1:]:
    for ex in examples_from_revision(rev_id):
        if not isinstance(ex, str):
            i += 1
print(i)

237443965
237443965
243920179
243920179


TypeError: expected string or bytes-like object