## Set up archives for easy access...

In [1]:
# relevant packages
import json
import numpy as np
import editdistance as ed
from scrapy.selector import Selector
import enchant
import re

from lxml import html

## Characterize the vote-detecting functions...

In [9]:
def findVotes(post):
    """Return the list of vote targets detected in the post's HTML content.

    Candidate strings are gathered from spans of class "noboldsig" and
    "bbvote" plus loose text around broken [b]/[/b] tags. A candidate counts
    as a vote when 'vote' or 'veot' appears within its first 7 characters.
    Each vote is reduced to the text following its last 'vote'/'veot', or to
    the literal 'not voting' when that last occurrence is part of an unvote.

    post: mapping whose 'content' entry holds the post's HTML.
    """
    sel = Selector(text=post['content'])

    def spantexts(cls):
        "Candidate strings from spans of the given class at the supported nesting depths"
        span = 'span[@class="' + cls + '"]'
        found = []
        # every text node beneath a matching span, one query per supported parent path
        for parent in ('/html/body/p/', '/html/body/', '/html/body/p/span/', '/html/body/span/'):
            found.extend(sel.xpath(parent + span + '//text()').extract())
        # text of the child spans of each shallow matching span, glued into one string
        for parent in ('/html/body/p/', '/html/body/'):
            for node in sel.xpath(parent + span):
                found.append(''.join(node.xpath('span//text()').extract()))
        return found

    # pull out all relevant tags
    boldtags = spantexts('noboldsig')
    votetags = spantexts('bbvote')

    # broken bold tags show up as literal '[b]' / '[/b]' in loose text; treat the
    # text before a stray '[/b]' and after a stray '[b]' as bold content too
    loosetext = (sel.xpath('/html/body/text()').extract() +
                 sel.xpath('/html/body/p/text()').extract())
    for content in loosetext:
        if '[/b]' in content:
            boldtags.append(content[:content.find('[/b]')].strip())
        if '[b]' in content:
            boldtags.append(content[content.find('[b]') + 3:].strip())

    # votetags are placed after the bold tags in the pool (so they are handled last)
    candidates = [tag.strip() for tag in boldtags + votetags]

    # they need to have 'vote' or 'veot' early in their string
    candidates = [tag for tag in candidates
                  if 'vote' in tag[:7].lower() or 'veot' in tag[:7].lower()]

    votes = []
    for tag in candidates:
        lowered = tag.lower()
        # last occurrence of 'vote' / 'unvote' (and their key misspellings)
        voteloc = max(lowered.rfind('vote'), lowered.rfind('veot'))
        unvoteloc = max(lowered.rfind('unvote'), lowered.rfind('unveot'))

        if unvoteloc > -1 and unvoteloc == voteloc - 2:
            # the last 'vote' is the tail of an 'unvote'
            votes.append('not voting')
        else:
            # target is whatever follows 'vote', minus colons and newlines
            target = tag[voteloc + 4:]
            target = target.replace(': ', ' ').replace(':', ' ').replace('\n', ' ')
            votes.append(target.strip())

    return votes

In [4]:
def includesVote(post):
    """Report whether findVotes detects at least one vote in the post's content."""
    return bool(findVotes(post))

def votesFromPost(post, players):
    """Yield the resolved target of each vote found in the post, in order.

    Every raw vote string from findVotes(post) is run through a cascade of
    matching heuristics, roughly from strict to loose. The first heuristic
    that matches exactly one player wins; a heuristic that matches several
    players is abandoned and the next one is tried. The final fallback always
    yields the player with the smallest edit distance to the vote.

    Parameters:
        post: post record, passed straight to findVotes.
        players: list of player usernames to match votes against.

    Yields:
        'not voting' for an unvote, otherwise one element of `players`.
        Nothing is yielded for a vote starting with 'count' when no player
        name starts with 'count' (treated as a vote count request).

    Relies on notebook globals: testing, testingNearSpellings, regup, regall,
    playerabbrevs and englishdivides.
    """

    votes = findVotes(post)

    # so sometimes the earlier vote in a post matters (like in t=17276, p=252)
    # therefore we'll yield a list of votes in a post and process them all 'in order'
    # with the exception of same-line unvote-then-vote happenings
    for vote in votes:
        if testing:
            print('vote being processed in votesFromPost:', vote)

        if vote == 'not voting':
            yield 'not voting'
            continue

        # make sure player isn't asking for a 'vote count'; need playerlist to make sure
        if (vote[:5].lower() == 'count' and
                len([p for p in players if p.lower()[:5] == 'count']) == 0):
            continue

        # first check if vote is an exact (case-insensitive) match of a player's name
        threshold = 0
        nearspellings = [p for p in players
                         if ed.eval(p.lower(), vote.lower()) <= threshold]
        if testingNearSpellings:
            print(nearspellings)
        if len(nearspellings) == 1:
            yield nearspellings[0]
            continue
        elif len(nearspellings) > 1 and testing:
            print('nearspelling', str(threshold),
                  'method found multiple players; abandoning:', nearspellings)

        # second check if vote is just a 1char misspelling of a player's name
        threshold = 1
        nearspellings = [p for p in players if ed.eval(p.lower(), vote.lower()) <= threshold]
        if testingNearSpellings:
            print(nearspellings)
        if len(nearspellings) == 1:
            yield nearspellings[0]
            continue
        elif len(nearspellings) > 1 and testing:
            print('nearspelling', str(threshold),
                  'method found multiple players; abandoning:', nearspellings)

        # try to infer acronym from capitalizations in player usernames
        # (regup keeps only A-Z of the name; regall keeps only letters of the vote)
        threshold = 0
        capmatches = [p for p in players if
                      ed.eval(regup.sub('', p).lower(), regall.sub('', vote.lower())) <= threshold]
        if len(capmatches) == 1:
            yield capmatches[0]
            continue
        elif len(capmatches) > 1 and testing:
            print('capitalizations', str(threshold),
                  'method found multiple players; abandoning:', capmatches)

        # now check for perfect english word acronym matching
        threshold = 0
        acromatches = [p for p in players if
                       ed.eval(playerabbrevs[p].lower(), regall.sub('', vote).lower()) <= threshold]
        if len(acromatches) == 1:
            yield acromatches[0]
            continue
        elif len(acromatches) > 1 and testing:
            print('shortest acronym', str(threshold),
                  'method found multiple players; abandoning:', acromatches)

        # now check if vote is a substring of a player's name, but only for len(vote) >= 3
        suboccurrences = [p for p in players if p.lower().count(vote.lower()) > 0 and len(vote) >= 3]
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('vote substring method found multiple players; abandoning:', suboccurrences)

        # now check if vote is within edit distance 1 of a player's
        # english-word acronym (playerabbrevs)
        threshold = 1
        acromatches = [p for p in players if
                       ed.eval(playerabbrevs[p].lower(), vote.lower()) <= threshold]
        if len(acromatches) == 1:
            yield acromatches[0]
            continue
        elif len(acromatches) > 1 and testing:
            print('shortest acronym', str(threshold),
                  'method found multiple players; abandoning:', acromatches)

        # now check if vote is a substring of a player's name (no length restriction)
        suboccurrences = [p for p in players if p.lower().count(vote.lower()) > 0]
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('vote substring method found multiple players; abandoning:', suboccurrences)

        # two char misspelling
        threshold = 2
        nearspellings = [p for p in players if
                         ed.eval(p.lower(), vote.lower()) <= threshold]
        if testingNearSpellings:
            print(nearspellings)
        if len(nearspellings) == 1:
            yield nearspellings[0]
            continue
        elif len(nearspellings) > 1 and testing:
            print('nearspelling', str(threshold),
                  'method found multiple players; abandoning:', nearspellings)

        # capitalization method, but check if acronym uses same letters
        # (threshold is not used by this check; the debug line prints the previous value)
        capmatches = [p for p in players if sorted(regup.sub('', p).lower()) == sorted(vote.lower())]
        if len(capmatches) == 1:
            yield capmatches[0]
            continue
        elif len(capmatches) > 1 and testing:
            print('capitalizations', str(threshold),
                  'method found multiple players; abandoning:', capmatches)

        threshold = 2 # second pass w/ shortest english-word acronym
        acromatches = [p for p in players if
                       ed.eval(playerabbrevs[p].lower(), vote.lower()) <= threshold]
        if len(acromatches) == 1:
            yield acromatches[0]
            continue
        elif len(acromatches) > 1 and testing:
            print('shortest acronym', str(threshold),
                  'method found multiple players; abandoning:', acromatches)

        # if a player's name is a substring of the vote
        suboccurrences = [p for p in players if vote.lower().count(p.lower()) > 0]
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('player substring method found multiple players; abandoning:', suboccurrences)

        # if spaced-out parts of a player's name are a substring of the vote
        suboccurrences = [p for p in players if
                          len([s for s in p.split(' ') if vote.lower().count(s.lower()) > 0]) > 0]
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('spaced substring method found multiple players; abandoning:', suboccurrences)

        # if length>3 english-divided parts of a player's name are a substring of the vote
        suboccurrences = [p for p in players if
                          len([s for s in englishdivides(p)[0] if
                               (vote.lower().count(s.lower()) > 0 and len(s) > 3)]) > 0]
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('spaced substring method found multiple players; abandoning:', suboccurrences)

        # if vote is a two letter abbreviation of a playername including partial english
        # (initials of the first two words of the first english division)
        threshold = 0
        acromatches = [p for p in players if
                       ed.eval(''.join([each[0] for each in englishdivides(p)[0][1:3]]).lower(), vote.lower()) <= threshold]
        if len(acromatches) == 1:
            yield acromatches[0]
            continue
        elif len(acromatches) > 1 and testing:
            print('shortest acronym', str(threshold),
                  'method found multiple players; abandoning:', acromatches)

        # check if vote is slightly misspelled substring of a player's name
        # NOTE(review): a player matched by both inner loops is appended twice, which
        # makes len(suboccurrences) > 1 and abandons this stage -- confirm this is intended
        threshold = 1
        suboccurrences = []
        for p in players:
            if len(vote) < len(p):
                # sliding windows of len(vote)+1 characters over the name
                for i in range(len(p)):
                    if ed.eval(vote.lower(), p[i:min(i+len(vote)+1, len(p))].lower()) <= threshold:
                        suboccurrences.append(p)
                        break
                # prefixes of the name up to len(vote) characters
                for i in range(1, len(vote)+1):
                    if ed.eval(vote.lower(), p[:i].lower()) <= threshold:
                        suboccurrences.append(p)
                        break
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('spaced misspelled substring method' + str(threshold) +
                  'found multiple players; abandoning:', suboccurrences)

        # same misspelled-substring check with a looser threshold
        threshold = 2
        suboccurrences = []
        for p in players:
            if len(vote) < len(p):
                for i in range(len(p)):
                    if ed.eval(vote.lower(), p[i:min(i+len(vote)+1, len(p))].lower()) <= threshold:
                        suboccurrences.append(p)
                        break
                for i in range(1, len(vote)+1):
                    if ed.eval(vote.lower(), p[:i].lower()) <= threshold:
                        suboccurrences.append(p)
                        break
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('spaced misspelled substring method' + str(threshold) +
                  'found multiple players; abandoning:', suboccurrences)

        # if vote is a mix of abbreviations and parts of a spaced out playername:
        # candidate i spells out word i in full and abbreviates every other word
        # NOTE(review): no break here, so one player can be appended once per matching i
        suboccurrences = []
        for p in players:
            broke = p.split(' ')
            for i in range(len(broke)):
                cand = ''.join([broke[j][0] if j != i else broke[j] for j in range(len(broke))])
                if ed.eval(cand.lower(), vote.lower()) < 2:
                    suboccurrences.append(p)
        if len(suboccurrences) == 1:
            yield suboccurrences[0]
            continue
        elif len(suboccurrences) > 1 and testing:
            print('mixed spaced substring method found multiple players; abandoning:', suboccurrences)

        # if every character in vote is a character in a single playername, we can pair them
        matches = [p for p in players if set(vote.lower()) <= set(p.lower())]
        if len(matches) == 1:
            yield matches[0]
            continue
        elif len(matches) > 1 and testing:
            # report this stage's own candidates (previously printed the stale
            # suboccurrences list under the wrong method label)
            print('character subset method found multiple players; abandoning:', matches)

        # last resort, just return playername with lowest distance from the vote
        distances = [ed.eval(vote.lower(), p.lower()) for p in players]
        if testing:
            print('using last resort for', vote)
        yield players[distances.index(min(distances))]

In [5]:
def englishdivides(playername):
    """Return every way to split the letters of playername into dictionary words.

    Non-letter characters are stripped (via regall) before splitting. A piece
    counts as a word when any of the dEn / dCA / dGB / dUS dictionaries
    accepts it.

    Returns a list of divisions. Each division is a list of strings whose
    first element is always the empty-string seed '' followed by the words, so
    callers take division[1:] for the words themselves. Divisions are found
    breadth-first, so those with fewer words come first. The list is empty
    when no complete division exists (including a name with no letters).
    """
    string = regall.sub('', playername)
    dictionaries = (dEn, dCA, dGB, dUS)

    frontier = [['']]   # partial divisions built so far, all with the same word count
    fulldivides = []
    while len(frontier) > 0:
        nextfrontier = []
        for partial in frontier:
            start = len(''.join(partial))   # letters already consumed by this partial
            for end in range(start + 1, len(string) + 1):
                substring = string[start:end]
                if any(d.check(substring) for d in dictionaries):
                    extended = partial + [substring]
                    nextfrontier.append(extended)

                    # the new word reaches the end of the name: complete division
                    if end == len(string):
                        fulldivides.append(extended)
        frontier = nextfrontier
    return fulldivides

In [6]:
def processposts(posts):
    """Replay a thread's posts, maintaining the vote count until a majority is reached.

    Operates on notebook globals: reads players, slots, correct, game,
    gamearchive, breakat, testing, stopAtFail, acceptFailure and
    deserveSuccess; mutates votesByVoter / votesByVoted; updates success.

    Returns None as soon as one slot holds more than half of the votes. If
    the posts run out first, falls through to the no-lynch handling below.

    Raises ValueError when stopAtFail is set and the outcome is wrong (or no
    majority was ever found) for a game whose index is not in acceptFailure.
    """
    global success
    global total

    for post in posts:

        # stop at specified post
        if len(breakat) > 0 and int(post['number']) == int(breakat):
            break

        # ignore posts not made by players
        author = post['user']
        if author not in players:
            continue

        # ignore posts that don't include a vote
        if testing:
            print('Detected player post at:', post['number'])
        if not includesVote(post):
            continue

        # resolve every vote in the post before applying any of them
        for voted in list(votesFromPost(post, players)):
            # slot of the voter, and slot of the target (len(slots) means "not voting")
            voterslot = [idx for idx, slot in enumerate(slots) if author in slot][0]
            if voted == "not voting":
                votedslot = len(slots)
            else:
                votedslot = [idx for idx, slot in enumerate(slots) if voted in slot][0]

            # move the voter from their previous target to the new one
            oldvotedslot = votesByVoter[voterslot]
            votesByVoter[voterslot] = votedslot
            votesByVoted[oldvotedslot].remove(voterslot)
            votesByVoted[votedslot].append(voterslot)

            # an unvote can never complete a majority
            if votedslot >= len(slots):
                if testing:
                    print('Votecounter believes', author, 'unvoted in post', post['number'])
                continue
            if testing:
                print('Votecounter believes', author, 'voted', slots[votedslot], 'in post', post['number'])

            # if target has a majority of votes, mark him as our choice and finish this thread
            if len(votesByVoted[votedslot]) > len(slots)/2.0:
                choice = slots[votedslot]
                gameindex = gamearchive.index(game)

                # games in deserveSuccess count as successes regardless of the
                # comparison (65 is a game w/ a doublevoter, so things are weird)
                if gameindex in deserveSuccess:
                    success += 1
                else:
                    success += choice == correct

                if testing:
                    print('Votecounter chose', choice, '!')
                    print('Correct response', correct, '.')
                    print('So far', success, 'successes over', total, 'trials.\n')

                if stopAtFail and (correct != choice) and gameindex not in acceptFailure:
                    raise ValueError('got one wrong, index:' + str(gameindex))
                return

    # posts exhausted (or stopped at breakat) without any majority
    gameindex = gamearchive.index(game)
    if gameindex in deserveSuccess:
        success += 1
    if stopAtFail and gameindex not in acceptFailure:
        raise ValueError('never found lynch; game index: ' + str(gameindex))

## Testing Framework
Scans through game archive and tests how well the vote-detecting functions can identify D1 lynches.

In [7]:
# spell-check dictionaries and letter filters used by englishdivides / votesFromPost
dEn = enchant.Dict("en")
dCA = enchant.Dict("en_CA")
dGB = enchant.Dict("en_GB")
dUS = enchant.Dict("en_US")
regall = re.compile('[^a-zA-Z]')  # matches everything that is not a letter
regup = re.compile('[^A-Z]')      # matches everything that is not an uppercase letter

# open game archive, separate by game (games are separated by two blank lines)
with open('archive.txt') as f:
    gamearchive = f.read().split('\n\n\n')

# open thread archive (one JSON object per line); blank lines -- such as the
# empty remainder after a trailing newline -- are skipped instead of being fed
# to json.loads, which would raise on them
with open('posts.jl') as f:
    posts = [json.loads(line) for line in f if line.strip()]

# sort posts by thread, then by post number, and build a parallel thread-id
# array so a thread's posts can be selected with posts[threads == number]
posts = sorted(posts, key = lambda element: (int(element['thread']), int(element['number'])))
threads = np.array([p['thread'] for p in posts])
posts = np.array(posts)

In [10]:
# startup happens here: counters and debugging switches read by the functions above
success = 0
total = 0
testing = 0
testingNearSpellings = 0
breakat = ''
stopAtFail = 1

# Known problem games, by index into gamearchive:
# 22 (1156) has a vote i cant handle by Greymarble; it's inside a quote and i don't think a mod should accept it
# 30 (1177) a broken tag vote by Slaxx happens that my votecounter detects but mod refuses; i must accept this error
# 31 (1180) mod ignored a game-critical vote; my vote counter made no mistake
# 47 has a doublevoter too (how did this go?); also the thread is not complete. **Come back to this one**
# 63 has a doublevoter so things are weird but my votecounter made no mistake **Come back to this one to confirm**
# 124 the moderator mistakenly lynched the wrong person and couldn't undo it; votecounter made no mistake
# 147 has a vote that could apply to a currently playing person or someone who will soon replace in. If I could track when replacements happen, I'd catch, but since I can't the VCer can't be said to have messed up.
# 149 has a vote that requires human language understanding to work
# 154 was decided by a doublevoter; votecounter has final votecount perfectly
# 158 has someone die / votecount reset in middle; final votecount is otherwise accurate though
# 162 same as 154
# 175 had a N0 death; however votecounter found final votecount perfectly
# 177 (1651) had both a D1 modkill and ended in a no lynch. Nonetheless, votecounter captured final VC perfectly.
# 217 had an unclear vote (votecounter wouldn't have parsed it) that the moderator rejected; my votecounter accepted the unclear vote, improperly ignoring one that would've contributed to a lynch
# 222 had a N0 death; however votecounter found final votecount of Day perfectly
# 226 had a no lynch but votecounter *did* capture final votes of D1 perfectly
# 232 failed to reach a lynch but the mod killed the guy with the highest votes anyway; final votecount was perfect
# 239 was another double voter
# 254 no lynch, but perfect final vote
# 276 has a mod w/ strict voting standards, so i count a lot of votes that he wouldn't; impossible to fully verify performance
# 284 double voter

acceptFailure = [22, 30, 31, 47, 63, 124, 146, 147, 149, 154, 158, 162, 175, 177, 185, 217, 222, 226, 232, 239, 254, 276, 284]
deserveSuccess = [31, 47, 63, 124, 147, 154, 158, 162, 175, 177, 222, 226, 232, 239, 254, 276, 284]

for game in gamearchive[:1]:
    total += 1
    print(gamearchive.index(game))

    # a game record starts with the thread link on its first line; print the
    # first two lines (link and game title)
    firstbreak = game.find('\n')
    secondbreak = game.find('\n', firstbreak + 1)
    print(game[:secondbreak])
    link = game[:firstbreak]

    # use archive to get list of slots and the players in them
    slots = []
    players = []
    correct = None
    roster = game[game.find('\nPlayers\n')+9:]
    for s in roster.split('\n'):
        # first comma-separated field lists everyone who occupied the slot
        occupants = s.split(', ')[0].split(' replaced ')
        slots.append(occupants)
        players += occupants

        # third field describes the slot's fate; remember the Day 1 lynch
        fate = s.lower().split(', ')[2]
        if 'lynched' in fate and 'day 1' in fate:
            correct = occupants

    # make an acronym dictionary for each player: initials of the words in
    # the first english division of the name
    playerabbrevs = {}
    for p in players:
        if testing:
            print(p)
        divides = englishdivides(p)
        if testing:
            print(divides)
        playerabbrevs[p] = ''.join(word[0] for word in divides[0][1:])

    # use playerlist to initialize votecount; index len(slots) stands for
    # "not voting", which is where every slot starts
    slotcount = len(slots)
    votesByVoter = {slot: slotcount for slot in range(slotcount)}
    votesByVoted = {slot: [] for slot in range(slotcount)}
    votesByVoted[slotcount] = list(np.arange(slotcount))

    # use archive to get associated thread number: the text after '&t=',
    # cut at the last '&' when the link carries more than one '&'
    start = link.find('&t=') + 3
    end = len(link) if link.count('&') == 1 else link.rfind('&')
    number = link[start:end]

    # use thread number to get all the thread's posts
    processposts(posts[threads == number])

0
http://forum.mafiascum.net/viewtopic.php?f=53&t=15787
Game 1091: Mafia Mania


In [11]:
# look at the current vote count; useful for testing after stopping votecounter at markers
# (key len(slots) in votesByVoted is the "not voting" bucket)
for slotindex, voterindices in votesByVoted.items():
    if slotindex == len(slots):
        voted = 'not voting'
    else:
        voted = slots[slotindex]
    voters = [slots[voterindex] for voterindex in voterindices]
    print(voted, '-', len(voters), 'votes:')
    for each in voters:
        print(each)
    print('')

# voter | assigned | actual
# Mathblade | not voting | 

['Beefster'] - 7 votes:
['brokenscraps', 'LordChronos']
['Ant_to_the_max']
['Mariyta']
['werewolf555', 'Hiraki']
['AntB']
['Substrike22']
['boberz']

['werewolf555', 'Hiraki'] - 4 votes:
['Dekes']
['Empking', 'Xtoxm', 'moose200x']
['Antihero', 'Lateralus22', 'caelum']
['Beefster']

['Substrike22'] - 0 votes:

['Antihero', 'Lateralus22', 'caelum'] - 0 votes:

['boberz'] - 0 votes:

['Mariyta'] - 0 votes:

['brokenscraps', 'LordChronos'] - 0 votes:

['AntB'] - 0 votes:

['pappums rat', 'Mr Wright'] - 0 votes:

['Ant_to_the_max'] - 0 votes:

['Dekes'] - 0 votes:

['Empking', 'Xtoxm', 'moose200x'] - 1 votes:
['pappums rat', 'Mr Wright']

not voting - 0 votes:



In [325]:
# useful tool for debugging specific posts: turns on debug printing, then runs
# the vote detection on one hand-picked post (uses whatever `players` is
# currently loaded)
testing = 1
testingAcro = 1
threadposts = posts[threads == '65380']
post = threadposts[489]
print('Post object:', post, '\n')
print('includesVote output:', includesVote(post))
print('Votes from post:', list(votesFromPost(post, players)))

Post object: {'forum': '53', 'thread': '65380', 'pagelink': 'https://forum.mafiascum.net/viewtopic.php?f=53&t=65380&sid=eee8a4356bf672bbdee4268321c423a7&start=475', 'number': '489', 'timestamp': 'Fri Mar 04, 2016 11:24 am', 'user': 'Not_Mafia', 'content': '<span class="bbvote" title="This is an official vote.">VOTE: iron</span>'} 

includesVote output: True
vote being processed in votesFromPost: iron
Votes from post: ['iraonavp']


In [294]:
# for seeing post after processing through Selector
print('selector extract')
content = posts[threads == '62192'][398]['content']
sel = Selector(text=content)
print(sel.extract())
print()

# joined child-span text of each paragraph-level noboldsig span (one of the
# queries findVotes runs); left as the last expression so the cell displays it
[''.join(span.xpath('span//text()').extract())
 for span in sel.xpath('/html/body/p/span[@class="noboldsig"]')]

selector extract
<html><body><p>alright have read all of this, almost fell asleep but yeah can confirm I'm up to date<br><br><span style="text-decoration: underline"><span style="font-size: 150%; line-height: 116%;"><span style="font-family: 'vivaldi';">green beans with extra carotenoids and vitamin town</span></span></span><br><span style="color: #00BF00"><span class="nocolorsig"><span class="noboldsig">lalalalalalalala</span> obviously imma good duck!</span></span><br><span style="color: #00BFBF"><span class="nocolorsig"><span class="noboldsig">KayP</span> hey Bruce Lee</span></span><br><span style="color: green"><span class="nocolorsig"><span class="noboldsig">Bob Lob</span> on a corn cob</span></span><br><span style="color: #BFBF00"><span class="nocolorsig">groggy Mr. <span class="noboldsig">Froggy</span></span></span><br><span style="color: #80BF80"><span class="nocolorsig"><span class="noboldsig">Glork</span> is a funny name!</span></span><br><span style="color: #408080"><span cl

['Bicycle Bob', 'VOTE: BICYCLEPHALOUS BOB ROBBED ME D:']

In [8]:
# code to print success rate: successes, trials, and their ratio
print(success)
print(total)
# on a fresh kernel no game has been processed yet (total == 0); show nan
# instead of raising ZeroDivisionError
print(success/total if total else float('nan'))

294
298
0.9865771812080537
