In [10]:
# A notebook for organising AI progress metrics

from datetime import date

# We have the following structures:
#
# problem 
#     \   \
#      \   metrics  -  measures 
#       \
#        - subproblems
#             \
#           metrics
#              \
#             measures
#
# problems are tagged with attributes:
# eg, vision, abstract-games, language, world-modelling, safety
#     agi       -- most capable humans can do this, so AGIs can do this
#     super     -- the very best humans can do this, or human organisations have solved this
#     verysuper -- neither humans nor human orgs have solved this
#
# problems can have "subproblems", including simpler cases and preconditions

class Problem:
    """A task or capability against which progress is tracked.

    Attributes:
        name: human-readable name of the problem.
        attributes: list of tag strings (e.g. "agi", "vision").
        subproblems: Problems registered as subproblems of this one.
        superproblems: Problems this one has been registered under.
        metrics: Metric objects created through metric().
        solved: whether the problem is considered solved.
        notes: free-form descriptive text; empty until assigned.
    """

    def __init__(self, name, attributes=None, solved=False):
        self.name = name
        # Build a fresh list per instance. A literal [] default is created once
        # and would be shared by every Problem constructed without tags.
        self.attributes = [] if attributes is None else attributes
        self.subproblems = []
        self.superproblems = []
        self.metrics = []
        self.solved = solved
        # Notes are assigned after construction elsewhere in the notebook;
        # give every instance a default so reading .notes never fails.
        self.notes = ""

    def subproblem(self, other_problem):
        """Register other_problem as a subproblem of this problem.

        After the call, other_problem appears in self.subproblems and self
        appears in other_problem.superproblems, matching usage such as
        scene_description.subproblem(image_classification).
        """
        other_problem.superproblems.append(self)
        self.subproblems.append(other_problem)

    def metric(self, *args, **kwargs):
        """Create a Metric from the given arguments, attach it to this problem and return it."""
        m = Metric(*args, **kwargs)
        self.metrics.append(m)
        return m


# Different metrics and measurements for progress are made on very different types of scales
# we have some helper functions to regularise these a little bit, so we can tell (for instance)
# whether progress on some metric appears to be accelerating or decelerating.

# Interface:
#    improvement(score1, score2): returns a consistent measure of how much better score2 is than score1
#    pseudolinear(score): returns a modified version of score where we would expect vaguely linear progress

class Linear:
    """Scale for metrics whose raw scores are used as they are."""

    def improvement(self, score1, score2):
        """Return how far score2 is ahead of score1, as a plain difference."""
        gain = score2 - score1
        return gain

    def pseudolinear(self, score):
        """Return the score untouched; no transformation is applied on this scale."""
        return score


linear = Linear()

class ELO:
    """Scale for ELO-rated metrics; for now it behaves like a linear scale."""

    def improvement(self, score1, score2):
        """
        Compare two ELO scores.

        Gaining 400 ELO points improves your odds by 10x, which would argue for
        return 10.0 ** ((score2 - score1)/400.)
        Yet, at least for chess, ELO progress seems to have been roughly linear
        over time for humans and computers alike (with different coefficients).
        Perhaps that reflects exponential growth in the ability to search the
        game's state space: driven directly by Moore's law for computers, and
        indirectly for humans through better training tools and richer
        libraries of past play.

        So the difference is treated as linear for now. ELO is not specific to
        chess, though, and other contexts may call for the exponentiation
        described above.
        """
        delta = score2 - score1
        return delta

    def pseudolinear(self, score):
        """Return the score as given; ELO is already treated as linear."""
        return score


elo = ELO()

class ErrorRate:
    """Many labelling contests use these measures"""

    def improvement(self, score1, score2):
        """Return the factor by which the error rate shrank from score1 to score2.

        For example going from 0.5 to 0.25 gives 2.0.

        Raises:
            ZeroDivisionError: if score2 is zero.
        """
        # Coerce to float so that integer inputs (e.g. whole-number percentages)
        # are not truncated by integer division on Python 2.
        return float(score1) / score2

    def pseudolinear(self, score):
        """Return the base-2 logarithm of the error rate.

        Raises:
            ValueError: if score is not strictly positive.
        """
        # The choice of base here is arbitrary. But since this is computer science, let's use base2!
        from math import log
        return log(score) / log(2.0)


error_rate = ErrorRate()
    
class Metric:
    """One way of scoring progress on a problem, together with its recorded measurements."""

    def __init__(self, name, url=None, solved=False, notes="", scale=linear):
        # Descriptive fields
        self.name = name
        self.url = url
        self.notes = notes
        # Status and the scale helper used to compare scores
        self.solved = solved
        self.scale = scale
        # Measurements recorded so far, in insertion order
        self.measures = []

    def measure(self, *args, **kwargs):
        """Build a Measurement from the arguments, append it to ``measures`` and return it."""
        new_measurement = Measurement(*args, **kwargs)
        self.measures.append(new_measurement)
        # TODO: logic for detecting solutions belongs here
        return new_measurement


class Measurement:
    """A single recorded result for a metric.

    Args:
        date: when the result was obtained.
        value: the measured score.
        name: label for the entry (e.g. the system or team).
        url: source of the result.
        uncertainty: symmetric margin used to derive bounds that are not given.
        minval: explicit lower bound; defaults to value - uncertainty.
        maxval: explicit upper bound; defaults to value + uncertainty.
    """

    def __init__(self, date, value, name, url, uncertainty=0, minval=None, maxval=None):
        self.date = date
        self.value = value
        self.name = name
        self.url = url
        self.uncertainty = uncertainty
        # Compare against None rather than truthiness: an explicit bound of 0
        # (e.g. a zero error rate) is a legitimate value and must be kept.
        self.minval = value - uncertainty if minval is None else minval
        self.maxval = value + uncertainty if maxval is None else maxval


In [11]:
# BEGIN ACTUALLY CLASSIFYING PROBLEMS

# Two vision-related problems; the second argument is the list of attribute
# tags stored on each Problem.
scene_description = Problem("Scene description", ["agi", "vision", "language", "world-modelling"])
image_classification = Problem("Image classification", ["vision", "agi"])
# Link the two problems in the problem hierarchy.
# NOTE(review): the intent appears to be that image classification is the
# simpler case of scene description -- confirm against Problem.subproblem.
scene_description.subproblem(image_classification)

# Imagenet results are tracked on the error_rate scale instead of the default
# linear one. The positional arguments are the metric's name and url.
imagenet = image_classification.metric("imagenet", "http://image-net.org", scale=error_rate)
# Free-form description of the metric, including a caveat about comparability.
imagenet.notes = """
Correctly label images from the Imagenet dataset. As of 2016, this includes:
 - Object localization for 1000 categories.
 - Object detection for 200 fully labeled categories.
 - Object detection from video for 30 fully labeled categories.
 - Scene classification for 365 scene categories (Joint with MIT Places team) on Places2 Database http://places2.csail.mit.edu.
 - Scene parsing for 150 stuff and discrete object categories (Joint with MIT Places team).
WARNING: these subchallenges were added in successive years of the Imagenet challenge, so results from years are not directly
comparable; however progress should probably be understated by comparing them?
"""

# Data points gathered by Jack Clark:
# Positional arguments are (date, value, name, url).
imagenet.measure(date(2010,8,31), 0.28191, "NEC UIUC", "http://image-net.org/challenges/LSVRC/2010/results")
# The bare string below is an expression statement with no effect. It holds
# the yearly results as free text; only the 2010 entry is registered through
# imagenet.measure() above.
# TODO: turn the remaining years into imagenet.measure(...) calls once their
# dates are known.
"""
** 2010: 0.28191**
**NEC UIUC**
http://image-net.org/challenges/LSVRC/2010/results

** 2011: 0.25770
 XRCE**

** 2012: 0.16422**
** Supervision**
http://image-net.org/challenges/LSVRC/2012/results.html

** 2013: 0.11743 **
**Clarifai**
http://www.image-net.org/challenges/LSVRC/2013/results.php

** 2014: 0.07405**
**VGG**
http://image-net.org/challenges/LSVRC/2014/index
 

**2015: 0.03567**
**MSRA**
http://image-net.org/challenges/LSVRC/2015/results

** 2016: 0.02991**
**Trimps-Soushen**
http://image-net.org/challenges/LSVRC/2016/results
* * *
"""
# Presumably a trailing None so the notebook cell does not echo the string
# above as its output -- TODO confirm.
None

In [15]:
# Abstract games like chess, go, checkers etc can be played with no knowledge of the human world
# Although this domain has largely been solved to super-human performance levels, there are a
# few ends that need to be completed in terms of having agents learn rules for arbitrary
# abstract games effectively

abstract_strategy_games = Problem("Abstract strategy games", ["agi", "abstract-games"])

playing_with_hints = Problem("Playing abstract games with extensive hints", ["abstract-games"], solved=True)
playing_with_hints.notes = """
  Complex abstract strategy games have been solved to super-human levels
  by computer systems with extensive rule-hinting and heuristics,
  in some cases combined with machine learning techniques.
"""
# Chess engine strength is recorded on the elo scale.
computer_chess = playing_with_hints.metric("computer chess", scale=elo)
# For some caveats, see https://en.wikipedia.org/w/index.php?title=Chess_engine&oldid=764341963#Ratings
# Date components are written without leading zeros: literals such as 02 are a
# SyntaxError on Python 3 and are read as octal on Python 2.
computer_chess.measure(date(2017, 2, 27), 3393, "Stockfish", uncertainty=50,
                           url="https://web.archive.org/web/20170227044521/http://www.computerchess.org.uk/ccrl/4040/")
computer_chess.measure(date(1997, 5, 11), 2725, "Deep Blue", uncertainty=25,
                           url="https://www.quora.com/What-was-Deep-Blues-Elo-rating")

mastering_historical_games = Problem("Mastering human abstract strategy games", ["super", "abstract-games"])
mastering_chess = mastering_historical_games.metric("mastering chess")
mastering_chess.notes = """
  Beating all humans at chess, given a corpus of past play amongst masters,
  but no human-crafted policy constraints and heuristics. This will probably fall out
  immediately once learning_abstract_game_rules is solved, since playing_with_hints
  has been solved.
"""

# Are there any published metrics for these yet?
learning_abstract_game_rules = Problem("Learning the rules of complex strategy games from examples", ["agi", "abstract-games"])
learning_chess = learning_abstract_game_rules.metric("learning chess")
learning_chess.notes = """
  Chess software contains hard-coded policy constraints for valid play; this metric is whether RL
  or other agents can correctly build those policy constraints from examples or oracles"""
learning_go = learning_abstract_game_rules.metric("learning go")
learning_go.notes = """
  Go software contains policy constraints for valid play and evaluating the number of
  liberties for groups. This metric is whether RL or other agents can correctly build those 
  policy constraints from examples or oracles"""
learning_arbitrary_abstract_games = Problem("Play an arbitrary abstract game, first learning the rules", ["agi", "abstract-games"])
                           
