# Cybernetic Game Theory


In [1]:
# THIS IS ACTIVELY W.I.P. August-November 2020.  Began transforming sketch into functions.  
# Instead of deleting sketch work, I am just commenting out to preserve thought process.
# Next step is to separate out functions and create new notebook/script to continue development.

# This notebook aims to illustrate a toy model of a cybernetic regulator along the lines of W.R. Ashby's work.
# It is instructive to see the game-theoretic foundations of other popular regulators, like Artificial Neural Networks.
# The regulator can "learn" a probability distribution of disturbances, using reinforcement learning.
# The result is effective control, channeling the flow of information from the environment into desired outcomes (states).

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Create a game matrix for two players: Environment and Regulator
# Choose a goal for Regulator
# Environment goes first (row_i)
# Regulator goes second (column_j)
# Outcome is matrix element m_ij

In [4]:
#game_matrix = np.random.randint(10, size=(7,5))
#game_matrix

In [5]:
# Rows are plays (a.k.a. "disturbances") for the environment.  
# Create vector to use for pandas index and later to link up with probabilities.
def create_game(size):
    '''
    Build a random game table for the Environment (rows) and Regulator (columns).

    size: shape passed to np.random.randint.  It must be two-dimensional,
    (number of environment plays, number of regulator actions), because the
    column labels are derived from the second axis.

    Every outcome is a random integer from 0 to 9.  Rows and columns are both
    labelled 1..n.  The row labels are printed as a side effect.

    Returns the game as a pandas DataFrame.
    '''
    outcomes = np.random.randint(10, size=size)

    # Row labels double as the environment's plays and are echoed for inspection.
    row_labels = list(range(1, len(outcomes) + 1))
    print(row_labels)

    col_labels = list(range(1, outcomes.shape[1] + 1))
    return pd.DataFrame(outcomes, index=row_labels, columns=col_labels)


In [6]:
#game = create_game((7,12))
#game

# Environment chooses play (row)

In [7]:
# Create distribution for environmental "plays" or "disturbances".
#dist = np.random.dirichlet(alpha=rows)

In [8]:
# Check that we have probabilities summing to 1.
#print(dist)
#sum(dist)

In [9]:
# Choose a play.  Distribution of environment must remain constant!
# There must be something to learn, and if dist is changing, the world is random.
def environment_play(game,dist):
    '''
    Draw one play (row label) for the environment.

    game: DataFrame whose index holds the available plays.
    dist: probabilities for the plays, one per index entry, passed to
    np.random.choice as p.

    Returns the drawn index label as a plain Python scalar.
    '''
    drawn = np.random.choice(game.index, size=1, p=dist)
    return drawn.item()

In [10]:
#dist = np.random.dirichlet(alpha=game.index)
#env_play = environment_play(game,dist)
#env_play

# Regulator chooses action (column)

In [11]:
# We can use a Polya urn instead to define probabilities of actions for the regulator.
#urn = np.random.randint(10, size=len(game.columns))
#urn

In [12]:
# Probabilities of drawing from urn
#probs = np.array([(i/sum(urn)) for i in urn])
#probs

In [13]:
#sum(probs)

In [14]:
# Choose random draws from plays in the urn with probabilities according to the composition of the urn.
# We actually just care to draw from the plays, and not from the urn itself, although the urn is what will be updated/reinforced.
def regulator_action(game,probs):
    '''
    Draw one action (column label) for the regulator.

    game: DataFrame whose columns hold the available actions.
    probs: probabilities for the actions, one per column, passed to
    np.random.choice as p.

    Returns the drawn column label as a plain Python scalar.
    '''
    drawn = np.random.choice(game.columns, size=1, p=probs)
    return drawn.item()


In [15]:
#reg_action = regulator_action(game,probs)
#reg_action

# Update/Reinforce the action of the regulator.

In [16]:
# Use .item to get the value from the game table out of the locations (rows/columns) encoded in arrays.
#out = game.loc[env_play,reg_action]
#out

In [17]:
# Convince yourself that this row/column outcome corresponds to the game.
#game

In [18]:
#  We need to set a goal for the regulator to achieve.
#goal = 8

In [19]:
# Define regulator as dictionary of plays and associated probabilities.
#regulator = dict(zip(game.columns,urn))
#regulator

In [20]:
# Compare outcome with goal, and reinforce (increase probability) action which regulator took in response to environment.
# It makes sense to reinforce individual successes more than we weaken, since we expect to fail a lot in the beginning.
# Try adding len(regulator), although this may be impractical for some examples.
#if out == goal:
#    print("success: reinforced the regulator's action", reg_action, "from", regulator[reg_action], "to", regulator[reg_action] + len(regulator))
#    regulator[reg_action] += len(regulator)
#    print('now we need to recalculate the probabilities according to the reinforced urn')
#else: print('fail')

In [21]:
# Confirm updated urn.
#regulator

In [22]:
# Recalculate probabilities and confirm that correct play increases in probability and all others go down.
def prob_calc(regulator_dict):
    '''
    Convert the regulator's urn weights into action probabilities.

    regulator_dict: dictionary of action labels and their urn weights.

    Returns a numpy array holding each weight divided by the total weight,
    in the dictionary's iteration order.  There is no guard against a total
    of zero.
    '''
    weights = list(regulator_dict.values())
    total = np.array(sum(weights))
    return np.array([weight / total for weight in weights])

In [23]:
#prob_calc(regulator)

In [24]:
# Original probs
#probs

In [25]:
# We also want the regulator to update when it fails.  How might this look?
# Remember that the regulator's action distribution was random (the composition of the urn).  
# Consider that the regulator was perhaps overconfident, how can we make the "urn" less "confident"?
# Let's "squeeze" the distribution in the urn, by making it less confident for any particular action.

# We could make drastic changes to our distribution, but we should keep in mind that we may want to use 
# the same fail-update procedure over and over again in an automated learning process.  
# If our "squeeze" is too drastic (e.g. making the urn parts equal / probabilities uniform) then even if
# our regulator is "learned" but makes a mistake, it will "forget" the learned distribution.

# We can still use the mean of the urn composition in a function that slightly squeezes, 
# incrementing those actions which are below the mean, and decrementing those above.

In [26]:
# FIX: IS SQUEEZING ALWAYS WORKING PROPERLY?  NEEDS EXPERIMENTATION
# Notice how multiple (failing) applications of this function will "converge" around the mean.
# More precisely, it will slightly oscillate around the convergence point.
def squeeze(regulator_dict, urn_list):
    '''
    Nudge every urn weight one step toward the mean after a failed play.

    This update function takes two arguments.

    regulator_dict: a regulator defined as a dictionary of key labels (plays or columns)
    and numeric values (from a distribution or urn).  It is modified in place.

    urn_list: a list of integers interpreted as the composition of a Polya urn.

    The function calculates the mean of the urn composition, and compares each value
    in the regulator_dict with the mean.

    The regulator_dict is updated by incrementing values smaller than the
    mean by 1, and decrementing values greater than the mean by 1.  Values
    exactly equal to the mean are left untouched, so a weight that has
    already reached the mean is not pushed away from it again.

    NOTE(review): the mean comes from urn_list, not from the current values
    of regulator_dict -- if the caller passes a stale urn the target of the
    squeeze does not follow reinforcement.  Confirm this is intended.

    Returns None.
    '''
    mean = np.mean(urn_list)
    for action, weight in regulator_dict.items():
        if weight > mean:
            # Over-represented action: squeeze down.
            regulator_dict[action] = weight - 1
        elif weight < mean:
            # Under-represented action: squeeze up.
            regulator_dict[action] = weight + 1
        # weight == mean: already at the convergence point, leave as is.



In [27]:
# Our new function which includes update protocols for both success and failure.
# Experiment with squeeze on/off and with magnitude of reinforcement for regulator action. 
# With squeeze on, it might need to be len(regulator_dict), whereas the sqrt of that might be sufficient without squeeze.
def update(regulator_dict,action,out,goal,urn_list,skweez=False):
    '''
    Apply the success or failure update to the regulator after one round.

    regulator_dict: dictionary of action labels and urn weights; modified in place.
    action: the action label the regulator just played.
    out: the outcome produced by the round.
    goal: the outcome the regulator is trying to reach.
    urn_list: urn composition forwarded to squeeze on failure.
    skweez: when true, a failed round squeezes the urn; when false, a failed
    round leaves the regulator unchanged.

    On success (out equals goal) the played action's weight is increased by
    the square root of the number of actions and "success!" is printed.

    Returns 1 on success and 0 otherwise.
    '''
    if out == goal:
        # Reinforce the winning action; the boost scales with the size of the urn.
        regulator_dict[action] += len(regulator_dict) ** (1 / 2)
        print("success!")
        return 1

    if skweez:
        print('fail: squeezing.')
        squeeze(regulator_dict, urn_list)
    return 0


In [28]:
#update(regulator,reg_action,out,goal,urn)

In [29]:
#regulator


In [30]:
#prob_calc(regulator)

In [31]:
# original probabilities
#probs

# Put it all together

In [32]:
# Now let's repeatedly train a regulator on multiple disturbances from the environment.
# After each outcome, we will update, hoping to improve the likelihood of successful plays.
# This function incorporates most of what we have done separately above.

def train(game_size,goal,epochs,skweez):
    '''
    Train a freshly initialised regulator against a random game.

    game_size: shape handed to create_game, (environment plays, regulator actions).
    goal: outcome value the regulator is rewarded for producing.
    epochs: number of rounds to play.
    skweez: forwarded to update; when true, failed rounds squeeze the urn.

    The environment's distribution is drawn once from a Dirichlet and stays
    fixed for the whole run.  The regulator's urn starts as random integers
    from 0 to 99, one per action.  The game, the initial regulator and a
    running success rate are printed along the way.

    NOTE(review): `urn` is never modified after initialisation, so the
    urn_list given to update is always the starting composition rather than
    the reinforced one -- confirm this is intended.

    Returns the regulator dictionary (action label -> urn weight).
    '''
    game = create_game(game_size)
    print(game)

    # Initial urn and the action probabilities it implies.
    urn = np.random.randint(100, size=len(game.columns))
    probs = urn / urn.sum()
    regulator = dict(zip(game.columns,urn))
    print("regulator:",regulator)

    # The environment's distribution is fixed so that there is something to learn.
    dist = np.random.dirichlet(alpha=game.index)

    wins = 0
    epoch = 1
    while epoch <= epochs:
        print("Epoch: ",epoch)

        # Environment moves first (row), regulator answers (column).
        play = environment_play(game,dist)
        action = regulator_action(game,probs)

        # The resulting state of the world is the matching table entry.
        out = game.loc[play,action]

        # Reinforce or squeeze, then report the running success rate.
        wins += update(regulator,action,out,goal,urn,skweez=skweez)
        print("successes per epoch:",wins / epoch)

        # Refresh the action probabilities from the updated urn.
        probs = prob_calc(regulator)

        epoch += 1
    return regulator

# Experiment
Notice the accuracy improvement when the regulator has access to more "plays" (columns).

In [36]:
# Train on a 10x200 game for 10 epochs with goal outcome 5 and squeezing disabled,
# keeping the returned regulator dictionary for inspection.
trained_regulator = train(game_size=(10,200),goal=5,epochs=10,skweez=False)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    1    2    3    4    5    6    7    8    9    10   ...  191  192  193  194  \
1     1    5    0    5    5    6    4    5    0    5  ...    9    6    4    2   
2     0    6    8    8    5    5    5    1    4    9  ...    8    9    9    1   
3     9    3    9    2    6    3    9    7    9    1  ...    6    6    2    8   
4     2    7    6    8    1    0    7    7    4    3  ...    0    4    2    8   
5     1    0    3    8    5    6    8    3    9    5  ...    7    1    3    2   
6     8    3    8    4    4    1    0    3    5    8  ...    6    7    4    9   
7     2    5    8    8    7    6    9    6    2    0  ...    7    8    3    1   
8     7    0    6    2    8    8    6    0    8    9  ...    6    0    8    7   
9     4    3    2    6    2    4    3    9    6    0  ...    3    3    6    5   
10    5    1    4    5    2    5    8    2    6    6  ...    0    3    3    0   

    195  196  197  198  199  200  
1     0    1    0    7    7    4  
2     

In [37]:
# Display the regulator dictionary returned by train.
trained_regulator

{1: 80,
 2: 9,
 3: 70,
 4: 86,
 5: 27,
 6: 5,
 7: 83,
 8: 20,
 9: 86,
 10: 43,
 11: 70,
 12: 20,
 13: 24,
 14: 47,
 15: 72,
 16: 72,
 17: 6,
 18: 16,
 19: 97,
 20: 29,
 21: 22,
 22: 17,
 23: 21,
 24: 80,
 25: 23,
 26: 82,
 27: 50,
 28: 70,
 29: 13,
 30: 31,
 31: 4,
 32: 41,
 33: 14,
 34: 37,
 35: 95,
 36: 82,
 37: 0,
 38: 72,
 39: 22,
 40: 87,
 41: 11,
 42: 13,
 43: 91,
 44: 36,
 45: 62,
 46: 14,
 47: 11,
 48: 89,
 49: 49,
 50: 20,
 51: 42,
 52: 69,
 53: 14,
 54: 60,
 55: 71,
 56: 18,
 57: 28,
 58: 77,
 59: 83,
 60: 73,
 61: 49,
 62: 21,
 63: 13,
 64: 83,
 65: 42,
 66: 12,
 67: 34,
 68: 21,
 69: 5,
 70: 56,
 71: 65,
 72: 82,
 73: 72,
 74: 40,
 75: 47,
 76: 36,
 77: 5,
 78: 72,
 79: 88,
 80: 78,
 81: 31,
 82: 21,
 83: 111.14213562373095,
 84: 27,
 85: 4,
 86: 99.14213562373095,
 87: 78,
 88: 83,
 89: 2,
 90: 61,
 91: 73,
 92: 70,
 93: 3,
 94: 36,
 95: 31,
 96: 2,
 97: 15,
 98: 25,
 99: 90,
 100: 79,
 101: 12,
 102: 88,
 103: 36,
 104: 38,
 105: 5,
 106: 62,
 107: 25,
 108: 50,
 109: 12,