In [1]:
import math
import random

# States the agent can occupy. The position of each name here is also the
# index used by the probability / reward vectors inside Transition.
States = ["Work", "Game", "Sleep"]

# Actions that may be chosen from each state.
Actions = {
    "Work": ["Continue", "Stop"],
    "Game": ["Continue", "Stop"],
    "Sleep": ["Continue", "Work", "Game"],
}

# Transition[state][action] is a pair (probabilities, rewards).
# Both lists are ordered like States: probabilities[k] is the weight of
# landing in States[k], rewards[k] is the raw reward for landing there
# (the simulation loop multiplies it by 0.01 before using it).
Transition = {
    "Work": {
        "Continue": ([1, 0, 0], [2, 0, 0]),
        "Stop": ([0, 0.4, 0.6], [0, 6, 3]),
    },
    "Game": {
        "Continue": ([0, 1, 0], [0, -4, 0]),
        "Stop": ([0, 0, 1], [0, 0, 6]),
    },
    "Sleep": {
        "Continue": ([0, 0, 1], [0, 0, 2]),
        "Work": ([1, 0, 0], [1, 0, 0]),
        "Game": ([0, 1, 0], [0, 6, 0]),
    },
}

# Initial policy scores: one score per action, in the same order as
# Actions[state]. Every action starts at 15, i.e. a uniform preference.
Policy = {state: [15] * len(Actions[state]) for state in States}

# Intended number of simulation steps.
nb_iterations = 2000
# Discount applied per step of age when crediting earlier decisions.
gamma = 0.8
# State the simulation starts from.
current_state = "Sleep"
# (state, action) pairs in the order they were taken.
history = []

In [2]:
def policy_softmax(p):
    """Turn per-state action scores into per-state probability distributions.

    A softmax is applied independently to each state's list of scores. The
    largest score of the row is subtracted before exponentiating: this leaves
    the resulting probabilities mathematically unchanged but keeps
    ``math.exp`` from raising ``OverflowError`` once scores grow large
    (roughly above 709 for Python floats).

    Args:
        p: Mapping from state to a list of numeric action scores.

    Returns:
        A shallow copy of ``p`` in which every value has been replaced by a
        new list of probabilities (an empty row stays an empty list). The
        lists stored in ``p`` are not modified.
    """
    result = p.copy()
    for state, action_dist in p.items():
        # default=0 keeps empty rows working: nothing is exponentiated and
        # the row maps to [].
        peak = max(action_dist, default=0)
        exp_row = [math.exp(elem - peak) for elem in action_dist]
        denom = sum(exp_row)
        result[state] = [elem / denom for elem in exp_row]

    return result

In [3]:
def update_policy(history, reward, horizon=4, policy=None, actions=None, discount=None):
    """Credit the most recent decisions with a discounted share of ``reward``.

    The newest entry of ``history`` receives ``reward`` in full, the one
    before it ``discount * reward``, then ``discount**2 * reward`` and so on,
    for at most ``horizon`` entries. Each share is added in place to the
    score of the action that was taken in that state.

    Only the last ``horizon`` entries are visited, so the cost of a call does
    not grow with the length of ``history``.

    Args:
        history: Sequence of ``(state, action)`` entries, oldest first.
        reward: Numeric reward to distribute.
        horizon: Maximum number of most-recent entries to update (default 4).
        policy: Mapping state -> list of scores to update in place.
            Defaults to the module-level ``Policy``.
        actions: Mapping state -> list of action names, giving the index of
            each action inside ``policy[state]``. Defaults to the
            module-level ``Actions``.
        discount: Per-step discount factor. Defaults to the module-level
            ``gamma``.

    Raises:
        ValueError: If a recorded action is not listed in ``actions[state]``.
    """
    policy = Policy if policy is None else policy
    actions = Actions if actions is None else actions
    discount = gamma if discount is None else discount

    # zip() stops as soon as range(horizon) is exhausted, so only the tail of
    # the history is walked (and horizon=0 updates nothing).
    for age, step in zip(range(horizon), reversed(history)):
        state, action = step[0], step[1]
        policy[state][actions[state].index(action)] += math.pow(discount, age) * reward
        

In [4]:
# Simulation loop: sample an action from the softmax of the current policy
# scores, sample the resulting state, and feed the reward back into the
# scores of the most recent decisions.
# The step count comes from nb_iterations rather than a repeated literal.
for _step in range(nb_iterations):
    policy_probability = policy_softmax(Policy)
    action = random.choices(Actions[current_state], weights=policy_probability[current_state])[0]
    # paths is (transition probabilities, rewards), both ordered like States.
    paths = Transition[current_state][action]

    next_state = random.choices(States, weights=paths[0])[0]
    # Raw rewards are scaled down by 0.01 before being added to the scores.
    reward = paths[1][States.index(next_state)] * 0.01

    # Record the decision first so update_policy credits it as well.
    history.append((current_state, action))
    update_policy(history, reward)

    print(current_state, "->", next_state, "=", Policy)
    current_state = next_state
    

Sleep -> Work = {'Work': [15, 15], 'Game': [15, 15], 'Sleep': [15, 15.01, 15]}
Work -> Work = {'Work': [15.02, 15], 'Game': [15, 15], 'Sleep': [15, 15.026, 15]}
Work -> Game = {'Work': [15.068, 15.06], 'Game': [15, 15], 'Sleep': [15, 15.0644, 15]}
Game -> Sleep = {'Work': [15.106399999999999, 15.108], 'Game': [15, 15.06], 'Sleep': [15, 15.09512, 15]}
Sleep -> Game = {'Work': [15.13712, 15.1464], 'Game': [15, 15.108], 'Sleep': [15, 15.09512, 15.06]}
Game -> Game = {'Work': [15.13712, 15.12592], 'Game': [14.96, 15.0824], 'Sleep': [15, 15.09512, 15.028]}
Game -> Game = {'Work': [15.13712, 15.12592], 'Game': [14.888000000000002, 15.06192], 'Sleep': [15, 15.09512, 15.0024]}
Game -> Sleep = {'Work': [15.13712, 15.12592], 'Game': [14.974400000000001, 15.121920000000001], 'Sleep': [15, 15.09512, 15.03312]}
Sleep -> Game = {'Work': [15.13712, 15.12592], 'Game': [15.043520000000001, 15.169920000000001], 'Sleep': [15, 15.09512, 15.09312]}
Game -> Sleep = {'Work': [15.13712, 15.12592], 'Game': [15

Work -> Work = {'Work': [39.538079999999376, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [39.59711999999938, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [39.65615999999938, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [39.715199999999385, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [39.77423999999939, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [39.83327999999939, 16.189679999999985], 'Ga

Work -> Work = {'Work': [60.32016000000046, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [60.379200000000466, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [60.43824000000047, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [60.49728000000047, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [60.556320000000476, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [60.61536000000048, 16.189679999999985], 'Ga

Work -> Work = {'Work': [80.21663999999956, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [80.27567999999955, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [80.33471999999955, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [80.39375999999955, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [80.45279999999954, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [80.51183999999954, 16.189679999999985], 'Game

Work -> Work = {'Work': [99.99503999999821, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [100.05407999999821, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [100.1131199999982, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [100.1721599999982, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [100.2311999999982, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [100.29023999999819, 16.189679999999985], 'Ga

Work -> Work = {'Work': [112.98383999999733, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [113.04287999999733, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [113.10191999999732, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [113.16095999999732, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [113.21999999999731, 16.189679999999985], 'Game': [15.107520000000008, 18.666239999999917], 'Sleep': [15.628799999999991, 15.849759999999993, 17.924159999999944]}
Work -> Work = {'Work': [113.27903999999731, 16.189679999999985],

In [7]:
# Print every recorded (state, action) pair, one per line, oldest first.
for visited_state, chosen_action in history:
    print(visited_state, chosen_action)

Sleep Work
Work Continue
Work Stop
Game Stop
Sleep Game
Game Continue
Game Continue
Game Stop
Sleep Game
Game Stop
Sleep Continue
Sleep Work
Work Continue
Work Continue
Work Stop
Sleep Continue
Sleep Game
Game Continue
Game Continue
Game Continue
Game Continue
Game Stop
Sleep Work
Work Stop
Sleep Work
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Stop
Game Stop
Sleep Game
Game Stop
Sleep Work
Work Continue
Work Continue
Work Continue
Work Stop
Sleep Game
Game Continue
Game Stop
Sleep Work
Work Continue
Work Continue
Work Continue
Work Stop
Game Continue
Game Continue
Game Stop
Sleep Game
Game Continue
Game Continue
Game Continue
Game Stop
Sleep Game
Game Stop
Sleep Continue
Sleep Continue
Sleep Continue
Sleep Work
Work Stop
Sleep Game
Game Stop
Sleep Continue
Sleep Work
Work Continue
Work Continue
Work Stop
Sleep Continue
Sleep Work
Work Continue
Work Continue


Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work Continue
Work C

In [6]:
# Show the action probabilities implied by the current Policy scores.
final_probabilities = policy_softmax(Policy)
print(final_probabilities)

{'Work': [1.0, 5.724925617112917e-49], 'Game': [0.02768685947930131, 0.9723131405206987], 'Sleep': [0.08213361453156497, 0.10244308403682129, 0.8154233014316137]}
