In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

class Agent:
    """A Q-learning walker on a wrap-around square grid.

    Attributes:
        x, y: Current grid coordinates.
        talent: Per-agent value stored for use by the simulation.
        capital: Current wealth; every agent starts with 10.
        green_hits, red_hits: Hit counters, both starting at 0.
        square_size: Side length of the grid; moves wrap modulo this value.
        learning_rate, discount_factor: Q-learning hyper-parameters.
        Q: Mapping ``(x, y) -> {action: value}``; starts empty.
    """

    # The order is kept as in the original lists passed to random.choice so
    # that a seeded run still draws the same actions.
    _ACTIONS = ("up", "down", "right", "left")

    def __init__(self, x, y, talent, square_size, learning_rate, discount_factor):
        self.x = x
        self.y = y
        self.talent = talent
        self.capital = 10
        self.green_hits = 0
        self.red_hits = 0
        self.square_size = square_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.Q = {}

    def choose_action(self, epsilon):
        """Pick an action epsilon-greedily for the agent's current cell.

        Args:
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            One of ``"up"``, ``"down"``, ``"right"``, ``"left"``. When not
            exploring, the action with the highest Q-value for the current
            cell is returned (the first one in dict order on ties). If the
            cell has no Q-values yet, a random action is returned instead.
        """
        if random.uniform(0, 1) < epsilon:
            return random.choice(self._ACTIONS)

        action_values = self.Q.get((self.x, self.y))
        if not action_values:
            # Unknown cell, or a cell registered with no actions: max() on an
            # empty mapping would raise, so fall back to a random move.
            return random.choice(self._ACTIONS)
        return max(action_values, key=action_values.get)

    def get_next_position(self, action):
        """Move the agent one cell in the given direction and return its position.

        Despite the name, this updates ``self.x`` / ``self.y`` in place. The
        grid wraps around at the edges. ``"up"`` increases ``y`` and
        ``"right"`` increases ``x``. An unrecognised action leaves the
        position unchanged.

        Args:
            action: ``"up"``, ``"down"``, ``"left"`` or ``"right"``.

        Returns:
            The ``(x, y)`` tuple after the move.
        """
        if action == "up":
            self.y = (self.y + 1) % self.square_size
        elif action == "down":
            self.y = (self.y - 1) % self.square_size
        elif action == "left":
            self.x = (self.x - 1) % self.square_size
        elif action == "right":
            self.x = (self.x + 1) % self.square_size
        return self.x, self.y

class Simulation:
    def __init__(self, square_size, num_agents, num_moves, num_green_balls, num_red_balls, num_multiple, learning_rate, discount_factor, epsilon):
        self.square_size = square_size
        self.num_agents = num_agents
        self.num_moves = num_moves
        self.num_green_balls = num_green_balls
        self.num_red_balls = num_red_balls
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.agents = []
        self.green_balls = []
        self.red_balls = []
        self.num_multiple = num_multiple

    def initialize_agents(self, position_only = False, position_capital_only = False):
        if position_only:
            for _ in range(self.num_agents):
                self.agents[_].x = np.random.randint(self.square_size)
                self.agents[_].y = np.random.randint(self.square_size)
        elif position_capital_only:
            for _ in range(self.num_agents):
                self.agents[_].x = np.random.randint(self.square_size)
                self.agents[_].y = np.random.randint(self.square_size)
                self.agents[_].capital = 10
        else:
            for _ in range(self.num_agents):
                x = np.random.randint(self.square_size)
                y = np.random.randint(self.square_size)
                talent = np.random.normal(0.6, 0.1)
                agent = Agent(x, y, talent, self.square_size, self.learning_rate, self.discount_factor)
                self.agents.append(agent)

    def initialize_balls(self):
        for _ in range(self.num_green_balls):
            x = np.random.randint(self.square_size)
            y = np.random.randint(self.square_size)
            self.green_balls.append((x, y))
        for _ in range(self.num_red_balls):
            x = np.random.randint(self.square_size)
            y = np.random.randint(self.square_size)
            self.red_balls.append((x, y))
            
            
    def get_q_tables(self):
        a_to_int = {'up':0, 'down':1, 'left':2, 'right':3}
        q_tables = np.zeros((self.square_size, self.square_size, 4))
        for agent in self.agents:
            for position in agent.Q:
                for action in agent.Q[position]:
                    q_tables[position[0], position[1], a_to_int[action]] += agent.Q[position][action]
        return q_tables / self.num_agents

            

    def run_simulation(self):
        self.initialize_agents()
        self.initialize_balls()

        for agent in self.agents:
            for x in range(self.square_size):
                for y in range(self.square_size):
                    #agent.Q[(x, y)] = {"up": 0, "down": 0, "left": 0, "right": 0}  # 行動価値関数の初期化
                    agent.Q[(x, y)] = {"up": np.random.rand()/10000, "down": np.random.rand()/10000, "left": np.random.rand()/10000, "right": np.random.rand()/10000}  # 行動価値関数の初期化

        # Training (40 episodes)
        agentlog_train = []
        for _ in range(self.num_moves * 40):
            #self.initialize_agents(position_only=True)
            for k, agent in enumerate(self.agents):
                agent.learning_rate = np.random.rand() * self.learning_rate 
                #agent.learning_rate = np.random.normal(self.learning_rate / 2, 0.005)
                state = (agent.x, agent.y)
                action = agent.choose_action(self.epsilon)
                next_x, next_y = agent.get_next_position(action)
                next_state = (next_x, next_y)
                green_hit = (next_x, next_y) in self.green_balls  # グリーンボールに当たるか判定
                red_hit = (next_x, next_y) in self.red_balls  # レッドボールに当たるか判定

                # ボールに当たった場合の報酬とQ値の更新
                reward = 0 
                if green_hit:
                    if agent.talent >= np.random.random():
                        reward = agent.capital / 10**4
                        #reward = 1
                        agent.capital *= self.num_multiple
                if red_hit:
                        agent.capital /= self.num_multiple
                        reward = -agent.capital / 10**4
                        #reward = -1

                

                # Q値の更新
                if state in agent.Q:
                    if next_state not in agent.Q:
                        agent.Q[next_state] = {"up": 0, "down": 0, "left": 0, "right": 0}# 新しい状態のQ値を初期化
                    max_q = agent.Q[next_state][max(agent.Q[next_state], key=agent.Q[next_state].get)]
                    agent.Q[state][action] += agent.learning_rate * (reward + agent.discount_factor * max_q - agent.Q[state][action])
                    
                agentlog_train.append([_, k, agent.x, agent.y])
        agentlog_train = pd.DataFrame(columns=['t', 'a', 'x', 'y'], data=np.array(agentlog_train, dtype=int))

        # Reset agents' capital
        for agent in self.agents:
            agent.capital = 10

        # Test (1 episode)
        self.initialize_agents(position_capital_only=True)
        agentlog_test = []
        for _ in range(self.num_moves):
            for k, agent in enumerate(self.agents):
                state = (agent.x, agent.y)
                action = agent.choose_action(self.epsilon)
                next_x, next_y = agent.get_next_position(action)
                next_state = (next_x, next_y)
                green_hit = (next_x, next_y) in self.green_balls  # グリーンボールに当たるか判定
                red_hit = (next_x, next_y) in self.red_balls  # レッドボールに当たるか判定

                # ボールに当たった場合の報酬とQ値の更新
                if green_hit:
                    if agent.talent >= np.random.random():
                        agent.capital *= self.num_multiple
                        agent.green_hits += 1
                if red_hit:
                        agent.capital /= self.num_multiple
                        agent.red_hits += 1

                #reward = agent.capital  # 報酬は資本の変化

                # Q値の更新
                #if state in agent.Q:
                    #if next_state not in agent.Q:
                        #agent.Q[next_state] = {"up": 0, "down": 0, "left": 0, "right": 0}  # 新しい状態のQ値を初期化
                    #max_q = max(agent.Q[state], key=agent.Q[state].get)
                    #agent.Q[state][action] += agent.learning_rate * (reward + agent.discount_factor * agent.Q[next_state][max_q] - agent.Q[state][action])
                agentlog_test.append([_, k, agent.x, agent.y])
        agentlog_test = pd.DataFrame(columns=['t', 'a', 'x', 'y'], data=np.array(agentlog_test, dtype=int))
        return agentlog_train, agentlog_test

    def save_results_to_csv(self, filename):
        results = {'Agent': [], 'Learning_Rate': [],'Capital': [], 'Talent': [], 'Green Hits': [], 'Red Hits': []}
        for i, agent in enumerate(self.agents):
            results['Agent'].append(i + 1)
            results['Learning_Rate'].append(agent.learning_rate)
            results['Capital'].append(agent.capital)
            results['Talent'].append(agent.talent)
            results['Green Hits'].append(agent.green_hits)
            results['Red Hits'].append(agent.red_hits)

        df = pd.DataFrame(results)
        df.to_csv(filename, index=False)

    def visualize_results(self):
        """Show a scatter plot of the agents' current positions and all balls.

        Agents are drawn as blue stars; balls as green or red circles. The
        axes span ``0`` to ``square_size`` in both directions.
        """
        plt.figure(figsize=(8, 8))

        for walker in self.agents:
            plt.scatter(walker.x, walker.y, color='blue', marker='*', alpha=0.6, s=100)

        # Green balls are drawn before red ones, one scatter call per ball.
        for colour, balls in (('green', self.green_balls), ('red', self.red_balls)):
            for ball in balls:
                plt.scatter(ball[0], ball[1], color=colour, marker='o', alpha=0.6, s=50)

        side = self.square_size
        plt.xlim(0, side)
        plt.ylim(0, side)
        plt.title('Agent and Ball Positions')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.show()

# Run the simulation, save the per-agent results, plot the final positions
# and compute the population-averaged Q-table.
simulation = Simulation(
    square_size=25,
    num_agents=1000,
    num_moves=80,
    num_green_balls=50,
    num_red_balls=50,
    num_multiple=1.2,
    learning_rate=0.02,
    discount_factor=0.9,
    epsilon=0.1,
)
agentlog_train, agentlog_test = simulation.run_simulation()
simulation.save_results_to_csv('simulation_results_rl.csv')
simulation.visualize_results()
q_tables = simulation.get_q_tables()

In [None]:
def gain_capital_highest_agent(times):
    """Run ``times`` independent simulations and collect each run's richest agent.

    Every run uses a 25x25 grid, 1000 agents, 80 moves per episode, 50 green
    and 50 red balls, a capital multiplier of 1.2, a learning-rate bound of
    0.005, a discount factor of 0.9 and epsilon 0.1. After each run the
    per-agent results are written to ``simulation_results_rl.csv``
    (overwriting the previous run) and read back.

    Args:
        times: Number of simulations to run. Progress is printed every
            10 runs.

    Returns:
        A DataFrame with columns ``Learning_rate``, ``Capital`` and
        ``Talent`` holding one row per run for the agent with the largest
        final capital. Every row keeps index label 0. With ``times == 0``
        the frame is empty but still has the three columns.
    """
    columns = ['Learning_rate', 'Capital', 'Talent']
    per_run_frames = []
    for i in range(times):
        if i % 10 == 0:
            print(f'time:{i}')
        simulation = Simulation(square_size=25, num_agents=1000, num_moves=80, num_green_balls=50, num_red_balls=50, num_multiple = 1.2, learning_rate=0.005, discount_factor=0.9, epsilon=0.1)
        simulation.run_simulation()
        simulation.save_results_to_csv('simulation_results_rl.csv')
        df = pd.read_csv('simulation_results_rl.csv')

        richest = df.nlargest(1, 'Capital').iloc[0]
        per_run_frames.append(
            pd.DataFrame(
                [[richest['Learning_Rate'], richest['Capital'], richest['Talent']]],
                columns=columns,
            )
        )

    if not per_run_frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once at the end; concatenating inside the loop copies the
    # whole accumulated frame on every iteration.
    return pd.concat(per_run_frames)

In [None]:
# Collect the highest-capital agent from each of 10000 simulation runs.
df_highest = gain_capital_highest_agent(10000)