In [3]:
import pandas as pd 
import numpy as np 
import random

# Raw string literal: the Windows backslashes are taken literally instead of
# being parsed as (invalid) escape sequences such as "\M" or "\D".
data = pd.read_csv(r'D:\Master_Folder\Data Science Course\Projects\StockMarket\stock_data/SUZLON.NS_2023-01-01_to_2024-11-21_ML_QL.csv')

class MarketEnvironment:
    """Row-by-row environment over a table of closing prices and features.

    ``data`` must provide a ``'Close'`` column plus every column listed in
    ``STATE_COLUMNS``. The constructor stores ``data.reset_index()``, so rows
    are addressed positionally from 0.

    Attributes are plain instance fields:
    ``data`` (the re-indexed table), ``current_step`` (position of the row
    the agent currently observes) and ``done`` (episode-finished flag).
    """

    # Columns that together make up the observation returned to the agent.
    STATE_COLUMNS = ['Temporal_Features', 'Price_Features', 'Upward_Downward_Probability', 'Cluster', 'Anomaly']

    # Actions accepted by ``step``.
    VALID_ACTIONS = (0, 1, 2)

    def __init__(self, data):
        self.data = data.reset_index()
        self.current_step = 0
        self.done = False

    def _state_at(self, step):
        """Return the feature values of row ``step`` as a NumPy array."""
        return self.data.iloc[step][self.STATE_COLUMNS].values

    def reset(self):
        """Rewind to the first row and return its feature values."""
        self.current_step = 0
        self.done = False

        return self._state_at(self.current_step)

    def step(self, action):
        """Apply ``action`` to the current row and advance one row.

        Rewards (presumably long / short / hold -- confirm with the author):
            * ``0``: next close minus current close.
            * ``1``: current close minus next close.
            * ``2``: a fixed ``-0.01``.

        Returns:
            ``(next_state, reward, done)``. ``next_state`` is ``None`` once
            the episode is finished, which happens when ``current_step``
            reaches the last row index.

        Raises:
            ValueError: if ``action`` is not 0, 1 or 2. The check runs before
                any state is modified, so the environment stays usable.
        """
        if action not in self.VALID_ACTIONS:
            raise ValueError(
                f"Invalid action {action!r}; expected one of {self.VALID_ACTIONS}"
            )

        closes = self.data['Close']
        current_close = closes.iloc[self.current_step]
        # With no following row there is no price move to reward.
        has_next_row = self.current_step + 1 < len(self.data)
        next_close = closes.iloc[self.current_step + 1] if has_next_row else current_close

        if action == 0:
            reward = next_close - current_close
        elif action == 1:
            reward = current_close - next_close
        else:
            reward = -0.01

        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            self.done = True

        next_state = None if self.done else self._state_at(self.current_step)

        return next_state, reward, self.done

class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values live in ``q_table``, a dict keyed by ``tuple(state)`` whose
    values are NumPy arrays of length ``action_size``. Unseen states are
    initialised to zeros on first lookup.

    Args:
        state_size: stored on the instance; not used by any method here.
        action_size: number of discrete actions (length of each Q-row).
        learning_rate: step size applied to the temporal-difference error.
        discount_factor: weight of the best next-state Q-value.
        exploration_rate: probability of picking a random action.
        exploration_decay: factor multiplied into ``exploration_rate`` on
            every call to ``learn``.
    """

    def __init__(self, state_size, action_size, learning_rate=0.1, discount_factor=0.95, exploration_rate=1.0, exploration_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = {}

    def get_q_values(self, state):
        """Return the mutable Q-row for ``state``, creating a zero row if new."""
        key = tuple(state)
        try:
            return self.q_table[key]
        except KeyError:
            row = self.q_table[key] = np.zeros(self.action_size)
            return row

    def choose_action(self, state):
        """Pick a random action with probability ``exploration_rate``,
        otherwise the action with the highest Q-value for ``state``."""
        if np.random.rand() < self.exploration_rate:
            return np.random.choice(self.action_size)
        return np.argmax(self.get_q_values(state))

    def learn(self, state, action, reward, next_state):
        """Update ``Q[state][action]`` from one transition and decay exploration.

        ``next_state`` may be ``None`` to mark a terminal transition, in which
        case the bootstrap value is 0.
        """
        q_values = self.get_q_values(state)
        # Compare against None explicitly: a truthiness test would also treat
        # an empty (but valid) state as terminal.
        q_next = 0 if next_state is None else np.max(self.get_q_values(next_state))

        td_error = reward + self.discount_factor * q_next - q_values[action]
        q_values[action] += self.learning_rate * td_error

        # Decay happens per learning step, not per episode.
        self.exploration_rate *= self.exploration_decay
        

def train_rl(data, episodes=10):
    """Train a Q-learning agent on ``data`` for ``episodes`` passes.

    Each episode replays the environment from its first row until it reports
    completion, updating the agent after every step. A progress line is
    printed per episode and the transposed Q-table is printed at the end.

    Returns:
        ``(agent, rewards_per_episode)``: the trained agent and a list with
        the summed reward of each episode.
    """
    market = MarketEnvironment(data)
    learner = QLearningAgent(state_size=5, action_size=3)
    episode_totals = []

    for episode in range(episodes):
        observation = market.reset()
        total_reward = 0
        finished = False

        while not finished:
            chosen = learner.choose_action(observation)
            following, gain, finished = market.step(chosen)
            learner.learn(observation, chosen, gain, following)
            total_reward += gain
            observation = following

        episode_totals.append(total_reward)
        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Exploration Rate: {learner.exploration_rate:.4f}")

    # One row per visited state, one column per action.
    q_frame = pd.DataFrame(learner.q_table)
    print(q_frame.T)
    print("----------------------------------------------------------------------------------------------------------")

    return learner, episode_totals

# Run training on the loaded table with the default episode count; keep the
# trained agent and the list of per-episode reward totals.
trained_agent, rewards = train_rl(data)



Episode 0, Total Reward: 1.89, Exploration Rate: 0.7403
Episode 1, Total Reward: 3.19, Exploration Rate: 0.5480
Episode 2, Total Reward: 11.01, Exploration Rate: 0.4057
Episode 3, Total Reward: 14.65, Exploration Rate: 0.3003
Episode 4, Total Reward: 12.62, Exploration Rate: 0.2223
Episode 5, Total Reward: 13.67, Exploration Rate: 0.1646
Episode 6, Total Reward: 14.49, Exploration Rate: 0.1218
Episode 7, Total Reward: 14.31, Exploration Rate: 0.0902
Episode 8, Total Reward: 15.41, Exploration Rate: 0.0668
Episode 9, Total Reward: 16.82, Exploration Rate: 0.0494
                                           0         1         2
-0.059697 -2.917016 -2.520329 2 -1  0.000000  0.162638 -0.000525
 0.261311 -2.921657 -2.520329 2 -1  0.031263 -0.004874 -0.001000
 0.174668 -2.925137 -2.520329 2 -1  0.013984  0.000902 -0.001000
 0.088026 -2.927459 -2.520329 2 -1  0.001699  0.053409 -0.001000
 0.001383 -2.933238 -2.520329 2 -1 -0.054200  0.114631  0.000000
-0.085260 -2.937886 -1.892661 2 -1 -0.0350