# Setup Model

In [2]:
import numpy as np
import random
from collections import deque
import tensorflow as tf


# Define the DQN class
class DQNAgent:
    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = 201
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(-30, 31)  # Random action between -action_size and action_size
        act_values = self.model.predict(state)
        return np.clip(int(np.argmax(act_values[0])) - self.action_size, -self.action_size, self.action_size)  # Clip the action to the range of -action_size to action_size


    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Define your environment and state representation
class StudentEnvironment:
    def __init__(self,max_steps=7):
        self.state_size = 4  # Current score, correct questions / 6,  user feedback, current streak
        self.action_size = 201  # Score update from -100 to 100
        self.current_streak = 0
        self.max_steps = max_steps
        self.steps = 0  # Initialize steps counter
        self.target_score = random.randint(0, 100)  # Randomly generate target score

    def reset(self):
        # Reset the environment and return initial state
        self.steps = 0
        self.current_streak = 0
        self.target_score = random.randint(0, 100)  # Randomly generate target score
        return np.zeros(self.state_size)

 

    def step(self, state, action,correct_questions,user_feedback):
        # Simulate taking an action in the environment and return the next state and reward
        self.steps += 1  # Increment step counter
        next_state = state.copy()

        next_state[0] += action  # Update score
        next_state[0] = np.clip(next_state[0], 0, 100)
        next_state[1] = correct_questions 
        next_state[2] = user_feedback
        if next_state[1] == 6 or next_state[1] == 0:
            self.current_streak += 1  # Increase streak if 6/6
        else:
            self.current_streak = 0  # Reset streak otherwise
        next_state[3] = self.current_streak  # Update current streak
     
        # Calculate reward based on how close the score is to the target of 3/6
        target_correct = 3

        
        if next_state[1] == target_correct:
            # Reward for being at the target difficulty
            reward = 1 
        elif next_state[1] == target_correct + 1 or next_state[1] == target_correct - 1:
            reward = 0.25 
        elif next_state[1] == target_correct + 2 or next_state[1] == target_correct - 2:
            reward = 0
        else:
            # Penalty for being away from the target difficulty
            reward = -0.25 


        # Apply heavy penalty for streaks
        reward -= next_state[2]

        # Ensure reward is within valid range
        reward = max(-1, reward)
        if self.steps >= self.max_steps:
            done = True
        else:
            done = False
        return next_state, reward, done
    
def progressive_randomizer(diff):
        choices = {
            range(55,101):[6],
            range(35,55):[6]*20 + [5]*16 + [4]*2 + [3] ,
            range(25, 35): [6]*15 + [5]*11 + [4]*8 + [3]*4 + [2],
            range(15, 25): [6]*6 + [5]*10 + [4]*6 + [3]*4 + [2]*3 + [1]*2 ,
            range(5, 15): [6]*3 + [5]*5 + [4]*7 + [3]*5 + [2]*3 + [1]*2 + [0],
            range(0, 5): [6]*2 + [5]*3 + [4]*5 + [3]*7 + [2]*4 + [1]*2 + [0],
            range(-5, 0): [6] + [5]*2 + [4]*4 + [3]*7 + [2]*5 + [1]*3 + [0]*2,
            range(-15, -5): [6] + [5]*2 + [4]*3 + [3]*5 + [2]*7 + [1]*5 + [0]*3,
            range(-25, -15):  [5] + [4]*2+ [3]*4 + [2]*6 + [1]*10 + [0]*6,
            range(-35, -25):  [4] + [3]*2 + [2]*8 + [1]*11 + [0]*15,
            range(-55, -35): [3] + [2]*2 + [1]*16 + [0]*20,
            range(-100, -55): [0],
        }

        for key in choices:
            if diff in key:
                return random.choice(choices[key])







version 2

In [2]:
import numpy as np
import random
from collections import deque
import tensorflow as tf


# Define the DQN class
class DQNAgent:
    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = 201
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
      model = tf.keras.models.Sequential()
      model.add(tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
      model.add(tf.keras.layers.Dropout(0.2))  # Adding dropout with a rate of 0.2 (20%)
      model.add(tf.keras.layers.Dense(24, activation='relu'))
      model.add(tf.keras.layers.Dropout(0.2))  # Adding dropout with a rate of 0.2 (20%)
      model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
      model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=self.learning_rate))
      return model


    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(-30, 31)  # Random action between -action_size and action_size
        act_values = self.model.predict(state)
        return np.clip(int(np.argmax(act_values[0])) - self.action_size, -self.action_size, self.action_size)  # Clip the action to the range of -action_size to action_size


    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Define your environment and state representation
class StudentEnvironment:
    def __init__(self,max_steps=7):
        self.state_size = 5 # Current score, correct questions / 6, streak of 6/6, user feedback, current streak, user score
        self.action_size = 201  # Score update from -100 to 100
        self.current_streak = 0
        self.max_steps = max_steps
        self.steps = 0  # Initialize steps counter
        self.target_score = random.randint(0, 100)  # Randomly generate target score

    def reset(self):
        # Reset the environment and return initial state
        self.steps = 0
        self.current_streak = 0
        self.target_score = random.randint(0, 100)  # Randomly generate target score
        return np.zeros(self.state_size)

    def progressive_randomizer(self,score):
        choices = {
            range(55,101):[6],
            range(35,55):[6]*20 + [5]*16 + [4]*5,
            range(25, 35): [6]*15 + [5]*11 + [4]*8 + [3]*4 ,
            range(15, 25): [6]*6 + [5]*8 + [4]*6 + [3]*4 + [2]*3 ,
            range(5, 15): [6] + [5]*5 + [4]*10 + [3]*8 + [2]*3 + [1]*2 + [0],
            range(0, 5): [6] + [5]*3 + [4]*5 + [3]*15 + [2]*4 + [1]*2 + [0],
            range(-5, 0): [6] + [5]*2 + [4]*4 + [3]*15 + [2]*5 + [1]*3 + [0]*2,
            range(-15, -5): [0] + [1]*5 + [2]*10 + [3]*8 + [4]*3 + [5]*2 + [6],
            range(-25, -15): [0]*6 + [1]*8 + [2]*6 + [3]*4 + [4]*3 ,
            range(-35, -25): [0]*15 + [1]*11 + [2]*8 + [3]*4 ,
            range(-55,-35) : [0]*20 + [1]*16 + [2]*5,
            range(-100,-55):[0]
        }

        for key in choices:
            if score in key:
                return random.choice(choices[key])

    def step(self, state, action):
        # Simulate taking an action in the environment and return the next state and reward
        self.steps += 1  # Increment step counter
        next_state = state.copy()

        next_state[0] += action  # Update score
        next_state[0] = np.clip(next_state[0], 0, 100)
        diff = self.target_score-next_state[0]
        next_state[1] = self.progressive_randomizer(diff)  # Randomly generate correct questions
        next_state[2] = random.choice([-1,-0.5,0,0,0,0,0,0,0,0.5,1])  # Randomly generate user feedback
        if next_state[1] == 6 or next_state[1] == 0:
            self.current_streak += 1  # Increase streak if 6/6
        else:
            self.current_streak = 0  # Reset streak otherwise
        next_state[3] = self.current_streak  # Update current streak
        next_state[4] = next_state[0]  # Set user score equal to current score
        # Calculate reward based on how close the score is to the target of 3/6
        target_correct = 3
        penalty_multiplier = 0.25  # Adjust as needed


        if next_state[1] == target_correct:
            # Reward for being at the target difficulty
            reward = 1
        elif next_state[1] == target_correct + 1 or next_state[1] == target_correct - 1:
            reward = 0.25
        elif next_state[1] == target_correct + 2 or next_state[1] == target_correct - 2:
            reward = 0
        else:
            # Penalty for being away from the target difficulty
            reward = -0.25


        # Apply heavy penalty for streaks
        reward -= next_state[2] * penalty_multiplier

        # Ensure reward is within valid range
        reward = max(-1, reward)
        if self.steps >= self.max_steps:
            done = True
        else:
            done = False
        return next_state, reward, done








# Train model

In [3]:
# Training Loop
env = StudentEnvironment(max_steps=10)
state_size = env.state_size
action_size = env.action_size
agent = DQNAgent(state_size)

batch_size = 32
episodes = 100
            
for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    total_reward = 0

    while not done:
        action = agent.act(state)
        print(f"action: {action}")
        diff = env.target_score - state[0][0]
        correct_questions = progressive_randomizer(diff)
        user_feedback = random.choice([-1,-0.5,0,0,0,0,0,0,0,0.5,1])  # Randomly generate user feedback
        next_state, reward, done = env.step(state[0], action,correct_questions,user_feedback)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay

    print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")
agent.model.save_weights("dqn_model_final_weights.h5")
agent.model.save("dqn_model_final.h5")







action: 21


NameError: name 'progressive_randomizer' is not defined

version 2

In [4]:
# Training Loop
env = StudentEnvironment(max_steps=6)
state_size = env.state_size
action_size = env.action_size
agent = DQNAgent(state_size)

batch_size = 32
episodes = 150

for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    total_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done = env.step(state[0], action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay

    print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

agent.model.save_weights("dqn_model_final_weights_v2.h5")
agent.model.save("dqn_model_final_v2.h5")



Episode 1/150, Total Reward: -1.0
Episode 2/150, Total Reward: 2.375
Episode 3/150, Total Reward: 0.875
Episode 4/150, Total Reward: 0.25
Episode 5/150, Total Reward: -0.375






Episode 6/150, Total Reward: -1.875
Episode 7/150, Total Reward: 1.375
Episode 8/150, Total Reward: 1.25
Episode 9/150, Total Reward: 1.125
Episode 10/150, Total Reward: -0.875
Episode 11/150, Total Reward: -1.375
Episode 12/150, Total Reward: 0.0
Episode 13/150, Total Reward: -1.0
Episode 14/150, Total Reward: -0.125
Episode 15/150, Total Reward: -1.375
Episode 16/150, Total Reward: 2.5
Episode 17/150, Total Reward: -0.75
Episode 18/150, Total Reward: -0.25
Episode 19/150, Total Reward: 0.625
Episode 20/150, Total Reward: 1.75
Episode 21/150, Total Reward: -0.5
Episode 22/150, Total Reward: 1.375
Episode 23/150, Total Reward: -0.625
Episode 24/150, Total Reward: -1.5
Episode 25/150, Total Reward: 2.25
Episode 26/150, Total Reward: -1.375
Episode 27/150, Total Reward: -1.375
Episode 28/150, Total Reward: -1.375
Episode 29/150, Total Reward: 0.125
Episode 30/150, Total Reward: 0.625
Episode 31/150, Total Reward: -0.625
Episode 32/150, Total Reward: -2.0
Episode 33/150, Total Reward: 1.2

  saving_api.save_model(


# Test Model

## test model

In [5]:
# Initialize environment and agent
env = StudentEnvironment(max_steps=10)
state_size = env.state_size
action_size = env.action_size

# Build the model architecture
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(24, input_dim=state_size, activation='relu'))
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(action_size, activation='linear'))

# Load the trained model weights
model.load_weights("dqn_model_final_weights_v2.h5")

# Testing parameters
test_episodes = 100

# Testing Loop
for episode in range(test_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    total_reward = 0

    while not done:
        # Use model.predict to get Q-values for each action
        q_values = model.predict(state)[0]
        
        # Clip the Q-values to the action space
        q_values = np.clip(q_values, -action_size, action_size)
        
        # Select the action with the highest Q-value
        action = np.argmax(q_values)
        
        # Adjust the action to be within the desired range
        action -= action_size
        action = np.clip(action, -action_size, action_size)  # Ensure action is within range
        
        print(f"Action: {action}")
        
        diff = env.target_score - state[0][0]
        correct_questions = progressive_randomizer(diff)
        user_feedback = random.choice([-1,-0.5,0,0,0,0,0,0,0,0.5,1])  # Randomly generate user feedback
        next_state, reward, done = env.step(state[0], action, correct_questions, user_feedback)
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state
        total_reward += reward

        # Print target score and current score
        print(f"Target Score: {env.target_score}, Current Score: {state[0][0]}, Reward: {reward}")

    print(f"Test Episode {episode + 1}/{test_episodes}, Total Reward: {total_reward}")


Action: -190


NameError: name 'progressive_randomizer' is not defined

version 2

In [6]:
# Initialize environment and agent
env = StudentEnvironment(max_steps=10)
state_size = env.state_size
agent = DQNAgent(state_size)

# Load the trained model weights (assuming the model is saved after training)
agent.model.load_weights("dqn_model_final_weights_v2.h5")

# Testing parameters
test_episodes = 25

# Testing Loop
for episode in range(test_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    total_reward = 0

    while not done:
        action = agent.act(state)  # Choose action based on learned policy
        print(f"action : {action}")
        next_state, reward, done = env.step(state[0], action)  # Take action in the environment
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state
        total_reward += reward

        # Print target score and current score
        print(f"Target Score: {env.target_score}, Current Score: {state[0][0]}, reward: {reward}")

    print(f"Test Episode {episode + 1}/{test_episodes}, Total Reward: {total_reward}")




action : 9
Target Score: 22, Current Score: 9.0, reward: 0.375
action : 18
Target Score: 22, Current Score: 27.0, reward: -0.125
action : 23
Target Score: 22, Current Score: 50.0, reward: 0.0
action : -4
Target Score: 22, Current Score: 46.0, reward: 0.0
action : 15
Target Score: 22, Current Score: 61.0, reward: 0.125
action : -16
Target Score: 22, Current Score: 45.0, reward: 0.0
action : -3
Target Score: 22, Current Score: 42.0, reward: 0.25
action : -24
Target Score: 22, Current Score: 18.0, reward: 1.0
action : -27
Target Score: 22, Current Score: 0.0, reward: 0.125
action : 17
Target Score: 22, Current Score: 17.0, reward: 1.0
Test Episode 1/25, Total Reward: 2.75
action : -29
Target Score: 81, Current Score: 0.0, reward: -0.375
action : 24
Target Score: 81, Current Score: 24.0, reward: -0.25
action : 17
Target Score: 81, Current Score: 41.0, reward: 0.25
action : -14
Target Score: 81, Current Score: 27.0, reward: 0.0
action : -17
Target Score: 81, Current Score: 10.0, reward: -0.