<a href="https://colab.research.google.com/github/2303A51758/2303A51758-b-11-PDS/blob/main/reinforcement_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import random
import gym
from gym import spaces
import torch
import torch.nn as nn
import torch.optim as optim

# -------------------------------
# Step 1: Custom RL Environment
# -------------------------------
class SocialMediaEnv(gym.Env):
    """Toy environment modelling a student's social-media habits.

    Observation: ``[hours_on_social_media, GPA, sleep_hours, stress_level]``
    as a float32 vector (matching ``observation_space``).

    Actions: 0 = no change, 1 = reduce usage, 2 = awareness program.

    The classic 4-tuple Gym API is used: ``reset()`` returns the observation
    and ``step()`` returns ``(observation, reward, done, info)``.
    """

    def __init__(self, max_steps=1000):
        """Build the spaces and draw an initial state.

        Args:
            max_steps: Maximum number of ``step`` calls per episode before the
                episode is force-ended (``None`` disables the cap).  Without a
                cap, a policy that always picks action 0 never reaches the
                terminal condition and the episode loops forever.
        """
        super().__init__()

        # Example state: [hours_on_social_media, GPA, sleep_hours, stress_level]
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0], dtype=np.float32),
            high=np.array([12, 10, 12, 10], dtype=np.float32),
            dtype=np.float32,
        )

        # Actions: 0 = No change, 1 = Reduce usage, 2 = Awareness program
        self.action_space = spaces.Discrete(3)

        self.max_steps = max_steps
        self._steps = 0
        self.state = None
        self.reset()

    def step(self, action):
        """Apply ``action`` and advance the simulation by one step.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is
            ``gpa + sleep - hours - stress`` and ``done`` is True when
            ``gpa >= 9.5`` or ``sleep >= 10``, or when the step cap is hit.
            In the latter case ``info["TimeLimit.truncated"]`` is True.
        """
        # Work on plain Python floats so the reward and the termination flag
        # are ordinary float/bool values rather than numpy scalars.
        hours, gpa, sleep, stress = (float(v) for v in self.state)

        # Simulate action effects
        if action == 1:  # reduce usage
            hours = max(0.0, hours - 1)
            gpa = min(10.0, gpa + 0.1)
            stress = max(0.0, stress - 0.1)
        elif action == 2:  # awareness program
            hours = max(0.0, hours - 0.5)
            sleep = min(12.0, sleep + 0.2)

        # Reward: better GPA & sleep, lower stress
        reward = gpa + sleep - hours - stress
        self.state = np.array([hours, gpa, sleep, stress], dtype=np.float32)
        self._steps += 1

        terminated = gpa >= 9.5 or sleep >= 10  # stop if very good condition
        truncated = (
            not terminated
            and self.max_steps is not None
            and self._steps >= self.max_steps
        )
        info = {"TimeLimit.truncated": True} if truncated else {}
        return self.state, reward, terminated or truncated, info

    def reset(self):
        """Start a new episode from a randomly drawn state and return it."""
        self._steps = 0
        self.state = np.array(
            [
                random.randint(3, 8),   # hours
                random.uniform(4, 7),   # GPA
                random.uniform(5, 8),   # sleep
                random.uniform(3, 6),   # stress
            ],
            dtype=np.float32,
        )
        return self.state

# -------------------------------
# Step 2: Q-Learning
# -------------------------------
def q_learning(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.2):
    """Train a tabular Q-learning agent on ``env``.

    The 4-dimensional observation is discretized into unit-width buckets,
    20 per dimension.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, and ``action_space`` with ``n``
            and ``sample()``.
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a random exploratory action.

    Returns:
        The learned Q-table, shape ``(20, 20, 20, 20, n_actions)``.
    """
    bins = 20
    q_table = np.zeros((bins, bins, bins, bins, env.action_space.n))  # discretized

    def discretize(state):
        # Floor into unit buckets, then clamp: an unclamped negative value
        # would silently wrap to the end of the table and a value >= 20
        # would raise IndexError.
        idx = np.floor(np.asarray(state, dtype=float)).astype(int)
        return tuple(np.clip(idx, 0, bins - 1))

    for _ in range(episodes):
        d_state = discretize(env.reset())

        done = False
        while not done:
            # Epsilon-greedy action selection.
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[d_state])

            new_state, reward, done, _ = env.step(action)
            d_new = discretize(new_state)

            # No bootstrapping past the end of an episode: the value of a
            # terminal transition is just its reward.
            best_next = 0.0 if done else np.max(q_table[d_new])
            td_target = reward + gamma * best_next
            q_table[d_state][action] += alpha * (td_target - q_table[d_state][action])

            d_state = d_new
    return q_table

# -------------------------------
# Step 3: SARSA
# -------------------------------
def sarsa(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.2):
    """Train a tabular SARSA (on-policy TD) agent on ``env``.

    The 4-dimensional observation is discretized into unit-width buckets,
    20 per dimension.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, and ``action_space`` with ``n``
            and ``sample()``.
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a random exploratory action.

    Returns:
        The learned Q-table, shape ``(20, 20, 20, 20, n_actions)``.
    """
    bins = 20
    q_table = np.zeros((bins, bins, bins, bins, env.action_space.n))

    def discretize(state):
        # Floor into unit buckets, then clamp so out-of-range observations
        # can neither wrap around (negative index) nor raise IndexError.
        idx = np.floor(np.asarray(state, dtype=float)).astype(int)
        return tuple(np.clip(idx, 0, bins - 1))

    def choose_action(d_state):
        # Epsilon-greedy behaviour policy, used for every action (including
        # the first one of an episode) so the updates stay on-policy.
        if random.uniform(0, 1) < epsilon:
            return env.action_space.sample()
        return int(np.argmax(q_table[d_state]))

    for _ in range(episodes):
        d_state = discretize(env.reset())
        action = choose_action(d_state)

        done = False
        while not done:
            new_state, reward, done, _ = env.step(action)
            d_new = discretize(new_state)
            new_action = choose_action(d_new)

            # A terminal transition has no successor value to bootstrap from.
            next_q = 0.0 if done else q_table[d_new][new_action]
            td_error = reward + gamma * next_q - q_table[d_state][action]
            q_table[d_state][action] += alpha * td_error

            d_state, action = d_new, new_action
    return q_table

# -------------------------------
# Step 4: Deep Q-Network (DQN)
# -------------------------------
class DQN(nn.Module):
    """Small fully connected Q-network.

    Maps a state vector of length ``state_size`` to ``action_size`` outputs
    through a single 64-unit ReLU hidden layer.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(state_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_size),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input ``x``."""
        q_values = self.fc(x)
        return q_values

def train_dqn(env, episodes=500, gamma=0.9, epsilon=0.2, lr=0.001):
    """Train a DQN on ``env`` with online one-step TD updates.

    There is no replay buffer or target network: after every environment
    step the network is regressed (MSE) towards the one-step TD target for
    the action that was taken.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, ``observation_space.shape`` and
            ``action_space`` with ``n`` and ``sample()``.
        episodes: Number of training episodes.
        gamma: Discount factor.
        epsilon: Probability of taking a random exploratory action.
        lr: Adam learning rate.

    Returns:
        The trained ``DQN`` model.
    """
    model = DQN(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for _ in range(episodes):
        state = torch.tensor(env.reset(), dtype=torch.float32)
        done = False
        while not done:
            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    action = torch.argmax(model(state)).item()

            new_state, reward, done, _ = env.step(action)
            new_state = torch.tensor(new_state, dtype=torch.float32)

            # The target is a constant for this update, so evaluate it
            # without tracking gradients, and do not bootstrap from the
            # state that ended the episode.
            with torch.no_grad():
                next_q = 0.0 if done else torch.max(model(new_state)).item()
            target = torch.tensor(float(reward) + gamma * next_q, dtype=torch.float32)
            pred = model(state)[action]

            loss = criterion(pred, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = new_state

    # Hand the trained network back; otherwise the training is lost.
    return model

# -------------------------------
# Step 5: Policy Gradient (REINFORCE)
# -------------------------------
class PolicyNetwork(nn.Module):
    """Small fully connected policy network.

    Maps a state vector of length ``state_size`` to a probability
    distribution over ``action_size`` actions (64-unit ReLU hidden layer
    followed by a softmax over the last dimension).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(state_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_size),
            nn.Softmax(dim=-1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the action probabilities for input ``x``."""
        action_probs = self.fc(x)
        return action_probs

def reinforce(env, episodes=500, gamma=1.0, lr=0.001):
    """Train a policy network on ``env`` with REINFORCE (Monte-Carlo policy gradient).

    One full episode is sampled from the current policy, then a single
    gradient step is taken on ``-sum_t log pi(a_t | s_t) * G_t`` where
    ``G_t`` is the (discounted) return collected from step ``t`` onwards.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, ``observation_space.shape`` and
            ``action_space.n``.
        episodes: Number of training episodes.
        gamma: Discount factor for the returns.  The default of 1.0 keeps
            the undiscounted objective.
        lr: Adam learning rate.

    Returns:
        The trained ``PolicyNetwork``.
    """
    policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(policy.parameters(), lr=lr)

    for _ in range(episodes):
        state = torch.tensor(env.reset(), dtype=torch.float32)
        rewards, log_probs = [], []
        done = False
        while not done:
            dist = torch.distributions.Categorical(probs=policy(state))
            action = dist.sample()
            log_probs.append(dist.log_prob(action))

            new_state, reward, done, _ = env.step(action.item())
            rewards.append(float(reward))
            state = torch.tensor(new_state, dtype=torch.float32)

        # Reward-to-go: credit each action only with the rewards that came
        # after it.  Rewards earned before an action cannot depend on it, so
        # including them only adds variance to the gradient estimate.
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + gamma * running
            returns.append(running)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        loss = -(torch.stack(log_probs) * returns).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Hand the trained policy back; otherwise the training is lost.
    return policy

# -------------------------------
# Step 6: Actor-Critic
# -------------------------------
class ActorCritic(nn.Module):
    """Actor and critic heads on top of one shared 64-unit hidden layer."""

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        self.fc = nn.Linear(state_size, hidden_units)      # shared trunk
        self.actor = nn.Linear(hidden_units, action_size)  # action logits
        self.critic = nn.Linear(hidden_units, 1)           # state value

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``."""
        hidden = torch.relu(self.fc(x))
        action_probs = torch.softmax(self.actor(hidden), dim=-1)
        state_value = self.critic(hidden)
        return action_probs, state_value

def actor_critic(env, episodes=500, gamma=0.9, lr=0.001):
    """Train a one-step actor-critic agent on ``env``.

    After every environment step the TD error
    ``reward + gamma * V(s') - V(s)`` is used both as the advantage that
    weights the policy-gradient term and as the regression error for the
    critic.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, ``observation_space.shape`` and
            ``action_space.n``.
        episodes: Number of training episodes.
        gamma: Discount factor.
        lr: Adam learning rate.

    Returns:
        The trained ``ActorCritic`` model.
    """
    ac = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(ac.parameters(), lr=lr)

    for _ in range(episodes):
        state = torch.tensor(env.reset(), dtype=torch.float32)
        done = False
        while not done:
            probs, value = ac(state)
            value = value.squeeze(-1)  # shape (1,) -> scalar
            action = torch.multinomial(probs, 1).item()

            new_state, reward, done, _ = env.step(action)
            new_state = torch.tensor(new_state, dtype=torch.float32)

            # The bootstrap value is a fixed target: no gradient through it,
            # and nothing to bootstrap from once the episode has ended.
            with torch.no_grad():
                next_value = torch.zeros(()) if done else ac(new_state)[1].squeeze(-1)

            # Keep `value` attached to the graph here.  Rebuilding the
            # advantage from `.item()` values would make the critic loss a
            # constant, so the critic head would never receive a gradient.
            advantage = float(reward) + gamma * next_value - value

            # The actor treats the advantage as a constant weight; only the
            # critic loss back-propagates through `value`.
            actor_loss = -torch.log(probs[action]) * advantage.detach()
            critic_loss = advantage.pow(2)
            loss = actor_loss + critic_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = new_state

    # Hand the trained model back; otherwise the training is lost.
    return ac

# -------------------------------
# Example Run
# -------------------------------
# Build one environment and train each algorithm on it in turn.
env = SocialMediaEnv()
for run_algorithm in (q_learning, sarsa, train_dqn, reinforce, actor_critic):
    run_algorithm(env)

print("All RL algorithms implemented and trained!")

All RL algorithms implemented and trained!
