In [8]:
# Install nbconvert; a later cell uses it to convert RPT.ipynb into an importable RPT.py module.
# NOTE(review): the install is unpinned — consider `%pip install nbconvert==<version>` so reruns are reproducible.
!pip install nbconvert



In [9]:
# -*- coding: utf-8 -*-
"""
PPO Dynamic Pricing with Gymnasium + RPT Integrated
Tích hợp đầy đủ: RPT Demand → PPO Pricing → Giá đề xuất
Không phụ thuộc file rpt.py / news_embeding.py
"""

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pickle
import json
import pandas as pd
from datetime import datetime, timedelta
import os
import warnings
from typing import Tuple
from collections import deque
import gymnasium as gym
from gymnasium import spaces
# Fetch the project repository and make it the working directory, so that later cells
# can run nbconvert on notebooks that live in the repo (e.g. RPT.ipynb).
# NOTE(review): presumably re-running this cell in the same session fails the clone
# (target directory already exists) and `%cd` then resolves relative to the new cwd — confirm.
!git clone https://github.com/FongNgoo/Basic_Dynamic_Prices_base_on_Demand_Model.git
%cd Basic_Dynamic_Prices_base_on_Demand_Model
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

Cloning into 'Basic_Dynamic_Prices_base_on_Demand_Model'...
remote: Enumerating objects: 187, done.[K
remote: Counting objects: 100% (187/187), done.[K
remote: Compressing objects: 100% (144/144), done.[K
remote: Total 187 (delta 44), reused 185 (delta 42), pack-reused 0 (from 0)[K
Receiving objects: 100% (187/187), 10.73 MiB | 8.11 MiB/s, done.
Resolving deltas: 100% (44/44), done.
/content/Basic_Dynamic_Prices_base_on_Demand_Model


In [13]:

# Convert AUTO_SYNC_COLAB_GITHUB_SSH.ipynb into a .py script in the current directory.
# NOTE(review): the saved output under this cell is a recursive directory listing, not
# nbconvert output, and that listing shows no AUTO_SYNC_COLAB_GITHUB_SSH.ipynb in the
# repository — confirm the notebook exists and re-run the cell so output matches the code.
!jupyter nbconvert --to python AUTO_SYNC_COLAB_GITHUB_SSH.ipynb

/content:
Basic_Dynamic_Prices_base_on_Demand_Model  drive  sample_data

/content/Basic_Dynamic_Prices_base_on_Demand_Model:
Cache
Data
Dynamic_Price_Project_Metadata.ipynb
dynamic-pricing-strategy-driven-by-deep-reinforcement-learning-with-empirical-analysis-on-the-collaborative.pdf
Git_Repo.ipynb
Logs
News_Embeding.ipynb
Output
PPO.ipynb
RPT.ipynb

/content/Basic_Dynamic_Prices_base_on_Demand_Model/Cache:
summary_01-02-24.json  summary_10-08-24.json  summary_21-01-24.json
summary_01-06-24.json  summary_10-10-24.json  summary_21-02-24.json
summary_01-07-24.json  summary_10-12-24.json  summary_21-05-24.json
summary_01-08-24.json  summary_11-01-24.json  summary_21-08-24.json
summary_01-09-24.json  summary_11-02-24.json  summary_21-12-24.json
summary_01-10-24.json  summary_11-03-24.json  summary_22-01-24.json
summary_02-01-24.json  summary_11-05-24.json  summary_22-03-24.json
summary_02-04-24.json  summary_11-06-24.json  summary_22-05-24.json
summary_02-05-24.json  summary_11-07-24.json 

# Config Set Up

In [None]:
# All artefacts live in one Google Drive folder; only the file name differs.
_DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/Colab_Notebooks/Basic_Dynamic_Prices_base_on_Demand_Model/Output"

RPT_MODEL_PATH = _DRIVE_OUTPUT_DIR + "/rpt_demand_best.pth"      # RPT demand-model weights (read)
PREPROCESSED_NPZ = _DRIVE_OUTPUT_DIR + "/preprocessed_data.npz"  # arrays 'X' and 'dates' (read)
SCALERS_PKL = _DRIVE_OUTPUT_DIR + "/scalers.pkl"                 # pickled scalers dict (read)
PPO_MODEL_SAVE = _DRIVE_OUTPUT_DIR + "/ppo_pricing_best.pth"     # best PPO actor weights (written)

## PPO hyperparameters

In [None]:
# Width of every slice of the flat state vector, listed in concatenation order.
_STATE_LAYOUT = {
    'demand': 3,        # predicted demand per room type
    'env': 8,           # environment features
    'available': 3,     # available rooms per room type
    'news': 45,         # news embedding
    'rev_hist': 10,     # last ten realised revenues
    'current_rev': 1,   # revenue of the most recent day
}
STATE_DIM = sum(_STATE_LAYOUT.values())  # 70 inputs for both actor and critic
ACTION_DIM = 3          # one price per room type: single, double, vip
HIDDEN_DIM = 256        # width of each hidden layer

# PPO optimisation settings
PPO_EPOCHS = 4          # gradient passes over each collected batch
CLIP_EPS = 0.2          # clipping range of the probability ratio
GAMMA = 0.99            # discount factor
LAMBDA = 0.95           # GAE smoothing factor
BATCH_SIZE = 32         # minimum number of transitions before an update runs
LEARNING_RATE = 3e-4
MAX_GRAD_NORM = 0.5     # gradient-norm clipping threshold

# Subtracted from the reward whenever the chosen prices break the constraints.
PARITY_PENALTY = 1e7

## Price Constraints



In [None]:
# Absolute price floor and ceiling per room type (the print-outs label prices as VND).
MIN_PRICE = dict(single=800_000, double=1_200_000, vip=2_500_000)
MAX_PRICE = dict(single=3_000_000, double=5_000_000, vip=10_000_000)

# Allowed (low, high) range for each price expressed as a multiple of the single price.
PRICE_PARITY = dict(
    double_single_ratio=(1.3, 1.8),
    vip_single_ratio=(2.0, 3.0),
)

# Number of consecutive time steps sliced out of X as model history.
WINDOW_SIZE = 60

# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the scalers dictionary pickled at SCALERS_PKL.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file — only
# load SCALERS_PKL from a trusted location.
with open(SCALERS_PKL, 'rb') as f:
    scalers = pickle.load(f)
# NOTE(review): price_scaler is not referenced by any other cell visible here — confirm it is still needed.
price_scaler = scalers['price']

# Convert RPT.ipynb (in the current working directory) to RPT.py, then import it.
!jupyter nbconvert --to python RPT.ipynb
import RPT
# NOTE(review): star import hides where names come from; the cells below use RPTModel
# and predict_demand, which presumably come from RPT — consider importing them explicitly.
from RPT import *
print("IMPORT QUA GIT TH√ÄNH C√îNG!")

[NbConvertApp] Converting notebook RPT.ipynb to python
[NbConvertApp] Writing 10737 bytes to RPT.py
IMPORT QUA GIT TH√ÄNH C√îNG!


# GIÁ HỢP LỆ (MIN/MAX + PARITY)

In [None]:
def is_valid_price(prices: np.ndarray) -> bool:
    """Tell whether a (single, double, vip) price triple satisfies every constraint.

    Args:
        prices: Three values unpacked as single, double and vip price.

    Returns:
        True only when each price lies inside its [MIN_PRICE, MAX_PRICE]
        interval and both the double/single and vip/single ratios fall inside
        the ranges configured in PRICE_PARITY; False otherwise.
    """
    single, double, vip = prices

    # Absolute bounds are checked first, so the divisions below are reached only
    # once the single price is known to be at least MIN_PRICE['single'].
    for room, value in (('single', single), ('double', double), ('vip', vip)):
        if not (MIN_PRICE[room] <= value <= MAX_PRICE[room]):
            return False

    ds_low, ds_high = PRICE_PARITY['double_single_ratio']
    vs_low, vs_high = PRICE_PARITY['vip_single_ratio']
    double_ratio = double / single
    vip_ratio = vip / single

    if not (ds_low <= double_ratio <= ds_high):
        return False
    if not (vs_low <= vip_ratio <= vs_high):
        return False
    return True

def project_to_valid(prices: np.ndarray) -> np.ndarray:
    """Project a (single, double, vip) price triple onto the valid region.

    The valid region is the one accepted by ``is_valid_price``: every price
    inside its [MIN_PRICE, MAX_PRICE] interval, and the double/single and
    vip/single ratios inside the ranges in PRICE_PARITY.

    The single price anchors both ratios, so it is first restricted to the
    values for which a valid double AND a valid vip price exist. Clipping it
    only to its own min/max is not enough: with the configured constants a
    single price of 800,000 allows a vip price of at most 2,400,000, which is
    below MIN_PRICE['vip'], so the old projection returned prices that still
    violated the constraints.

    Args:
        prices: Three prices ordered single, double, vip. Not modified.

    Returns:
        A new float array with the projected prices.
    """
    # Copy as float so integer inputs are not truncated on assignment.
    p = np.array(prices, dtype=float)

    ds_low, ds_high = PRICE_PARITY['double_single_ratio']
    vs_low, vs_high = PRICE_PARITY['vip_single_ratio']

    # Single prices for which the double and vip intervals below are non-empty.
    # If the configuration itself were infeasible (low > high), np.clip
    # resolves to the upper bound.
    single_low = max(MIN_PRICE['single'],
                     MIN_PRICE['double'] / ds_high,
                     MIN_PRICE['vip'] / vs_high)
    single_high = min(MAX_PRICE['single'],
                      MAX_PRICE['double'] / ds_low,
                      MAX_PRICE['vip'] / vs_low)
    p[0] = np.clip(p[0], single_low, single_high)
    p_s = p[0]

    # Double: intersection of its absolute bounds and the parity band.
    min_d = max(MIN_PRICE['double'], ds_low * p_s)
    max_d = min(MAX_PRICE['double'], ds_high * p_s)
    p[1] = np.clip(p[1], min_d, max_d)

    # Vip: intersection of its absolute bounds and the parity band.
    min_v = max(MIN_PRICE['vip'], vs_low * p_s)
    max_v = min(MAX_PRICE['vip'], vs_high * p_s)
    p[2] = np.clip(p[2], min_v, max_v)

    return p

# PPO ACTOR-CRITIC VỚI ACTION MASKING

In [None]:
class PPOActor(nn.Module):
    """Gaussian policy network: state -> (mean, std) of the normalised action.

    Two Tanh hidden layers feed a linear head whose output is squashed to
    [-1, 1]. The standard deviation is a learned, state-independent parameter
    shared by every input.
    """

    def __init__(self):
        super().__init__()
        trunk = [
            nn.Linear(STATE_DIM, HIDDEN_DIM),
            nn.Tanh(),
            nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*trunk)
        self.mu_head = nn.Linear(HIDDEN_DIM, ACTION_DIM)
        # log(std) starts at 0, i.e. std = 1 for every action dimension.
        self.log_std = nn.Parameter(torch.zeros(ACTION_DIM))

    def forward(self, x):
        """Return ``(mu, std)``: mu in [-1, 1] per action dimension, std of length ACTION_DIM."""
        hidden = self.net(x)
        mean = self.mu_head(hidden).tanh()
        # Keep log(std) inside [-20, 2] before exponentiating.
        bounded_log_std = torch.clamp(self.log_std, min=-20, max=2)
        return mean, bounded_log_std.exp()

In [None]:
class PPOCritic(nn.Module):
    """State-value network: two Tanh hidden layers followed by a scalar output."""

    def __init__(self):
        super().__init__()
        widths = (STATE_DIM, HIDDEN_DIM, HIDDEN_DIM)
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.Tanh())
        layers.append(nn.Linear(HIDDEN_DIM, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1."""
        return self.net(x)

In [None]:
class PPOAgent:
    """PPO agent holding an actor, a critic and one Adam optimiser for each.

    Actions live in two spaces: the actor works in a normalised space where
    each price's [MIN_PRICE, MAX_PRICE] interval maps to [-1, 1]; the
    environment receives raw prices that have been passed through
    ``project_to_valid``.
    """

    # Fixed order of the three action dimensions.
    _ROOM_TYPES = ('single', 'double', 'vip')

    def __init__(self):
        self.actor = PPOActor().to(DEVICE)
        self.critic = PPOCritic().to(DEVICE)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=LEARNING_RATE)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=LEARNING_RATE)

    @classmethod
    def _price_bounds(cls) -> Tuple[np.ndarray, np.ndarray]:
        """Return (mins, maxs) arrays built from MIN_PRICE / MAX_PRICE."""
        mins = np.array([MIN_PRICE[k] for k in cls._ROOM_TYPES], dtype=np.float64)
        maxs = np.array([MAX_PRICE[k] for k in cls._ROOM_TYPES], dtype=np.float64)
        return mins, maxs

    def _normalize(self, prices: np.ndarray) -> np.ndarray:
        """Map raw prices to the normalised space (MIN -> -1, MAX -> +1)."""
        mins, maxs = self._price_bounds()
        return 2 * (prices - mins) / (maxs - mins + 1e-8) - 1

    def _denormalize(self, norm_prices: np.ndarray) -> np.ndarray:
        """Map normalised values back to raw prices (-1 -> MIN, +1 -> MAX)."""
        mins, maxs = self._price_bounds()
        return 0.5 * (norm_prices + 1) * (maxs - mins) + mins

    def select_action(self, state: np.ndarray,
                      deterministic: bool = False) -> Tuple[np.ndarray, torch.Tensor]:
        """Choose prices for one state.

        Args:
            state: Flat state vector accepted by the actor.
            deterministic: When True use the policy mean instead of sampling,
                so repeated calls return the same prices. Defaults to False
                (sample), which is the behaviour training relies on.

        Returns:
            ``(prices, log_prob)``: the projected raw prices as a length-3
            array, and a shape-(1,) tensor with the log-probability of the
            projected action under the current policy, detached from the graph.
        """
        state_t = torch.as_tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)

        # Pure inference: no autograd graph is needed for acting.
        with torch.no_grad():
            mu, std = self.actor(state_t)
            dist = torch.distributions.Normal(mu, std)
            action_norm = mu if deterministic else dist.sample()

            # Masking step: force the proposed prices into the valid region.
            action_raw = project_to_valid(self._denormalize(action_norm.cpu().numpy()[0]))

            # The stored log-prob must describe the action actually executed,
            # i.e. the projected one, because update() re-evaluates that action.
            action_final = torch.as_tensor(
                self._normalize(action_raw), dtype=torch.float32, device=DEVICE
            ).unsqueeze(0)
            log_prob_final = dist.log_prob(action_final).sum(-1)

        return action_raw, log_prob_final

    def update(self, memory):
        """Run PPO_EPOCHS full-batch clipped-PPO updates on ``memory``.

        Args:
            memory: Sequence of tuples
                ``(state, raw_action, old_log_prob, advantage, return)``.
                Nothing happens when it holds fewer than BATCH_SIZE items.

        Every per-sample quantity is kept 1-D with length N. Previously the new
        log-probs and critic outputs had shape (N, 1) while the stored values
        had shape (N,), so the subtractions broadcast to (N, N) and each sample
        was paired with every other sample's advantage / return.
        """
        if len(memory) < BATCH_SIZE:
            return

        def _batch(values):
            # Stack in numpy first: much faster than building a tensor from a list of arrays.
            return torch.as_tensor(np.asarray(values, dtype=np.float32), device=DEVICE)

        states = _batch([m[0] for m in memory])
        actions = _batch([self._normalize(m[1]) for m in memory])
        old_log_probs = _batch([m[2] for m in memory])
        advantages = _batch([m[3] for m in memory])
        returns = _batch([m[4] for m in memory])

        # Rewards are on the scale of prices, so advantages are rescaled to zero
        # mean / unit variance to keep the policy gradient well conditioned.
        advantages = (advantages - advantages.mean()) / (advantages.std(unbiased=False) + 1e-8)

        for _ in range(PPO_EPOCHS):
            mu, std = self.actor(states)
            dist = torch.distributions.Normal(mu, std)
            new_log_probs = dist.log_prob(actions).sum(-1)   # (N,)
            entropy = dist.entropy().sum(-1)                 # (N,)

            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS) * advantages
            actor_loss = -torch.min(surr1, surr2).mean() - 0.01 * entropy.mean()

            self.actor_optim.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), MAX_GRAD_NORM)
            self.actor_optim.step()

            values = self.critic(states).squeeze(-1)         # (N, 1) -> (N,)
            critic_loss = ((values - returns) ** 2).mean()
            self.critic_optim.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), MAX_GRAD_NORM)
            self.critic_optim.step()

# ENVIRONMENT

In [None]:
class PricingEnv:
    """Replay-style pricing environment driven by the historical array ``X``.

    The class has gym-like ``reset``/``step`` methods but does not subclass a
    Gymnasium environment. An episode starts at index WINDOW_SIZE and advances
    one index per step until the second-to-last index of ``X`` is reached.

    NOTE(review): ``X[i]`` is indexed with three subscripts below, so ``X`` is
    presumably 4-D with features on the last axis; the window slices
    ``[:, :, :19]`` and ``[:, 0, -45:]`` act on the third axis of that 4-D
    window rather than on the feature axis — confirm this matches what the RPT
    model and ``predict_demand`` expect.
    """

    def __init__(self, X_data, dates, rpt_model):
        self.X = X_data
        self.dates = dates
        self.rpt_model = rpt_model
        self.idx = WINDOW_SIZE
        # Realised revenues of recent steps; only the last ten enter the state.
        self.revenue_history = deque(maxlen=11)

    def reset(self):
        """Rewind to the first index with a full history window and return its state."""
        self.idx = WINDOW_SIZE
        self.revenue_history.clear()
        return self._get_state()

    def _window_inputs(self):
        """Slice the WINDOW_SIZE entries before ``self.idx`` into (features, news) inputs."""
        history = self.X[self.idx - WINDOW_SIZE:self.idx]
        return history[:, :, :19], history[:, 0, -45:]

    def _get_state(self):
        """Assemble the flat float32 state vector for the current index.

        Concatenation order: predicted demand (single, double, vip), 8
        environment features, available rooms, 45-value news embedding, the
        last ten revenues (left-padded with zeros) and the most recent day's
        revenue.
        """
        feats, news_seq = self._window_inputs()

        # Demand forecast for the three room types.
        forecast = predict_demand(self.rpt_model, feats, news_seq, DEVICE)
        demand_vec = np.array([forecast[room] for room in ('single', 'double', 'vip')])

        # Everything else is read from the most recent observed entry.
        latest = self.X[self.idx - 1]
        env = latest[0, 0, 19:27]
        available = latest[:, :, 1].sum(axis=0)

        # Oldest-first revenue history, zero-padded on the left to length 10.
        recent = list(self.revenue_history)[-10:]
        rev_hist = np.pad(np.array(recent), (10 - len(recent), 0))

        news_emb = latest[0, 0, -45:]

        # Feature 0 times feature 2, summed over the entry (price * sold).
        current_rev = np.sum(latest[:, :, 0] * latest[:, :, 2])

        parts = [demand_vec, env, available, news_emb, rev_hist, [current_rev]]
        return np.concatenate(parts).astype(np.float32)

    def step(self, action_prices: np.ndarray):
        """Apply prices to the entry at ``self.idx`` and move one index forward.

        Returns:
            ``(next_state, reward, done, info)`` where the reward is the
            realised revenue minus the revenue expected from the RPT model's
            mixture mean, reduced by PARITY_PENALTY when ``is_valid_price``
            rejects the prices, and ``info`` holds ``"revenue"`` and ``"prices"``.
        """
        day = self.X[self.idx]
        sold = day[:, :, 2].sum(axis=0)
        available = day[:, :, 1].sum(axis=0)
        # Sales cannot exceed availability.
        revenue = np.sum(action_prices * np.minimum(sold, available))

        # Expected demand: mixture weights times component means, summed over components.
        feats, news_seq = self._window_inputs()
        with torch.no_grad():
            feats_t = torch.FloatTensor(feats).unsqueeze(0).to(DEVICE)
            news_t = torch.FloatTensor(news_seq).unsqueeze(0).to(DEVICE)
            pi, mu, _ = self.rpt_model(feats_t, news_t)
            exp_demand = (pi.unsqueeze(-1) * mu).sum(1).cpu().numpy()[0]
        exp_revenue = np.sum(action_prices * exp_demand)

        reward = revenue - exp_revenue

        # Penalise prices that break the min/max or parity constraints.
        if not is_valid_price(action_prices):
            reward -= PARITY_PENALTY

        self.revenue_history.append(revenue)
        self.idx += 1
        done = self.idx >= len(self.X) - 1

        next_state = self._get_state()
        info = {"revenue": revenue, "prices": action_prices}
        return next_state, reward, done, info

# TRAINING

In [None]:
def train_ppo():
    """Train the PPO pricing agent for 50 episodes over the historical data.

    Loads the RPT demand model from RPT_MODEL_PATH and the arrays ``X`` and
    ``dates`` from PREPROCESSED_NPZ, then runs one full pass of ``PricingEnv``
    per episode followed by one ``PPOAgent.update``. The actor weights are
    written to PPO_MODEL_SAVE whenever an episode beats the best total reward
    so far and its final action passes ``is_valid_price``.

    Returns:
        The trained ``PPOAgent`` (with the weights from the last episode).

    Raises:
        ValueError: If ``X`` has too few entries to run a single step.
    """
    print("=== HU·∫§N LUY·ªÜN PPO V·ªöI PRICE PARITY & MIN/MAX ===")
    rpt_model = RPTModel().to(DEVICE)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime.
    rpt_model.load_state_dict(torch.load(RPT_MODEL_PATH, map_location=DEVICE))
    rpt_model.eval()

    # Read both arrays eagerly so the archive can be closed straight away.
    with np.load(PREPROCESSED_NPZ, allow_pickle=True) as data:
        X, dates = data['X'], data['dates']

    # The environment starts at index WINDOW_SIZE and reads X[WINDOW_SIZE] on the first step.
    if len(X) <= WINDOW_SIZE:
        raise ValueError(
            f"Need more than {WINDOW_SIZE} entries in X to train, got {len(X)}"
        )

    env = PricingEnv(X, dates, rpt_model)
    agent = PPOAgent()

    best_reward = -float('inf')

    for ep in range(50):
        state = env.reset()
        total_reward = 0

        # Per-step rollout buffers for this episode.
        states, actions, log_probs, rewards, values = [], [], [], [], []
        while True:
            action, log_prob = agent.select_action(state)
            # Critic estimate V(s) of the state the action was taken in.
            with torch.no_grad():
                value = agent.critic(torch.FloatTensor(state).unsqueeze(0).to(DEVICE)).item()
            next_state, reward, done, info = env.step(action)

            states.append(state)
            actions.append(action)
            log_probs.append(log_prob.item())
            rewards.append(reward)
            values.append(value)

            total_reward += reward
            state = next_state
            if done:
                break

        # Generalised Advantage Estimation, walking the episode backwards.
        # The rollout always ends on done=True, so the value after the last step
        # is 0. (The previous code indexed values[i + 1] at the last step, which
        # is out of range, and used reward + value in place of the critic value.)
        n_steps = len(rewards)
        advantages = [0.0] * n_steps
        gae = 0.0
        next_value = 0.0
        for i in reversed(range(n_steps)):
            delta = rewards[i] + GAMMA * next_value - values[i]
            gae = delta + GAMMA * LAMBDA * gae
            advantages[i] = gae
            next_value = values[i]
        returns = [adv + val for adv, val in zip(advantages, values)]

        # Tuple layout consumed by PPOAgent.update:
        # (state, raw_action, old_log_prob, advantage, return)
        memory = list(zip(states, actions, log_probs, advantages, returns))
        agent.update(memory)

        print(f"Ep {ep+1:2d} | Reward: {total_reward:,.0f} | "
              f"Gi√°: {action[0]:,.0f} | {action[1]:,.0f} | {action[2]:,.0f}")

        # Only checkpoint episodes whose final action satisfies every constraint.
        if total_reward > best_reward and is_valid_price(action):
            best_reward = total_reward
            torch.save(agent.actor.state_dict(), PPO_MODEL_SAVE)
            print("   ‚Üí L∆∞u model h·ª£p l·ªá!")

    return agent

# INFERENCE

In [None]:
def predict_tomorrow_price(agent, rpt_model, data, idx=-1):
    """Build one state from ``data`` and print/return the agent's proposed prices.

    Args:
        agent: Object exposing ``select_action(state) -> (prices, log_prob)``.
        rpt_model: Demand model forwarded to ``predict_demand``.
        data: Mapping with the arrays ``'X'`` and ``'dates'``.
        idx: Index that closes the history window. The WINDOW_SIZE entries
            before it feed the demand model and entry ``idx - 1`` supplies the
            remaining state features. Defaults to -1.

    Returns:
        The length-3 price array (single, double, vip) chosen by the agent.

    Raises:
        ValueError: If fewer than WINDOW_SIZE entries precede ``idx``.

    NOTE(review): ``agent.select_action(state)`` samples from the policy, so
    repeated calls can return different prices — confirm that is intended for
    inference.
    NOTE(review): with the default ``idx=-1`` the newest entry ``X[-1]`` is not
    part of the state, while the printed date is ``dates[idx]`` plus one day —
    confirm the date label matches the day being priced.
    """
    X = data['X']
    window = X[idx-WINDOW_SIZE:idx]
    # A short or empty slice would otherwise be handed to the demand model silently.
    if len(window) != WINDOW_SIZE:
        raise ValueError(
            f"idx={idx} leaves {len(window)} history entries, need {WINDOW_SIZE}"
        )
    last_feat = window[:, :, :19]
    last_news = window[:, 0, -45:]

    demand = predict_demand(rpt_model, last_feat, last_news, DEVICE)
    # Explicit key order, identical to PricingEnv._get_state, so the state layout
    # does not depend on the insertion order of the returned dict.
    demand_vec = np.array([demand['single'], demand['double'], demand['vip']])

    today = X[idx-1]
    env = today[0, 0, 19:27]
    available = today[:, :, 1].sum(0)
    news_emb = today[0, 0, -45:]
    # No revenue history is available outside an episode, so that slice is zeros.
    rev_hist = np.zeros(10)
    current_rev = np.sum(today[:, :, 0] * today[:, :, 2])

    # float32, like the states the agent was trained on.
    state = np.concatenate(
        [demand_vec, env, available, news_emb, rev_hist, [current_rev]]
    ).astype(np.float32)

    action, _ = agent.select_action(state)
    tomorrow = pd.to_datetime(data['dates'][idx]) + pd.Timedelta(days=1)

    print(f"\n=== GI√Å ƒê·ªÄ XU·∫§T NG√ÄY {tomorrow:%d/%m/%Y} ===")
    print(f"   ‚Ä¢ Single: {action[0]:,.0f} VND")
    print(f"   ‚Ä¢ Double: {action[1]:,.0f} VND")
    print(f"   ‚Ä¢ VIP:    {action[2]:,.0f} VND")
    print(f"   ‚Ä¢ T·ª∑ l·ªá: D/S = {action[1]/action[0]:.2f}, V/S = {action[2]/action[0]:.2f}")
    print(f"   ‚Ä¢ D·ª± b√°o ƒë·∫∑t: {sum(demand.values())} ph√≤ng")

    return action

# Main

In [None]:
if __name__ == "__main__":
    # Train first; train_ppo returns the agent holding the last episode's weights.
    agent = train_ppo()

    # Reload the data and the RPT demand model for inference.
    data = np.load(PREPROCESSED_NPZ, allow_pickle=True)
    rpt_model = RPTModel().to(DEVICE)
    # map_location lets checkpoints saved on a GPU load on a CPU-only runtime.
    rpt_model.load_state_dict(torch.load(RPT_MODEL_PATH, map_location=DEVICE))
    rpt_model.eval()

    # Replace the last-episode actor weights with the best checkpoint on disk.
    # NOTE(review): train_ppo writes PPO_MODEL_SAVE only when an episode improves on the
    # best reward with a valid final action; if that never happens in this run, the file
    # is either missing or left over from an earlier run — confirm before trusting it.
    agent.actor.load_state_dict(torch.load(PPO_MODEL_SAVE, map_location=DEVICE))

    predict_tomorrow_price(agent, rpt_model, data)

=== HU·∫§N LUY·ªÜN PPO V·ªöI PRICE PARITY & MIN/MAX ===


ValueError: too many values to unpack (expected 4)