In [None]:
!pip install nbconvert



In [None]:
# -*- coding: utf-8 -*-
"""
PPO Dynamic Pricing with Gymnasium + RPT Integrated
Tích hợp đầy đủ: RPT Demand → PPO Pricing → Giá đề xuất
Không phụ thuộc file rpt.py / news_embeding.py
"""

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pickle
import json
import pandas as pd
from datetime import datetime, timedelta
import os
import warnings
from typing import Tuple
from collections import deque
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
!git clone https://github.com/FongNgoo/Basic_Dynamic_Prices_base_on_Demand_Model.git
%cd Basic_Dynamic_Prices_base_on_Demand_Model
warnings.filterwarnings("ignore")

Cloning into 'Basic_Dynamic_Prices_base_on_Demand_Model'...
remote: Enumerating objects: 205, done.[K
remote: Counting objects: 100% (205/205), done.[K
remote: Compressing objects: 100% (156/156), done.[K
remote: Total 205 (delta 54), reused 198 (delta 47), pack-reused 0 (from 0)[K
Receiving objects: 100% (205/205), 12.08 MiB | 18.66 MiB/s, done.
Resolving deltas: 100% (54/54), done.
/content/Basic_Dynamic_Prices_base_on_Demand_Model/Basic_Dynamic_Prices_base_on_Demand_Model/Basic_Dynamic_Prices_base_on_Demand_Model/Basic_Dynamic_Prices_base_on_Demand_Model/Basic_Dynamic_Prices_base_on_Demand_Model/Basic_Dynamic_Prices_base_on_Demand_Model


# Config Set Up

In [None]:
# Every artefact read or written by this notebook lives in one Drive folder.
_OUTPUT_DIR = "/content/drive/MyDrive/Colab_Notebooks/Basic_Dynamic_Prices_base_on_Demand_Model/Output"

RPT_MODEL_PATH = _OUTPUT_DIR + "/rpt_demand_best.pth"      # RPT demand-model checkpoint
PREPROCESSED_NPZ = _OUTPUT_DIR + "/preprocessed_data.npz"  # preprocessed arrays
SCALERS_PKL = _OUTPUT_DIR + "/scalers.pkl"                 # pickled scalers
PPO_MODEL_SAVE = _OUTPUT_DIR + "/ppo_pricing_best.pth"     # where the best PPO actor is saved

## PPO hyperparameters

In [None]:
# Layout of the state vector, in order:
#   demand forecast (3) + env features (8) + available rooms (3)
#   + news embedding (40) + revenue history (10) + current revenue (1) = 65
STATE_DIM = sum((3, 8, 3, 40, 10, 1))
ACTION_DIM = 3                   # one price per room type
HIDDEN_DIM = 256                 # width of the actor / critic hidden layers
PPO_EPOCHS = 4                   # optimisation passes per update
CLIP_EPS = 0.2                   # PPO ratio clipping range
GAMMA = 0.99                     # discount factor
LAMBDA = 0.95                    # GAE lambda
BATCH_SIZE = 32
LEARNING_RATE = 0.0003
MAX_GRAD_NORM = 0.5              # gradient-norm clipping threshold
PARITY_PENALTY = 10_000_000.0    # very heavy penalty when a price rule is violated

## Price Constraints



In [None]:
# Absolute price floor and ceiling for each room type.
MIN_PRICE = dict(single=800_000, double=1_200_000, vip=2_500_000)
MAX_PRICE = dict(single=3_000_000, double=5_000_000, vip=10_000_000)

# Allowed (low, high) range of each price ratio relative to the single room.
PRICE_PARITY = dict(
    double_single_ratio=(1.3, 1.8),
    vip_single_ratio=(2.0, 3.0),
)

WINDOW_SIZE = 60  # length of the sliding history window

# Run on the GPU when one is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

In [None]:
# Load the scalers stored in the pickle file at SCALERS_PKL.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a scalers file that you produced yourself.
with open(SCALERS_PKL, 'rb') as f:
    scalers = pickle.load(f)
price_scaler = scalers['price_scaler']       # fixed: the key is 'price_scaler', not 'price'
revenue_scaler = scalers['revenue_scaler']
solds_scaler = scalers['solds_scaler']

# Import RPT
# RPT.ipynb is converted to RPT.py first so that it can be imported as a
# module; the conversion must therefore run before the import statements.
# NOTE(review): the star import presumably provides RPTModel and
# predict_demand, which later cells use — verify against RPT.ipynb.
!jupyter nbconvert --to python RPT.ipynb
import RPT
from RPT import *
print("IMPORT QUA GIT THÀNH CÔNG!")

[NbConvertApp] Converting notebook RPT.ipynb to python
[NbConvertApp] Writing 10478 bytes to RPT.py
[AUTO] h = 144
   → h % (R*3) = 0 = 0
   → h % num_heads = 0 = 0
[AUTO] h = 144 → per_room: 48, per_kernel: 16, head_dim: 18


KeyError: 'price'

# GIÁ HỢP LỆ (MIN/MAX + PARITY)

In [None]:
def is_valid_price(prices: np.ndarray) -> bool:
    """Check a (single, double, vip) price triple against every pricing rule.

    Args:
        prices: Sequence of three prices ordered single, double, vip.

    Returns:
        True only when each price lies inside its MIN_PRICE / MAX_PRICE range
        and both the double/single and vip/single ratios lie inside the
        ranges configured in PRICE_PARITY.
    """
    single, double, vip = prices

    # Rule 1: every price must respect its absolute floor and ceiling.
    within_bounds = all(
        MIN_PRICE[room] <= price <= MAX_PRICE[room]
        for room, price in (('single', single), ('double', double), ('vip', vip))
    )
    if not within_bounds:
        return False

    # Rule 2: ratios relative to the single room must stay inside the parity bands.
    ds_low, ds_high = PRICE_PARITY['double_single_ratio']
    vs_low, vs_high = PRICE_PARITY['vip_single_ratio']
    double_ok = ds_low <= double / single <= ds_high
    vip_ok = vs_low <= vip / single <= vs_high
    return bool(double_ok and vip_ok)

def project_to_valid(prices: np.ndarray) -> np.ndarray:
    """Project a (single, double, vip) price vector onto the valid region.

    The valid region is defined by the absolute MIN_PRICE / MAX_PRICE bounds
    and by the PRICE_PARITY ratio bands relative to the single-room price.

    The single price is first restricted to the range for which a double and
    a vip price satisfying BOTH their absolute bounds and their ratio band
    exist. Without this step a low single price (e.g. the single minimum)
    makes the vip lower bound exceed its upper bound, and ``np.clip`` then
    returns a vip price below ``MIN_PRICE['vip']``.

    Args:
        prices: Array-like of three prices ordered single, double, vip.

    Returns:
        A new float array with the projected prices; the input is not modified.

    Raises:
        ValueError: If the configured bounds and ratio bands admit no valid
            single price at all.
    """
    p = np.array(prices, dtype=float)  # always work on a float copy

    ds_lo, ds_hi = PRICE_PARITY['double_single_ratio']
    vs_lo, vs_hi = PRICE_PARITY['vip_single_ratio']

    # Range of single prices for which the double and vip intervals are non-empty:
    #   ratio_hi * p_s >= MIN_other  and  ratio_lo * p_s <= MAX_other
    single_lo = max(MIN_PRICE['single'],
                    MIN_PRICE['double'] / ds_hi,
                    MIN_PRICE['vip'] / vs_hi)
    single_hi = min(MAX_PRICE['single'],
                    MAX_PRICE['double'] / ds_lo,
                    MAX_PRICE['vip'] / vs_lo)
    if single_lo > single_hi:
        raise ValueError("MIN_PRICE / MAX_PRICE / PRICE_PARITY admit no valid price combination")

    p[0] = np.clip(p[0], single_lo, single_hi)
    p_s = p[0]

    # Adjust double: intersection of absolute bounds and ratio band.
    min_d = max(MIN_PRICE['double'], ds_lo * p_s)
    max_d = min(MAX_PRICE['double'], ds_hi * p_s)
    p[1] = np.clip(p[1], min_d, max_d)

    # Adjust vip: intersection of absolute bounds and ratio band.
    min_v = max(MIN_PRICE['vip'], vs_lo * p_s)
    max_v = min(MAX_PRICE['vip'], vs_hi * p_s)
    p[2] = np.clip(p[2], min_v, max_v)

    return p

# PPO ACTOR-CRITIC VỚI ACTION MASKING

In [None]:
class PPOActor(nn.Module):
    """Gaussian policy network.

    Maps a state of size STATE_DIM to the mean and standard deviation of a
    Normal distribution over ACTION_DIM normalised actions.
    """

    def __init__(self):
        super().__init__()
        # Two tanh-activated hidden layers feeding the mean head.
        trunk_layers = [
            nn.Linear(STATE_DIM, HIDDEN_DIM),
            nn.Tanh(),
            nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*trunk_layers)
        self.mu_head = nn.Linear(HIDDEN_DIM, ACTION_DIM)
        # State-independent log standard deviation, initialised to zero.
        self.log_std = nn.Parameter(torch.zeros(ACTION_DIM))

    def forward(self, x):
        """Return ``(mean, std)``; the mean is squashed into [-1, 1] by tanh."""
        features = self.net(x)
        mean = self.mu_head(features).tanh()
        # Clamp the log-std before exponentiating to keep the std in a finite range.
        bounded_log_std = torch.clamp(self.log_std, min=-20, max=2)
        return mean, bounded_log_std.exp()

In [None]:
class PPOCritic(nn.Module):
    """State-value network: maps a state of size STATE_DIM to one scalar."""

    def __init__(self):
        super().__init__()
        value_layers = [
            nn.Linear(STATE_DIM, HIDDEN_DIM),
            nn.Tanh(),
            nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
            nn.Tanh(),
            nn.Linear(HIDDEN_DIM, 1),
        ]
        self.net = nn.Sequential(*value_layers)

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1."""
        state_value = self.net(x)
        return state_value

In [None]:
class PPOAgent:
    """PPO agent that proposes (single, double, vip) prices.

    The actor works in a normalised action space where each price is mapped
    linearly from [MIN_PRICE, MAX_PRICE] to [-1, 1]. Sampled actions are
    projected onto the valid price region before being returned.
    """

    def __init__(self):
        self.actor = PPOActor().to(DEVICE)
        self.critic = PPOCritic().to(DEVICE)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=LEARNING_RATE)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=LEARNING_RATE)

    @staticmethod
    def _price_bounds() -> Tuple[np.ndarray, np.ndarray]:
        """Return the (mins, maxs) price arrays ordered single, double, vip."""
        order = ('single', 'double', 'vip')
        mins = np.array([MIN_PRICE[k] for k in order])
        maxs = np.array([MAX_PRICE[k] for k in order])
        return mins, maxs

    def _normalize(self, prices: np.ndarray) -> np.ndarray:
        """Map raw prices linearly from [min, max] to [-1, 1]."""
        mins, maxs = self._price_bounds()
        return 2 * (prices - mins) / (maxs - mins + 1e-8) - 1

    def _denormalize(self, norm_prices: np.ndarray) -> np.ndarray:
        """Map normalised actions from [-1, 1] back to raw prices."""
        mins, maxs = self._price_bounds()
        return 0.5 * (norm_prices + 1) * (maxs - mins) + mins

    def select_action(self, state: np.ndarray) -> Tuple[np.ndarray, torch.Tensor]:
        """Sample a price vector for ``state``.

        Args:
            state: 1-D state vector of length STATE_DIM.

        Returns:
            ``(action_raw, log_prob)`` where ``action_raw`` is the projected
            (valid) price vector and ``log_prob`` is a detached tensor of
            shape (1,) holding the log-probability of the projected action
            under the current policy.
        """
        state_t = torch.as_tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)

        # No gradients are needed while collecting experience.
        with torch.no_grad():
            mu, std = self.actor(state_t)
            dist = torch.distributions.Normal(mu, std)
            action_norm = dist.sample()

            action_raw = self._denormalize(action_norm.cpu().numpy()[0])
            action_raw = project_to_valid(action_raw)  # enforce min/max + parity
            action_norm_final = torch.as_tensor(
                self._normalize(action_raw), dtype=torch.float32, device=DEVICE
            ).unsqueeze(0)

            # The stored log-prob is that of the action actually executed
            # (after projection), matching what update() re-evaluates.
            log_prob_final = dist.log_prob(action_norm_final).sum(-1)

        return action_raw, log_prob_final.detach()

    def update(self, memory):
        """Run PPO_EPOCHS clipped-surrogate updates on the collected memory.

        Args:
            memory: Sequence of tuples
                ``(state, raw_action, old_log_prob, advantage, return)``.
                Nothing happens when it holds fewer than BATCH_SIZE items.
        """
        if len(memory) < BATCH_SIZE:
            return

        def as_tensor(values):
            return torch.as_tensor(np.array(values), dtype=torch.float32, device=DEVICE)

        states = as_tensor([m[0] for m in memory])                      # (N, STATE_DIM)
        actions = as_tensor([self._normalize(m[1]) for m in memory])    # (N, ACTION_DIM)
        old_log_probs = as_tensor([m[2] for m in memory])               # (N,)
        advantages = as_tensor([m[3] for m in memory])                  # (N,)
        returns = as_tensor([m[4] for m in memory])                     # (N,)

        for _ in range(PPO_EPOCHS):
            mu, std = self.actor(states)
            dist = torch.distributions.Normal(mu, std)
            # Keep everything 1-D of shape (N,). With keepdim=True the (N, 1)
            # log-probs broadcast against the (N,) old log-probs / advantages
            # into an (N, N) matrix, pairing every sample with every other one.
            new_log_probs = dist.log_prob(actions).sum(-1)
            entropy = dist.entropy().sum(-1)

            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS) * advantages
            actor_loss = -torch.min(surr1, surr2).mean() - 0.01 * entropy.mean()

            self.actor_optim.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), MAX_GRAD_NORM)
            self.actor_optim.step()

            # squeeze(-1): the critic outputs (N, 1) while returns is (N,).
            value_pred = self.critic(states).squeeze(-1)
            critic_loss = ((value_pred - returns) ** 2).mean()
            self.critic_optim.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), MAX_GRAD_NORM)
            self.critic_optim.step()

# ENVIRONMENT

In [None]:
class PricingEnv:
    """Replay environment that scores proposed prices against recorded sales.

    Each step uses the sample at the current index of ``X_data``: the reward
    is the proposed prices multiplied by the rooms recorded as sold on the
    last day of that sample, minus penalties.

    Args:
        X_data: Windowed feature array, indexed as X[sample][day, room, feature]
            (original author's note: shape (N, 60, 3, 24)).
        news_emb_data: News embeddings, indexed as news[sample][day]
            (original author's note: shape (N, 60, 40)).
        dates: Date label for each sample.
        rpt_model: Demand model passed to ``predict_demand``.
    """

    def __init__(self, X_data, news_emb_data, dates, rpt_model):
        self.X = X_data
        self.news_emb = news_emb_data
        self.dates = dates
        self.rpt_model = rpt_model
        self.capacity = np.array([30, 20, 10])  # Single, Double, VIP
        self.idx = 0
        self.revenue_history = deque(maxlen=10)  # only the 10 most recent revenues
        self._forecast_cache = None  # (idx, forecast dict) of the latest demand forecast

    def reset(self):
        """Restart the episode at index WINDOW_SIZE - 1 and return the first state."""
        self.idx = WINDOW_SIZE - 1
        self.revenue_history.clear()
        return self._get_state()

    def _forecast_demand(self, idx):
        """Return the demand forecast dict for sample ``idx``.

        The latest forecast is memoised so that ``step`` can reuse the
        forecast already computed by ``_get_state`` instead of running the
        demand model again.
        """
        if self._forecast_cache is None or self._forecast_cache[0] != idx:
            hist_feat = self.X[idx][:, :, :19]  # first 19 features of every day / room
            forecast = predict_demand(self.rpt_model, hist_feat, self.news_emb[idx], DEVICE)
            self._forecast_cache = (idx, forecast)
        return self._forecast_cache[1]

    def _get_state(self):
        """Build the STATE_DIM-long state vector for the current index."""
        x_sample = self.X[self.idx]
        news_sample = self.news_emb[self.idx]

        # Demand forecast for the current sample.
        demand_pred = self._forecast_demand(self.idx)
        demand_vec = np.array([demand_pred['single'], demand_pred['double'], demand_pred['vip']])

        # NOTE(review): if the feature axis really has 24 entries (as the
        # original shape notes say), 19:27 yields 5 values rather than 8 and
        # the size assertion below fails — confirm the feature count.
        env_features = x_sample[-1, 0, 19:27]            # env features of the last day
        available_rooms = x_sample[:, :, 1].sum(axis=0)  # feature 1 summed over the window, per room
        news_today = news_sample[-1]                     # embedding of the last day

        # Revenue history of the last 10 steps, left-padded with zeros.
        rev_hist = np.array(list(self.revenue_history))
        rev_hist = np.pad(rev_hist, (10 - len(rev_hist), 0), constant_values=0)

        # Sum over the window of feature 0 times feature 2.
        current_revenue = np.sum(x_sample[:, :, 0] * x_sample[:, :, 2])

        state = np.concatenate([
            demand_vec,          # 3
            env_features,        # 8
            available_rooms,     # 3
            news_today,          # 40
            rev_hist,            # 10
            [current_revenue]    # 1
        ]).astype(np.float32)

        assert state.shape[0] == STATE_DIM, f"State shape sai: {state.shape}"
        return state

    def step(self, action_prices: np.ndarray):
        """Apply ``action_prices`` to the current sample and advance one step.

        Returns:
            ``(next_state, reward, done, info)``; ``info`` holds the date,
            revenue, prices, rooms sold and the demand forecast of the step
            that was just scored.
        """
        today_data = self.X[self.idx]
        sold_today = today_data[-1, :, 2]        # feature 2 of the last day, per room
        available_today = today_data[-1, :, 1]   # feature 1 of the last day, per room
        actual_sold = np.minimum(sold_today, available_today + sold_today)
        revenue = np.sum(action_prices * actual_sold)

        # The reward is the revenue obtained with the proposed prices.
        reward = revenue

        # Heavy penalty when the prices violate the bounds or the parity bands.
        if not is_valid_price(action_prices):
            reward -= PARITY_PENALTY

        # Light penalty for rooms sold beyond capacity.
        overbook = np.maximum(0, sold_today - self.capacity).sum()
        reward -= overbook * 5000

        # Forecast for the step being scored; already cached by _get_state,
        # so the demand model is not run again here.
        demand_now = self._forecast_demand(self.idx)
        step_date = self.dates[self.idx]

        self.revenue_history.append(revenue)
        self.idx += 1
        done = self.idx >= len(self.X) - 1

        info = {
            "date": step_date,
            "revenue": revenue,
            "prices": action_prices.tolist(),
            "sold": sold_today.tolist(),
            "demand_pred": [demand_now['single'], demand_now['double'], demand_now['vip']]
        }

        return self._get_state(), reward, done, info

# TRAINING

In [None]:
def plot_ppo_results(rewards_history, prices_history, demand_history, best_epoch):
    """Draw a three-panel summary of a PPO training run.

    Args:
        rewards_history: Total reward of each episode.
        prices_history: Price vectors; only the last one is plotted.
        demand_history: Demand forecasts; only the last one is plotted.
        best_epoch: X position (episode index) at which the best reward is marked.
    """
    fig, (ax_reward, ax_price, ax_demand) = plt.subplots(1, 3, figsize=(15, 5))
    room_types = ['Single', 'Double', 'VIP']
    top_reward = max(rewards_history)

    # Panel 1: reward per episode, with the best episode highlighted.
    ax_reward.plot(rewards_history, label='Reward', color='#1f77b4', linewidth=2)
    ax_reward.axvline(best_epoch, color='green', linestyle='--', label=f'Best: {top_reward:,.0f}')
    ax_reward.scatter(best_epoch, top_reward, color='red', s=120)
    ax_reward.set_title('Reward Over Episodes', fontsize=14, fontweight='bold')
    ax_reward.set_xlabel('Episode')
    ax_reward.set_ylabel('Reward')
    ax_reward.grid(True)
    ax_reward.legend()

    # Panel 2: the final recommended prices.
    final_prices = prices_history[-1]
    ax_price.bar(room_types, final_prices, color='#ff7f0e', alpha=0.8)
    ax_price.set_title('Giá Đề Xuất Cuối Cùng (VND)', fontsize=14, fontweight='bold')
    ax_price.set_ylabel('Giá')
    ax_price.grid(True, axis='y')

    # Panel 3: forecast demand next to room capacity.
    positions = np.arange(3)
    bar_width = 0.35
    room_capacity = [30, 20, 10]
    final_demand = demand_history[-1]
    ax_demand.bar(positions - bar_width / 2, room_capacity, bar_width,
                  label='Công suất (60 tổng)', color='#2ca02c')
    ax_demand.bar(positions + bar_width / 2, final_demand, bar_width,
                  label='Demand dự báo', color='#d62728')
    ax_demand.set_xticks(positions)
    ax_demand.set_xticklabels(room_types)
    ax_demand.set_ylabel('Số phòng')
    ax_demand.set_title('Demand vs Công suất (31/12/2025)', fontsize=14, fontweight='bold')
    ax_demand.legend()
    ax_demand.grid(True, axis='y')

    fig.suptitle('PPO Dynamic Pricing Kết Quả (2025)', fontsize=18, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
def train_ppo(num_episodes: int = 50):
    """Train the PPO pricing agent on the recorded data and return it.

    Loads the RPT demand model and the preprocessed arrays, runs
    ``num_episodes`` passes over the environment, updates the agent after
    every episode with GAE advantages, saves the actor whenever an episode
    achieves a new best total reward with a valid final price, and finally
    plots the training summary.

    Args:
        num_episodes: Number of training episodes (default 50).

    Returns:
        The trained PPOAgent.
    """
    print("=== HUẤN LUYỆN PPO VỚI PRICE PARITY & MIN/MAX ===")
    rpt_model = RPTModel().to(DEVICE)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
    rpt_model.load_state_dict(torch.load(RPT_MODEL_PATH, map_location=DEVICE))
    rpt_model.eval()

    # NOTE(review): allow_pickle=True unpickles objects from the file; only
    # load archives you produced yourself.
    data = np.load(PREPROCESSED_NPZ, allow_pickle=True)
    X, dates, news_emb_all = data['X'], data['dates'], data['news_emb']

    env = PricingEnv(X, news_emb_all, dates, rpt_model)
    agent = PPOAgent()

    # Per-episode histories consumed by plot_ppo_results.
    rewards_history, prices_history, demand_history = [], [], []
    best_reward = -float('inf')
    best_epoch = None

    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        states, actions, log_probs, rewards, values = [], [], [], [], []

        while True:
            action, log_prob = agent.select_action(state)
            with torch.no_grad():
                state_t = torch.as_tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)
                value = agent.critic(state_t).item()  # V(s_t), the critic's estimate
            next_state, reward, done, info = env.step(action)

            states.append(state)
            actions.append(action)
            log_probs.append(log_prob.item())
            rewards.append(reward)
            values.append(value)

            total_reward += reward
            state = next_state
            if done:
                break

        # Generalised Advantage Estimation, walking the episode backwards.
        # The episode ends at the last step, so the bootstrap value is 0.
        n_steps = len(rewards)
        advantages = [0.0] * n_steps
        gae = 0.0
        next_value = 0.0
        for i in reversed(range(n_steps)):
            delta = rewards[i] + GAMMA * next_value - values[i]
            gae = delta + GAMMA * LAMBDA * gae
            advantages[i] = gae
            next_value = values[i]
        returns = [a + v for a, v in zip(advantages, values)]

        # Tuple layout expected by PPOAgent.update:
        # (state, raw_action, old_log_prob, advantage, return)
        memory = list(zip(states, actions, log_probs, advantages, returns))
        agent.update(memory)

        rewards_history.append(total_reward)
        prices_history.append(np.array(action, copy=True))
        demand_history.append(info["demand_pred"])

        print(f"Ep {ep+1:2d} | Reward: {total_reward:,.0f} | "
              f"Giá: {action[0]:,.0f} | {action[1]:,.0f} | {action[2]:,.0f}")

        if total_reward > best_reward and is_valid_price(action):
            best_reward = total_reward
            best_epoch = ep
            torch.save(agent.actor.state_dict(), PPO_MODEL_SAVE)
            print("   → Lưu model hợp lệ!")

    print(f"BEST REWARD: {best_reward:,.0f}")
    if rewards_history:
        if best_epoch is None:
            # No episode was saved; mark the highest-reward episode instead.
            best_epoch = int(np.argmax(rewards_history))
        plot_ppo_results(rewards_history, prices_history, demand_history, best_epoch)
    return agent

# INFERENCE

In [None]:
def predict_tomorrow_price(agent, rpt_model, data, idx=-1):
    """Recommend prices for the day after the sample at ``idx``.

    Builds the same state vector as the training environment from the sample
    at ``idx``, asks the agent for an action, and prints the recommendation.

    Args:
        agent: Trained agent exposing ``select_action(state)``.
        rpt_model: Demand model passed to ``predict_demand``.
        data: Mapping with the keys 'X', 'news_emb' and 'dates'.
        idx: Index of the sample to use (default: the last one).

    Returns:
        The recommended (single, double, vip) price array.
    """
    X = data['X']
    news_emb_all = data['news_emb']
    dates = data['dates']

    # Data of the selected sample (history window ending at the current day).
    sample_x = X[idx]
    sample_news = news_emb_all[idx]

    # Demand forecast from the first 19 features of the window.
    hist_features = sample_x[:, :, :19]
    tomorrow_demand = predict_demand(rpt_model, hist_features, sample_news, DEVICE)

    demand_vec = np.array([
        tomorrow_demand['single'],
        tomorrow_demand['double'],
        tomorrow_demand['vip']
    ])

    # State components; these must mirror the training environment exactly.
    # NOTE(review): if the feature axis has 24 entries, 19:27 yields 5 values
    # rather than 8 and the shape assertion below fails — confirm.
    env_features = sample_x[-1, 0, 19:27]
    available_rooms = sample_x[:, :, 1].sum(axis=0)

    # News embedding of the last day in the window.
    news_today = sample_news[-1]

    # Revenue history: use the agent's if it exposes one, otherwise zeros.
    # (getattr instead of a bare except, so real errors are not swallowed.)
    recent_revenue = np.asarray(list(getattr(agent, 'revenue_history', ())), dtype=float)[-10:]
    rev_hist = np.pad(recent_revenue, (10 - len(recent_revenue), 0), constant_values=0)

    # Sum over the window of feature 0 times feature 2.
    current_revenue = np.sum(sample_x[:, :, 0] * sample_x[:, :, 2])

    state = np.concatenate([
        demand_vec,       # 3
        env_features,     # 8
        available_rooms,  # 3
        news_today,       # 40
        rev_hist,         # 10
        [current_revenue] # 1
    ]).astype(np.float32)

    assert state.shape == (65,), f"State shape sai: {state.shape}"

    # NOTE(review): select_action samples from the policy, so repeated calls
    # may return different prices.
    action_raw, _ = agent.select_action(state)
    # Enforce the configured min/max and parity rules. The previous clip to
    # [200, 400, 600]..[500, 700, 1500] contradicted MIN_PRICE / MAX_PRICE and
    # reduced every recommendation to a few hundred VND.
    action_raw = project_to_valid(action_raw)

    tomorrow_date = pd.to_datetime(dates[idx]) + pd.Timedelta(days=1)

    print(f"\n{'='*50}")
    print(f"   GIÁ ĐỀ XUẤT NGÀY {tomorrow_date.strftime('%d/%m/%Y')} (CAO ĐIỂM TẾT)")
    print(f"{'='*50}")
    print(f"   • Single : {action_raw[0]:,.0f} VND")
    print(f"   • Double : {action_raw[1]:,.0f} VND")
    print(f"   • VIP    : {action_raw[2]:,.0f} VND")
    print(f"   • Tỷ lệ  : D/S = {action_raw[1]/action_raw[0]:.2f} | V/S = {action_raw[2]/action_raw[0]:.2f}")
    print(f"   • Demand : {sum(tomorrow_demand.values()):.0f} phòng (vượt {sum(tomorrow_demand.values())-60:.0f} so với công suất 60)")
    print(f"{'='*50}")

    return action_raw

# Main

In [None]:
if __name__ == "__main__":
    # Train, then reload the data and the demand model for inference.
    agent = train_ppo()
    data = np.load(PREPROCESSED_NPZ, allow_pickle=True)
    rpt_model = RPTModel().to(DEVICE)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
    rpt_model.load_state_dict(torch.load(RPT_MODEL_PATH, map_location=DEVICE))
    rpt_model.eval()

    # Load the best PPO actor. The checkpoint is only written when an episode
    # produced a new best reward with a valid price, so it may not exist; in
    # that case keep the weights the agent has after training.
    if os.path.exists(PPO_MODEL_SAVE):
        agent.actor.load_state_dict(torch.load(PPO_MODEL_SAVE, map_location=DEVICE))

    predict_tomorrow_price(agent, rpt_model, data)