In [1]:
!pip install pandas openpyxl --quiet

DATASET PREPARATION

In [2]:

import pandas as pd

products_df = pd.read_excel('/content/synthetic_products.xlsx')
promotions_df = pd.read_excel('/content/synthetic_promotions.xlsx')
users_df = pd.read_excel('/content/synthetic_users.xlsx')
transactions_df = pd.read_excel('/content/synthetic_transactions.xlsx')

def parse_list(list_str):
    try:
        return eval(list_str.replace("'", ""))
    except:
        return []

promotions_df['eligible_users'] = promotions_df['eligible_users'].apply(parse_list)
promotions_df['products_covered'] = promotions_df['products_covered'].apply(parse_list)

full_df = (
    transactions_df
    .merge(users_df, on='user_id')
    .merge(products_df, on='product_id')
    .merge(promotions_df, left_on='promotion_used', right_on='promotion_id', how='left')
)

STATE_ATTRIBUTES = [
    'user_id',
    'product_id',
    'category',
    'quantity',
    'response',
    'churn_score',
    'loyalty_score',
    'avg_order_value',
    'price',
    'margin',
    'promotion_history',
    'age',
    'gender',
    'income',
    'segment',
    'promotion_used'
]

final_dataset = full_df[STATE_ATTRIBUTES]

final_dataset.to_csv('all_users_state_actions.csv', index=False)

print(f"Saved {len(final_dataset)} entries with {len(final_dataset.columns)} state attributes")
print("Columns order:", list(final_dataset.columns))

Saved 300 entries with 16 state attributes
Columns order: ['user_id', 'product_id', 'category', 'quantity', 'response', 'churn_score', 'loyalty_score', 'avg_order_value', 'price', 'margin', 'promotion_history', 'age', 'gender', 'income', 'segment', 'promotion_used']


DATA PRE-PROCESSING

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

df = pd.read_csv('/content/all_users_state_actions.csv')

print("Missing Values Before Processing:")
missing_values = df.isnull().sum()
percent_missing = (missing_values/len(df)) * 100
missing_table = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': percent_missing.round(2)
})
print(missing_table)

def handle_missing(data):
    cleaned_df = data.copy()

    num_cols = ['quantity', 'churn_score', 'loyalty_score',
               'avg_order_value', 'price', 'margin', 'income']
    for col in num_cols:
        if col in cleaned_df.columns:

            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())

    cat_cols = ['category', 'gender', 'segment', 'promotion_history']
    for col in cat_cols:
        if col in cleaned_df.columns:

            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].mode()[0])

    if 'promotion_used' in cleaned_df.columns:
        cleaned_df['promotion_used'] = cleaned_df['promotion_used'].fillna(0)

    cleaned_df = cleaned_df.dropna()

    return cleaned_df

cleaned_df = handle_missing(df)

print("\nMissing Values After Processing:")
print(cleaned_df.isnull().sum())

cleaned_df.to_csv('cleaned_state_actions.csv', index=False)
print("\nCleaned dataset saved as cleaned_state_actions.csv")

original_rows = len(df)
cleaned_rows = len(cleaned_df)
print(f"\nData Cleaning Report:")
print(f"Original entries: {original_rows}")
print(f"Cleaned entries: {cleaned_rows}")
print(f"Rows removed: {original_rows - cleaned_rows} ({(1-(cleaned_rows/original_rows))*100:.2f}%)")

Missing Values Before Processing:
                   Missing Values  Percentage (%)
user_id                         0            0.00
product_id                      0            0.00
category                        0            0.00
quantity                        0            0.00
response                        0            0.00
churn_score                     0            0.00
loyalty_score                   0            0.00
avg_order_value                 0            0.00
price                           0            0.00
margin                          0            0.00
promotion_history               0            0.00
age                             0            0.00
gender                          0            0.00
income                          0            0.00
segment                         0            0.00
promotion_used                 79           26.33

Missing Values After Processing:
user_id              0
product_id           0
category             0
quantity     

DATASET SPLITTING

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import files


df = pd.read_csv('/content/cleaned_state_actions.csv')  # Replace with your filename

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

!ls -l *.csv

files.download('train_data.csv')
files.download('test_data.csv')

print("Training samples:", len(train_df))
print("Test samples:", len(test_df))
print("Files saved successfully!")

-rw-r--r-- 1 root root 26175 May  8 15:18 all_users_state_actions.csv
-rw-r--r-- 1 root root 26254 May  8 15:18 cleaned_state_actions.csv
-rw-r--r-- 1 root root  5357 May  8 15:18 test_data.csv
-rw-r--r-- 1 root root 21057 May  8 15:18 train_data.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training samples: 240
Test samples: 60
Files saved successfully!


RL AGENT TRAINING (QDN ALGORITHM)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from collections import deque
import random
import math

PROMOTION_IDS = ['PR100', 'PR101', 'PR102', 'PR103', 'PR104',
                'PR105', 'PR106', 'PR107', 'PR108', 'PR109']
ACTION_SPACE_SIZE = len(PROMOTION_IDS)

class AdvancedRewardSystem:
    def __init__(self, config):
        self.weights = config['weights']
        self.segment_multipliers = config['segment_multipliers']
        self.promo_costs = {
            'PR100': 0.15, 'PR101': 0.20, 'PR102': 0.18,
            'PR103': 0.25, 'PR104': 0.22, 'PR105': 0.19,
            'PR106': 0.17, 'PR107': 0.21, 'PR108': 0.23,
            'PR109': 0.16
        }
        self.eligibility_map = {
            'Low': ['PR100', 'PR101', 'PR102'],
            'Medium': ['PR103', 'PR104', 'PR105'],
            'High': ['PR106', 'PR107', 'PR108', 'PR109']
        }

    def get_valid_actions(self, promotion_history):
        eligible = self.eligibility_map.get(promotion_history, [])
        return [PROMOTION_IDS.index(p) for p in eligible if p in PROMOTION_IDS]

    def calculate_reward(self, transaction, action):
        promo_id = PROMOTION_IDS[action]
        cost = self.promo_costs[promo_id]

        response = transaction['response']
        quantity = transaction['quantity']
        price = transaction['price']
        margin = transaction['margin'] - cost
        loyalty = transaction['loyalty_score']
        churn = transaction['churn_score']
        segment = transaction.get('segment', 'Standard')

        sales_reward = math.log1p(quantity * price) * response * 2.0
        margin_reward = margin * quantity * 0.5
        retention_reward = (1 / (1 + math.exp(-loyalty))) - math.exp(churn)
        penalty = (1 - response) * price * 0.3

        segment_mult = self.segment_multipliers.get(segment, 1.0)

        return ((self.weights['sales'] * sales_reward +
                self.weights['margin'] * margin_reward +
                self.weights['retention'] * retention_reward -
                penalty) * segment_mult)

class DQNAgent:
    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = ACTION_SPACE_SIZE
        self.memory = deque(maxlen=20000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.985
        self.epsilon_min = 0.15
        self.batch_size = 256

        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target()
        self.optimizer = optim.AdamW(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.SmoothL1Loss()

    def _build_model(self):
        return nn.Sequential(
            nn.Linear(self.state_size, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.25),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, self.action_size)
        )

    def act(self, state, valid_actions):
        if np.random.rand() <= self.epsilon:
            return random.choice(valid_actions)

        state = torch.FloatTensor(state)
        with torch.no_grad():
            q_values = self.model(state)

        valid_q = q_values[valid_actions]
        return valid_actions[torch.argmax(valid_q).item()]

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states = torch.FloatTensor(np.array([x[0] for x in minibatch]))
        actions = torch.LongTensor([x[1] for x in minibatch])
        rewards = torch.FloatTensor([x[2] for x in minibatch])
        next_states = torch.FloatTensor(np.array([x[3] for x in minibatch]))
        dones = torch.FloatTensor([x[4] for x in minibatch])

        current_q = self.model(states).gather(1, actions.unsqueeze(1))
        next_q = self.target_model(next_states).max(1)[0].detach()
        target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = self.loss_fn(current_q.squeeze(), target_q)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()

    def update_target(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

def preprocess_data(df):
    df = df.drop(['user_id', 'product_id', 'promotion_used'], axis=1, errors='ignore')
    categorical_cols = ['category', 'gender', 'segment', 'promotion_history']
    df = pd.get_dummies(df, columns=categorical_cols)
    numeric_cols = ['quantity', 'price', 'margin', 'churn_score',
                   'loyalty_score', 'avg_order_value', 'age', 'income']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = (df[col] - df[col].mean()) / df[col].std()

    return df.fillna(0)

def train_dqn_agent(df, config):
    df = df[df['promotion_used'].isin(PROMOTION_IDS)].copy()
    processed_df = preprocess_data(df)
    promotion_map = {p: i for i, p in enumerate(PROMOTION_IDS)}
    processed_df['action'] = df['promotion_used'].map(promotion_map)
    state_columns = [col for col in processed_df.columns if col != 'action']
    states = processed_df[state_columns].values.astype(np.float32)
    states = np.nan_to_num(states)
    reward_system = AdvancedRewardSystem(config)
    agent = DQNAgent(states.shape[1])
    episodes = 300
    steps_per_episode = 300

    for episode in range(episodes):
        total_reward = 0
        indices = np.random.permutation(len(processed_df))

        for step in range(steps_per_episode):
            idx = indices[step % len(processed_df)]
            row = df.iloc[idx]
            state = states[idx]
            valid_actions = reward_system.get_valid_actions(row['promotion_history'])
            if not valid_actions:
                continue
            action = agent.act(state, valid_actions)
            reward = reward_system.calculate_reward(row.to_dict(), action)
            next_idx = (idx + 1) % len(processed_df)
            next_state = states[next_idx]
            agent.remember(state, action, reward, next_state, False)


            if step % 2 == 0:
                agent.replay()

            total_reward += reward

        agent.update_target()
        agent.decay_epsilon()

        avg_reward = total_reward / steps_per_episode
        print(f"Episode {episode+1:03d}/{episodes} | Avg Reward: {avg_reward:.2f} | ε: {agent.epsilon:.3f}")

    return agent

if __name__ == "__main__":
    df = pd.read_csv('train_data.csv')

    reward_config = {
        'weights': {
            'sales': 0.4,
            'margin': 0.35,
            'retention': 0.25
        },
        'segment_multipliers': {
            'Premium': 1.6,
            'Standard': 1.1,
            'Budget': 0.7
        }
    }

    trained_agent = train_dqn_agent(df, reward_config)
    torch.save(trained_agent.model.state_dict(), 'final_promotion_model.pth')
    print("Training completed successfully.")

Episode 001/300 | Avg Reward: 37.95 | ε: 0.985
Episode 002/300 | Avg Reward: 38.04 | ε: 0.970
Episode 003/300 | Avg Reward: 37.06 | ε: 0.956
Episode 004/300 | Avg Reward: 38.24 | ε: 0.941
Episode 005/300 | Avg Reward: 37.09 | ε: 0.927
Episode 006/300 | Avg Reward: 37.65 | ε: 0.913
Episode 007/300 | Avg Reward: 37.00 | ε: 0.900
Episode 008/300 | Avg Reward: 37.41 | ε: 0.886
Episode 009/300 | Avg Reward: 38.08 | ε: 0.873
Episode 010/300 | Avg Reward: 37.43 | ε: 0.860
Episode 011/300 | Avg Reward: 37.94 | ε: 0.847
Episode 012/300 | Avg Reward: 37.65 | ε: 0.834
Episode 013/300 | Avg Reward: 37.77 | ε: 0.822
Episode 014/300 | Avg Reward: 37.88 | ε: 0.809
Episode 015/300 | Avg Reward: 37.57 | ε: 0.797
Episode 016/300 | Avg Reward: 38.15 | ε: 0.785
Episode 017/300 | Avg Reward: 37.20 | ε: 0.773
Episode 018/300 | Avg Reward: 37.26 | ε: 0.762
Episode 019/300 | Avg Reward: 38.79 | ε: 0.750
Episode 020/300 | Avg Reward: 37.97 | ε: 0.739
Episode 021/300 | Avg Reward: 37.51 | ε: 0.728
Episode 022/3

EVALUATION WITH TEST_DATA

In [11]:
import torch
import pandas as pd
import numpy as np
import math
from collections import deque
import random

PROMOTION_IDS = ['PR100', 'PR101', 'PR102', 'PR103', 'PR104',
                'PR105', 'PR106', 'PR107', 'PR108', 'PR109']

class AdvancedRewardSystem:
    def __init__(self, config):
        self.weights = config['weights']
        self.segment_multipliers = config['segment_multipliers']
        self.promo_costs = {
            'PR100': 0.15, 'PR101': 0.20, 'PR102': 0.18,
            'PR103': 0.25, 'PR104': 0.22, 'PR105': 0.19,
            'PR106': 0.17, 'PR107': 0.21, 'PR108': 0.23,
            'PR109': 0.16
        }
        self.eligibility_map = {
            'Low': ['PR100', 'PR101', 'PR102'],
            'Medium': ['PR103', 'PR104', 'PR105'],
            'High': ['PR106', 'PR107', 'PR108', 'PR109']
        }

    def get_valid_actions(self, promotion_history):
        eligible = self.eligibility_map.get(promotion_history, [])
        return [PROMOTION_IDS.index(p) for p in eligible if p in PROMOTION_IDS]

    def calculate_reward(self, transaction, action):
        promo_id = PROMOTION_IDS[action]
        cost = self.promo_costs[promo_id]

        response = transaction['response']
        quantity = transaction['quantity']
        price = transaction['price']
        margin = transaction['margin'] - cost
        loyalty = transaction['loyalty_score']
        churn = transaction['churn_score']
        segment = transaction.get('segment', 'Standard')

        sales_reward = math.log1p(quantity * price) * response * 2.0
        margin_reward = margin * quantity * 0.5
        retention_reward = (1 / (1 + math.exp(-loyalty))) - math.exp(churn)
        penalty = (1 - response) * price * 0.3

        segment_mult = self.segment_multipliers.get(segment, 1.0)
        return ((self.weights['sales'] * sales_reward +
                self.weights['margin'] * margin_reward +
                self.weights['retention'] * retention_reward -
                penalty) * segment_mult)

class DQNAgent:
    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = len(PROMOTION_IDS)
        self.memory = deque(maxlen=20000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.985
        self.epsilon_min = 0.15
        self.batch_size = 256

        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target()
        self.optimizer = optim.AdamW(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.SmoothL1Loss()

    def _build_model(self):
        return nn.Sequential(
            nn.Linear(self.state_size, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.25),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, self.action_size)
        )

    def update_target(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state, valid_actions):
        if np.random.rand() <= self.epsilon:
            return random.choice(valid_actions)

        state = torch.FloatTensor(state)
        with torch.no_grad():
            q_values = self.model(state)

        valid_q = q_values[valid_actions]
        return valid_actions[torch.argmax(valid_q).item()]

def preprocess_data(df, train_stats=None):
    df = df.drop(['user_id', 'product_id', 'promotion_used'], axis=1, errors='ignore')
    categorical_cols = ['category', 'gender', 'segment', 'promotion_history']
    df = pd.get_dummies(df, columns=categorical_cols)
    numeric_cols = ['quantity', 'price', 'margin', 'churn_score',
                   'loyalty_score', 'avg_order_value', 'age', 'income']

    if train_stats:
        means = train_stats['means']
        stds = train_stats['stds']
    else:
        means = df[numeric_cols].mean()
        stds = df[numeric_cols].std()

    for col in numeric_cols:
        if col in df.columns:
            df[col] = (df[col] - means[col]) / stds[col]

    return df.fillna(0), (means, stds) if train_stats is None else None

def evaluate_model(test_path, model_path, reward_config, train_stats):
    df_test = pd.read_csv(test_path)
    df_test = df_test[df_test['promotion_used'].isin(PROMOTION_IDS)].copy()
    processed_test, _ = preprocess_data(df_test, train_stats)
    state_columns = [col for col in processed_test.columns if col != 'action']
    states_test = processed_test[state_columns].values.astype(np.float32)
    states_test = np.nan_to_num(states_test)
    reward_system = AdvancedRewardSystem(reward_config)
    agent = DQNAgent(states_test.shape[1])
    agent.model.load_state_dict(torch.load(model_path))
    agent.model.eval()
    agent.epsilon = 0.0
    total_reward = 0
    correct_predictions = 0
    action_counts = {p: 0 for p in PROMOTION_IDS}

    for idx in range(len(df_test)):
        row = df_test.iloc[idx]
        state = states_test[idx]
        valid_actions = reward_system.get_valid_actions(row['promotion_history'])
        if not valid_actions:
            continue
        action = agent.act(state, valid_actions)
        promo_used = PROMOTION_IDS[action]
        action_counts[promo_used] += 1
        reward = reward_system.calculate_reward(row.to_dict(), action)
        total_reward += reward
        if promo_used == row['promotion_used']:
            correct_predictions += 1
    avg_reward = total_reward / len(df_test) if len(df_test) > 0 else 0
    accuracy = (correct_predictions / len(df_test)) * 100 if len(df_test) > 0 else 0

    print(f"\n{' Evaluation Results ':=^40}")
    print(f"Test Samples: {len(df_test)}")
    print(f"Total Reward: {total_reward:.2f}")
    print(f"Avg Reward/Sample: {avg_reward:.2f}")
    print(f"Accuracy vs Historical: {accuracy:.2f}%")

    print("\nAction Distribution:")
    for promo, count in action_counts.items():
        print(f"{promo}: {count} ({count/len(df_test)*100:.1f}%)")

if __name__ == "__main__":
    train_df = pd.read_csv('train_data.csv')
    _, train_stats = preprocess_data(train_df)

    reward_config = {
        'weights': {'sales': 0.4, 'margin': 0.35, 'retention': 0.25},
        'segment_multipliers': {'Premium': 1.6, 'Standard': 1.1, 'Budget': 0.7}
    }

    evaluate_model(
        test_path='test_data.csv',
        model_path='/content/final_promotion_model.pth',
        reward_config=reward_config,
        train_stats={'means': train_stats[0], 'stds': train_stats[1]}
    )


Test Samples: 43
Total Reward: 1376.91
Avg Reward/Sample: 32.02
Accuracy vs Historical: 16.28%

Action Distribution:
PR100: 0 (0.0%)
PR101: 0 (0.0%)
PR102: 16 (37.2%)
PR103: 9 (20.9%)
PR104: 0 (0.0%)
PR105: 0 (0.0%)
PR106: 16 (37.2%)
PR107: 0 (0.0%)
PR108: 1 (2.3%)
PR109: 1 (2.3%)


RL AGENT TRAINING (PPO ALGORITHM)

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
from collections import deque
import random
import math
from torch.distributions import Categorical

PROMOTION_IDS = ['PR100', 'PR101', 'PR102', 'PR103', 'PR104',
                'PR105', 'PR106', 'PR107', 'PR108', 'PR109']
ACTION_SPACE_SIZE = len(PROMOTION_IDS)

class AdvancedRewardSystem:
    def __init__(self, config):
        self.weights = config['weights']
        self.segment_multipliers = config['segment_multipliers']
        self.promo_costs = {
            'PR100': 0.15, 'PR101': 0.20, 'PR102': 0.18,
            'PR103': 0.25, 'PR104': 0.22, 'PR105': 0.19,
            'PR106': 0.17, 'PR107': 0.21, 'PR108': 0.23,
            'PR109': 0.16
        }
        self.eligibility_map = {
            'Low': ['PR100', 'PR101', 'PR102'],
            'Medium': ['PR103', 'PR104', 'PR105'],
            'High': ['PR106', 'PR107', 'PR108', 'PR109']
        }

    def get_valid_actions(self, promotion_history):
        eligible = self.eligibility_map.get(promotion_history, [])
        return [PROMOTION_IDS.index(p) for p in eligible if p in PROMOTION_IDS]

    def calculate_reward(self, transaction, action):
        promo_id = PROMOTION_IDS[action]
        cost = self.promo_costs[promo_id]

        response = transaction['response']
        quantity = transaction['quantity']
        price = transaction['price']
        margin = transaction['margin'] - cost
        loyalty = transaction['loyalty_score']
        churn = transaction['churn_score']
        segment = transaction.get('segment', 'Standard')
        sales_reward = math.log1p(quantity * price) * response * 2.0
        margin_reward = margin * quantity * 0.5
        retention_reward = (1 / (1 + math.exp(-loyalty))) - math.exp(churn)
        penalty = (1 - response) * price * 0.3
        segment_mult = self.segment_multipliers.get(segment, 1.0)

        return ((self.weights['sales'] * sales_reward +
                self.weights['margin'] * margin_reward +
                self.weights['retention'] * retention_reward -
                penalty) * segment_mult)

class RolloutBuffer:
    def __init__(self):
        self.states = []
        self.actions = []
        self.log_probs = []
        self.rewards = []
        self.values = []
        self.dones = []
        self.advantages = []
        self.returns = []
        self.valid_actions_masks = []

    def add(self, state, action, log_prob, reward, value, done, valid_actions):
        self.states.append(state)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.rewards.append(reward)
        self.values.append(value)
        self.dones.append(done)
        self.valid_actions_masks.append(valid_actions)

    def clear(self):
        self.states = []
        self.actions = []
        self.log_probs = []
        self.rewards = []
        self.values = []
        self.dones = []
        self.advantages = []
        self.returns = []
        self.valid_actions_masks = []

class PPOAgent:
    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = ACTION_SPACE_SIZE
        self.gamma = 0.99
        self.gae_lambda = 0.95
        self.eps_clip = 0.2
        self.entropy_coeff = 0.01
        self.critic_coeff = 0.5
        self.batch_size = 64
        self.n_epochs = 10
        self.actor = self._build_actor()
        self.critic = self._build_critic()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)
        self.buffer = RolloutBuffer()

    def _build_actor(self):
        return nn.Sequential(
            nn.Linear(self.state_size, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.25),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, self.action_size)
        )

    def _build_critic(self):
        return nn.Sequential(
            nn.Linear(self.state_size, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.25),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, 1)
        )

    def act(self, state, valid_actions):
        state = torch.FloatTensor(state)
        with torch.no_grad():
            logits = self.actor(state)
            valid_actions_tensor = torch.tensor(valid_actions, dtype=torch.long)
            masked_logits = torch.full_like(logits, -1e8)
            masked_logits[valid_actions_tensor] = logits[valid_actions_tensor]

            dist = Categorical(logits=masked_logits)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            value = self.critic(state)

        return action.item(), log_prob.item(), value.item()

    def evaluate(self, states, actions, valid_actions_masks):
        if valid_actions_masks is None:
            valid_actions_masks = torch.ones_like(states[:, :self.action_size])
        logits = self.actor(states)
        masked_logits = logits * valid_actions_masks + (1 - valid_actions_masks) * -1e8
        dist = Categorical(logits=masked_logits)
        log_probs = dist.log_prob(actions)
        entropy = dist.entropy().mean()
        values = self.critic(states).squeeze()
        return log_probs, values, entropy

    def compute_gae(self):
        rewards = torch.tensor(self.buffer.rewards, dtype=torch.float32)
        values = torch.tensor(self.buffer.values, dtype=torch.float32)
        dones = torch.tensor(self.buffer.dones, dtype=torch.float32)
        values = torch.cat([values, torch.zeros(1)])
        advantages = torch.zeros_like(rewards)
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * values[t+1] * (1 - dones[t]) - values[t]
            gae = delta + self.gamma * self.gae_lambda * (1 - dones[t]) * gae
            advantages[t] = gae
        returns = advantages + values[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        return advantages, returns

    def update(self):
        states_tensor = torch.FloatTensor(np.array(self.buffer.states))
        actions_tensor = torch.LongTensor(self.buffer.actions)
        old_log_probs_tensor = torch.FloatTensor(self.buffer.log_probs)
        valid_actions_masks_tensor = torch.zeros(len(self.buffer.valid_actions_masks), ACTION_SPACE_SIZE)
        for i, valid_actions in enumerate(self.buffer.valid_actions_masks):
            valid_actions_masks_tensor[i, valid_actions] = 1
        advantages_tensor, returns_tensor = self.compute_gae()
        dataset = TensorDataset(
            states_tensor,
            actions_tensor,
            old_log_probs_tensor,
            advantages_tensor,
            returns_tensor,
            valid_actions_masks_tensor
        )
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        for _ in range(self.n_epochs):
            for batch in loader:
                states, actions, old_log_probs, advantages, returns, valid_actions_masks = batch
                new_log_probs, values, entropy = self.evaluate(states, actions, valid_actions_masks)
                ratios = torch.exp(new_log_probs - old_log_probs)
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = torch.nn.functional.mse_loss(values, returns)
                total_loss = (
                    actor_loss +
                    self.critic_coeff * critic_loss -
                    self.entropy_coeff * entropy
                )
                self.actor_optimizer.zero_grad()
                self.critic_optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
                self.actor_optimizer.step()
                self.critic_optimizer.step()
        self.buffer.clear()

def preprocess_data(df):
    df = df.drop(['user_id', 'product_id', 'promotion_used'], axis=1, errors='ignore')
    categorical_cols = ['category', 'gender', 'segment', 'promotion_history']
    df = pd.get_dummies(df, columns=categorical_cols)
    numeric_cols = ['quantity', 'price', 'margin', 'churn_score',
                   'loyalty_score', 'avg_order_value', 'age', 'income']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = (df[col] - df[col].mean()) / df[col].std()

    return df.fillna(0)

def train_ppo_agent(df, config):
    df = df[df['promotion_used'].isin(PROMOTION_IDS)].copy()
    processed_df = preprocess_data(df)
    promotion_map = {p: i for i, p in enumerate(PROMOTION_IDS)}
    processed_df['action'] = df['promotion_used'].map(promotion_map)
    state_columns = [col for col in processed_df.columns if col != 'action']
    states = processed_df[state_columns].values.astype(np.float32)
    states = np.nan_to_num(states)
    reward_system = AdvancedRewardSystem(config)
    agent = PPOAgent(states.shape[1])
    episodes = 300
    steps_per_episode = 300

    for episode in range(episodes):
        total_reward = 0
        indices = np.random.permutation(len(processed_df))

        for step in range(steps_per_episode):
            idx = indices[step % len(processed_df)]
            row = df.iloc[idx]
            state = states[idx]
            valid_actions = reward_system.get_valid_actions(row['promotion_history'])
            if not valid_actions:
                continue
            action, log_prob, value = agent.act(state, valid_actions)
            reward = reward_system.calculate_reward(row.to_dict(), action)
            next_idx = (idx + 1) % len(processed_df)
            next_state = states[next_idx]
            agent.buffer.add(
                state, action, log_prob, reward, value,
                False, valid_actions
            )

            total_reward += reward
        agent.update()

        avg_reward = total_reward / steps_per_episode
        print(f"Episode {episode+1:03d}/{episodes} | Avg Reward: {avg_reward:.2f}")

    return agent

if __name__ == "__main__":
    df = pd.read_csv('train_data.csv')

    reward_config = {
        'weights': {
            'sales': 0.4,
            'margin': 0.35,
            'retention': 0.25
        },
        'segment_multipliers': {
            'Premium': 1.6,
            'Standard': 1.1,
            'Budget': 0.7
        }
    }

    trained_agent = train_ppo_agent(df, reward_config)
    torch.save(trained_agent.actor.state_dict(), 'final_promotion_model.pth')
    print("Training completed successfully.")

Episode 001/300 | Avg Reward: 37.23
Episode 002/300 | Avg Reward: 36.94
Episode 003/300 | Avg Reward: 37.75
Episode 004/300 | Avg Reward: 37.87
Episode 005/300 | Avg Reward: 37.40
Episode 006/300 | Avg Reward: 37.69
Episode 007/300 | Avg Reward: 37.72
Episode 008/300 | Avg Reward: 38.06
Episode 009/300 | Avg Reward: 38.09
Episode 010/300 | Avg Reward: 37.76
Episode 011/300 | Avg Reward: 37.96
Episode 012/300 | Avg Reward: 37.75
Episode 013/300 | Avg Reward: 37.95
Episode 014/300 | Avg Reward: 36.80
Episode 015/300 | Avg Reward: 37.28
Episode 016/300 | Avg Reward: 38.09
Episode 017/300 | Avg Reward: 37.13
Episode 018/300 | Avg Reward: 38.05
Episode 019/300 | Avg Reward: 37.57
Episode 020/300 | Avg Reward: 38.43
Episode 021/300 | Avg Reward: 36.90
Episode 022/300 | Avg Reward: 36.42
Episode 023/300 | Avg Reward: 37.57
Episode 024/300 | Avg Reward: 38.03
Episode 025/300 | Avg Reward: 38.58
Episode 026/300 | Avg Reward: 35.87
Episode 027/300 | Avg Reward: 38.01
Episode 028/300 | Avg Reward

EVALUATION WITH TEST DATA

In [14]:
import torch
import pandas as pd
import numpy as np
import math
from collections import deque
import random
from torch.distributions import Categorical

PROMOTION_IDS = ['PR100', 'PR101', 'PR102', 'PR103', 'PR104',
                'PR105', 'PR106', 'PR107', 'PR108', 'PR109']

class AdvancedRewardSystem:
    def __init__(self, config):
        self.weights = config['weights']
        self.segment_multipliers = config['segment_multipliers']
        self.promo_costs = {
            'PR100': 0.15, 'PR101': 0.20, 'PR102': 0.18,
            'PR103': 0.25, 'PR104': 0.22, 'PR105': 0.19,
            'PR106': 0.17, 'PR107': 0.21, 'PR108': 0.23,
            'PR109': 0.16
        }
        self.eligibility_map = {
            'Low': ['PR100', 'PR101', 'PR102'],
            'Medium': ['PR103', 'PR104', 'PR105'],
            'High': ['PR106', 'PR107', 'PR108', 'PR109']
        }

    def get_valid_actions(self, promotion_history):
        eligible = self.eligibility_map.get(promotion_history, [])
        return [PROMOTION_IDS.index(p) for p in eligible if p in PROMOTION_IDS]

    def calculate_reward(self, transaction, action):
        promo_id = PROMOTION_IDS[action]
        cost = self.promo_costs[promo_id]

        response = transaction['response']
        quantity = transaction['quantity']
        price = transaction['price']
        margin = transaction['margin'] - cost
        loyalty = transaction['loyalty_score']
        churn = transaction['churn_score']
        segment = transaction.get('segment', 'Standard')

        sales_reward = math.log1p(quantity * price) * response * 2.0
        margin_reward = margin * quantity * 0.5
        retention_reward = (1 / (1 + math.exp(-loyalty))) - math.exp(churn)
        penalty = (1 - response) * price * 0.3

        segment_mult = self.segment_multipliers.get(segment, 1.0)
        return ((self.weights['sales'] * sales_reward +
                self.weights['margin'] * margin_reward +
                self.weights['retention'] * retention_reward -
                penalty) * segment_mult)

class PPOAgent:
    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = len(PROMOTION_IDS)
        self.actor = self._build_actor()

    def _build_actor(self):
        return nn.Sequential(
            nn.Linear(self.state_size, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.25),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, self.action_size)
        )

    def act(self, state, valid_actions, deterministic=False):
        state = torch.FloatTensor(state)
        with torch.no_grad():
            logits = self.actor(state)
            valid_actions_tensor = torch.tensor(valid_actions, dtype=torch.long)
            masked_logits = torch.full_like(logits, -1e8)
            masked_logits[valid_actions_tensor] = logits[valid_actions_tensor]

            if deterministic:
                action = masked_logits.argmax(dim=-1)
            else:
                dist = Categorical(logits=masked_logits)
                action = dist.sample()

        return action.item()

def preprocess_data(df, train_stats=None):
    df = df.drop(['user_id', 'product_id', 'promotion_used'], axis=1, errors='ignore')
    categorical_cols = ['category', 'gender', 'segment', 'promotion_history']
    df = pd.get_dummies(df, columns=categorical_cols)
    numeric_cols = ['quantity', 'price', 'margin', 'churn_score',
                   'loyalty_score', 'avg_order_value', 'age', 'income']

    if train_stats:
        means = train_stats['means']
        stds = train_stats['stds']
    else:
        means = df[numeric_cols].mean()
        stds = df[numeric_cols].std()

    for col in numeric_cols:
        if col in df.columns:
            df[col] = (df[col] - means[col]) / stds[col]

    return df.fillna(0), (means, stds) if train_stats is None else None

def evaluate_model(test_path, model_path, reward_config, train_stats):
    df_test = pd.read_csv(test_path)
    df_test = df_test[df_test['promotion_used'].isin(PROMOTION_IDS)].copy()
    processed_test, _ = preprocess_data(df_test, train_stats)
    state_columns = processed_test.columns.tolist()
    states_test = processed_test.values.astype(np.float32)
    states_test = np.nan_to_num(states_test)
    reward_system = AdvancedRewardSystem(reward_config)

    agent = PPOAgent(states_test.shape[1])
    agent.actor.load_state_dict(torch.load(model_path))
    agent.actor.eval()

    total_reward = 0
    correct_predictions = 0
    action_counts = {p: 0 for p in PROMOTION_IDS}

    for idx in range(len(df_test)):
        row = df_test.iloc[idx]
        state = states_test[idx]
        valid_actions = reward_system.get_valid_actions(row['promotion_history'])
        if not valid_actions:
            continue
        action = agent.act(state, valid_actions, deterministic=True)
        promo_used = PROMOTION_IDS[action]
        action_counts[promo_used] += 1

        reward = reward_system.calculate_reward(row.to_dict(), action)
        total_reward += reward

        if promo_used == row['promotion_used']:
            correct_predictions += 1

    avg_reward = total_reward / len(df_test) if len(df_test) > 0 else 0
    accuracy = (correct_predictions / len(df_test)) * 100 if len(df_test) > 0 else 0

    print(f"\n{' Evaluation Results ':=^40}")
    print(f"Test Samples: {len(df_test)}")
    print(f"Total Reward: {total_reward:.2f}")
    print(f"Avg Reward/Sample: {avg_reward:.2f}")
    print(f"Accuracy vs Historical: {accuracy:.2f}%")

    print("\nAction Distribution:")
    for promo, count in action_counts.items():
        print(f"{promo}: {count} ({count/len(df_test)*100:.1f}%)")

if __name__ == "__main__":
    train_df = pd.read_csv('train_data.csv')
    _, train_stats = preprocess_data(train_df)

    reward_config = {
        'weights': {'sales': 0.4, 'margin': 0.35, 'retention': 0.25},
        'segment_multipliers': {'Premium': 1.6, 'Standard': 1.1, 'Budget': 0.7}
    }

    evaluate_model(
        test_path='test_data.csv',
        model_path='final_promotion_model.pth',
        reward_config=reward_config,
        train_stats={'means': train_stats[0], 'stds': train_stats[1]}
    )


Test Samples: 43
Total Reward: 1377.31
Avg Reward/Sample: 32.03
Accuracy vs Historical: 13.95%

Action Distribution:
PR100: 6 (14.0%)
PR101: 5 (11.6%)
PR102: 5 (11.6%)
PR103: 0 (0.0%)
PR104: 4 (9.3%)
PR105: 5 (11.6%)
PR106: 10 (23.3%)
PR107: 1 (2.3%)
PR108: 0 (0.0%)
PR109: 7 (16.3%)
