
# Budget Allocator — RL Prototype (Colab-ready)

Notebook contents:
- Synthetic influencer dataset generator
- Gym-style `BudgetAllocEnv` environment (step, reset, observation)
- Training snippet using `stable-baselines3` (PPO)
- Baseline: greedy heuristic and MIP (sketch using `pulp`)
- Evaluation utilities and tips for running on Colab

**How to use**
1. Upload this notebook to Google Colab.
2. Uncomment and run the `!pip install` cell to install dependencies (`gym`, `stable-baselines3`, `pulp`).
3. Run the cells in order. The training cell uses a modest number of timesteps so a quick prototype run finishes fast.


In [None]:

# Uncomment and run in Colab (or local env) to install required packages
# !pip install gym==0.26.2 stable-baselines3[extra]==2.0.0 pulp numpy==1.23.0
# If running on local machine, ensure compatible versions of gym and sb3 are used.


In [None]:

# Environment and synthetic data generator
import gym, numpy as np
from gym import spaces

def make_influencers(N=12, seed=0):
    """Build a reproducible list of N synthetic influencer records.

    NumPy's global RNG is seeded with `seed`, so the same (N, seed) pair
    always yields the same roster. Each record is a dict with the keys
    "id", "tier", "rate", "base_reach", "engagement_rate",
    "audience_match", "authenticity" and "variability".
    """
    # (feed price, feed reach) for the two cheaper tiers; any other tier
    # falls back to the macro profile.
    cheaper_tiers = {
        'micro': (80_000, 2_000),
        'mid': (300_000, 20_000),
    }
    macro_profile = (1_200_000, 100_000)

    np.random.seed(seed)
    roster = []
    for idx in range(N):
        tier = np.random.choice(['micro','mid','macro'], p=[0.6,0.3,0.1])
        feed_price, feed_reach = cheaper_tiers.get(tier, macro_profile)

        # Per-format price and reach are fixed multiples of the feed values.
        prices = {
            "story": int(feed_price*0.2),
            "feed": int(feed_price),
            "reels": int(feed_price*1.4),
        }
        reaches = {
            "story": feed_reach*0.2,
            "feed": feed_reach,
            "reels": int(feed_reach*1.2),
        }

        # The draw order (normal, beta(2,2), beta(3,2), uniform) determines
        # the values produced for a given seed, so it must not be reordered.
        engagement = float(np.clip(np.random.normal(0.03, 0.008), 0.005, 0.2))
        match = float(np.clip(np.random.beta(2,2), 0.0, 1.0))
        authenticity = float(np.clip(np.random.beta(3,2), 0.0, 1.0))
        spread = float(np.random.uniform(0.05, 0.25))

        roster.append({
            "id": idx,
            "tier": tier,
            "rate": prices,
            "base_reach": reaches,
            "engagement_rate": engagement,
            "audience_match": match,
            "authenticity": authenticity,
            "variability": spread,
        })
    return roster

class BudgetAllocEnv(gym.Env):
    """
    Discrete-action environment for budget allocation.

    Each step selects one (influencer, format, post count) triple. If the
    purchase is affordable its cost is deducted from the remaining budget and
    a simulated, noisy reward is paid out.

    Action: Discrete encoding: influencer_idx * (formats*counts) + fmt*counts + cnt
    formats = 3 (story, feed, reels), counts = 3 (0,1,2 posts)

    Observation: float32 vector of length 1 + 9*N + 1 with every entry in
    [0, 1]: remaining-budget fraction, nine features per influencer, and the
    objective id divided by 2.

    Args:
        influencers: list of influencer dicts providing 'id', 'rate',
            'base_reach', 'engagement_rate', 'audience_match',
            'authenticity' and 'variability'.
        budget: total budget available in one episode.
        objective: 'awareness', 'conversion', or any other value (rewarded as
            conversions per unit cost).
        max_steps: number of steps after which the episode ends.

    reset() returns only the observation and step() returns the 4-tuple
    (obs, reward, done, info).
    """

    # Reward for choosing a post count of zero (wasted step).
    NOOP_REWARD = -0.01
    # Reward for choosing a purchase that exceeds the remaining budget.
    OVER_BUDGET_REWARD = -0.2
    # Example conversion rate per effective engagement.
    CONVERSION_RATE = 0.002

    def __init__(self, influencers, budget, objective='conversion', max_steps=8):
        super().__init__()
        self.influencers = influencers
        self.N = len(influencers)
        self.budget = float(budget)
        self.objective = objective
        self.max_steps = max_steps
        self.formats = ['story','feed','reels']
        self.count_buckets = [0,1,2]  # posts per chosen action

        # features per influencer: rates(3), base_reaches(3), engagement_rate, audience_match, authenticity => 9
        F = 9
        obs_dim = 1 + self.N * F + 1  # budget_left + flattened influencer feats + objective id
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(obs_dim,), dtype=np.float32)

        self.action_space = spaces.Discrete(self.N * len(self.formats) * len(self.count_buckets))
        self.reset()

    def reset(self):
        """Restore the full budget, clear the history and return the first observation."""
        self.budget_left = float(self.budget)
        self.steps = 0
        # optionally track allocations
        self.allocations = []
        return self._get_obs()

    def _get_obs(self):
        """Return the current observation as a float32 vector bounded to [0, 1]."""
        feats = []
        for inf in self.influencers:
            feats.extend([
                inf['rate']['story'] / (self.budget + 1),
                inf['rate']['feed'] / (self.budget + 1),
                inf['rate']['reels'] / (self.budget + 1),
                inf['base_reach']['story'] / 1e6,
                inf['base_reach']['feed'] / 1e6,
                inf['base_reach']['reels'] / 1e6,
                inf['engagement_rate'],
                inf['audience_match'],
                inf['authenticity']
            ])
        obj_id = 0 if self.objective == 'awareness' else (1 if self.objective == 'conversion' else 2)
        # The objective id is divided by its maximum (2) so it fits the declared bounds.
        obs = np.concatenate([[self.budget_left / (self.budget + 1)], np.array(feats), [obj_id / 2.0]])
        # A rate above the budget (or a reach above 1e6) would otherwise exceed
        # the Box upper bound of 1.0 declared in __init__.
        return np.clip(obs, 0.0, 1.0).astype(np.float32)

    def step(self, action):
        """Apply one allocation decision.

        Returns:
            (obs, reward, done, info). info always contains "valid" and
            "cost"; a successful purchase also reports "reach", "engagements"
            and "conversions".
        """
        # Coerce NumPy integer scalars so decoded indices and the stored
        # post count are plain Python ints.
        action = int(action)
        inf_idx, rem = divmod(action, len(self.formats) * len(self.count_buckets))
        fmt, cnt_idx = divmod(rem, len(self.count_buckets))
        cnt = self.count_buckets[cnt_idx]  # 0..2

        inf = self.influencers[inf_idx]
        format_name = self.formats[fmt]
        cost = inf['rate'][format_name] * cnt

        if cnt == 0:
            # no-op selection -> small negative to discourage wasted step
            reward = self.NOOP_REWARD
            info = {"valid": True, "cost": 0.0}
        elif cost > self.budget_left:
            # invalid over-budget action -> negative reward and no budget change
            reward = self.OVER_BUDGET_REWARD
            info = {"valid": False, "cost": cost}
        else:
            # simulate outcomes
            base_reach = inf['base_reach'][format_name] * cnt
            noise = np.random.normal(1.0, inf['variability'])
            reach = max(0.0, base_reach * noise)  # noise can go negative; reach cannot
            engagements = reach * inf['engagement_rate']
            effective_eng = engagements * inf['audience_match']
            conversions = effective_eng * self.CONVERSION_RATE
            if self.objective == 'awareness':
                reward = np.log1p(reach) / 10.0
            elif self.objective == 'conversion':
                reward = conversions * 100.0 + effective_eng * 0.01
            else:
                # +1.0 keeps the ratio finite when the cost is zero
                reward = (conversions * 100.0) / (cost + 1.0)

            self.budget_left -= cost
            self.allocations.append({"inf": inf['id'], "format": format_name, "count": cnt, "cost": cost})

            info = {"valid": True, "cost": cost, "reach": reach, "engagements": engagements, "conversions": conversions}

        self.steps += 1
        done = (self.budget_left <= 0) or (self.steps >= self.max_steps)
        obs = self._get_obs()
        return obs, float(reward), bool(done), info

    def render(self, mode='human'):
        """Print the remaining budget and the purchases made so far."""
        print("Budget left:", self.budget_left, "Allocations:", self.allocations)


In [None]:

# Training snippet (run after installing stable-baselines3)
# This cell uses PPO from stable-baselines3. For quick tests, reduce total_timesteps.
try:
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
except ImportError:
    # Show an install hint, then re-raise so the cell stops with the original traceback.
    print("stable-baselines3 not available. Please install in Colab: !pip install stable-baselines3[extra]")
    raise

# create synthetic influencers and env
infs = make_influencers(N=12, seed=42)
# DummyVecEnv takes a list of zero-argument factories; a single env is wrapped here.
env = DummyVecEnv([lambda: BudgetAllocEnv(infs, budget=1_000_000, objective='conversion', max_steps=8)])

model = PPO("MlpPolicy", env, verbose=1, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99)
# For prototyping, keep timesteps low (e.g., 50k). Increase for better policies.
model.learn(total_timesteps=50000)
model.save("ppo_budget_alloc_prototype")
print("Training finished, model saved.")


In [None]:

# Baseline: simple greedy heuristic and MIP sketch
def greedy_baseline(influencers, budget, objective='conversion'):
    """Spend the budget greedily, best feed value-for-money first.

    Influencers are ranked by
    audience_match * engagement_rate * feed reach / feed rate (descending).
    For each one, formats are tried in the order feed, reels, story, buying
    at most two posts per format while they remain affordable.

    `objective` is accepted for signature parity but does not affect the ranking.

    Returns a list of single-post allocation dicts with the keys
    "inf", "format", "count" (always 1) and "cost".
    """
    def _feed_value_per_cost(candidate):
        # score = expected_value_per_cost, approximated from the feed format only
        expected = candidate['audience_match'] * candidate['engagement_rate'] * candidate['base_reach']['feed']
        return expected / candidate['rate']['feed']

    funds = budget
    plan = []
    for candidate in sorted(influencers, key=_feed_value_per_cost, reverse=True):
        for post_format in ('feed', 'reels', 'story'):
            price = candidate['rate'][post_format]
            for _ in range(2):  # limit posts to 2 per format in baseline
                if price <= funds:
                    funds -= price
                    plan.append({"inf": candidate['id'], "format": post_format, "count": 1, "cost": price})
                else:
                    break
    return plan

# MIP sketch (deterministic expected reward) - requires pulp
# Stored as text so this cell runs without pulp installed; paste the text into
# its own cell (after installing pulp) to define and use `mip_baseline`.
mip_sketch = """
import pulp

def mip_baseline(influencers, budget, max_posts=2):
    formats = ['story', 'feed', 'reels']
    prob = pulp.LpProblem('budget_alloc', pulp.LpMaximize)

    # x[i, f] = number of posts (0..max_posts) bought from influencer i in format f
    x = {
        (i, f): pulp.LpVariable(f'x_{i}_{f}', lowBound=0, upBound=max_posts, cat='Integer')
        for i in range(len(influencers)) for f in formats
    }

    def expected_conversions(inf, f):
        # reach noise is centred on 1.0, so base_reach approximates expected reach
        return inf['base_reach'][f] * inf['engagement_rate'] * inf['audience_match'] * 0.002

    # objective: maximise expected conversions
    prob += pulp.lpSum(expected_conversions(influencers[i], f) * var for (i, f), var in x.items())
    # constraint: total spend must stay within the budget
    prob += pulp.lpSum(influencers[i]['rate'][f] * var for (i, f), var in x.items()) <= budget

    prob.solve(pulp.PULP_CBC_CMD(msg=False))

    allocations = []
    for (i, f), var in x.items():
        count = int(round(var.value() or 0))
        if count > 0:
            inf = influencers[i]
            allocations.append({'inf': inf['id'], 'format': f, 'count': count, 'cost': inf['rate'][f] * count})
    return allocations
"""

In [None]:

# Evaluation helpers
def simulate_allocations(influencers, allocations, rng=None):
    """Simulate one noisy outcome of a set of allocations and total the results.

    Args:
        influencers: list of influencer dicts providing 'rate', 'base_reach',
            'engagement_rate', 'audience_match' and 'variability'.
        allocations: iterable of dicts with the keys 'inf', 'format' and
            'count'. 'inf' is matched against each influencer's 'id'; when no
            influencer has that id it is used as a position in `influencers`.
        rng: optional object with a `normal(loc, scale)` method (for example
            `np.random.RandomState(seed)`) used for the reach noise. Defaults
            to NumPy's global RNG.

    Returns:
        dict with the float totals "cost", "reach", "eng" and "conv".
    """
    rng = np.random if rng is None else rng
    # Resolve by id so a filtered or reordered influencer list still maps each
    # allocation to the right record; entries without an 'id' key fall back to
    # their list position.
    by_id = {inf.get('id', pos): inf for pos, inf in enumerate(influencers)}

    total_cost = 0.0
    total_reach = 0.0
    total_eng = 0.0
    total_conv = 0.0
    for a in allocations:
        key = a['inf']
        inf = by_id[key] if key in by_id else influencers[key]
        fmt = a['format']
        cnt = a['count']
        # Cost is recomputed from the rate card; any 'cost' stored on the allocation is ignored.
        cost = inf['rate'][fmt] * cnt
        base_reach = inf['base_reach'][fmt] * cnt
        noise = rng.normal(1.0, inf['variability'])
        reach = max(0.0, base_reach * noise)  # noise can go negative; reach cannot
        eng = reach * inf['engagement_rate']
        eff = eng * inf['audience_match']
        conv = eff * 0.002  # example conversion rate per effective engagement
        total_cost += cost
        total_reach += reach
        total_eng += eng
        total_conv += conv
    return {"cost": total_cost, "reach": total_reach, "eng": total_eng, "conv": total_conv}

# Quick test of greedy baseline
# Note: this rebinds the notebook-level name `infs` to a fresh seed=1 roster,
# replacing the seed=42 roster created in the training cell.
infs = make_influencers(12, seed=1)
# Uses the default objective ('conversion') and the same 1,000,000 budget as the training env.
alloc = greedy_baseline(infs, budget=1_000_000)
# One noisy rollout of the plan; reach noise is drawn from NumPy's global RNG.
res = simulate_allocations(infs, alloc)
# "alloc len" counts single-post entries, since the baseline emits one entry per post.
print("Greedy baseline result:", res, "alloc len:", len(alloc))



---

## Notes & Tips
- The notebook uses a simple discrete action encoding for clarity. For improved performance, consider:
  - Action masking to prevent over-budget actions.
  - Hierarchical policies (pick influencer → pick format → pick count).
  - Continuous allocation output (fractions of budget) with post-processing / rounding.
- Reward shaping is crucial: tune constants so PPO receives stable, non-sparse feedback.
- Use curriculum learning: start with generous budgets and low stochasticity, then increase difficulty.
- Compare RL vs MIP baseline on identical seeds and average results across many episodes.
