In [2]:

!pip install numpy pandas scikit-learn gymnasium torch stable-baselines3[extra] tensorboard


Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 187.2/187.2 kB 5.0 MB/s eta 0:00:00
Installing collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


In [4]:
# Colab-only step: opens the browser upload widget. The uploaded file (cars.csv)
# is saved to the working directory, where the data-loading cell reads it.
from google.colab import files
uploaded = files.upload()



Saving cars.csv to cars.csv


In [10]:
# ============================================================
# PPO DEALERSHIP RL WITH:
# - Relative Scoring
# - Preference Strictness
# - Hard Budget Constraints
# - Refinement Ranking
# - Explanations
# ============================================================

import gymnasium as gym
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback


# ============================================================
# 1. LOAD DATA
# ============================================================

# Inventory table, one row per car. The rest of the notebook reads this frame
# directly (prices, horsepower, seats, body/fuel types, brand/model names).
df = pd.read_csv("cars.csv")

num_cols = ["price", "horsepower", "torque", "seats"]
cat_cols = ["brand", "body_type", "fuel_type"]

# Fail early with a readable message instead of an error from deep inside the
# transformer when the uploaded CSV does not have the expected schema.
missing_cols = [col for col in num_cols + cat_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"cars.csv is missing required columns: {missing_cols}")

# sparse_threshold=0.0 forces a dense result. With the default threshold the
# transformer returns either a sparse matrix or an ndarray depending on how
# dense the stacked output is, and calling `.toarray()` on an ndarray raises.
preprocess = ColumnTransformer(
    [
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(), cat_cols),
    ],
    sparse_threshold=0.0,
)

# Feature matrix: min-max scaled numeric columns followed by one-hot columns.
X = np.asarray(preprocess.fit_transform(df), dtype=np.float32)

# Number of cars; also the size of the environment's discrete action space.
N_CARS = len(df)


# ============================================================
# 2. CUSTOMER GENERATOR WITH STRICTNESS LEVELS
# ============================================================

def generate_customer():
    """Sample one random customer profile.

    Returns a dict with:
      * ``budget``      - uniform in [0.1, 1.0)
      * ``power_pref``  - uniform in [0.0, 1.0)
      * ``family_size`` - integer from 1 to 6
      * ``body_pref`` / ``fuel_pref`` - one of the values present in ``df``
      * ``patience``    - always 4
      * ``strictness``  - per-attribute level ("must" / "strong" / "soft";
        the allowed levels differ per attribute)

    All draws use NumPy's global random state, in a fixed order.
    """
    rand = np.random

    # Scalar preferences first, then the categorical ones.
    budget = rand.uniform(0.1, 1.0)
    power = rand.uniform(0.0, 1.0)
    household = rand.randint(1, 7)
    body = rand.choice(df["body_type"].unique())
    fuel = rand.choice(df["fuel_type"].unique())

    # Allowed strictness levels for each attribute, drawn in this order.
    levels_by_attribute = (
        ("body", ["must", "strong", "soft"]),
        ("fuel", ["must", "strong", "soft"]),
        ("power", ["strong", "soft"]),
        ("budget", ["must", "strong"]),
        ("seats", ["must", "strong"]),
    )
    strictness = {}
    for attribute, levels in levels_by_attribute:
        strictness[attribute] = rand.choice(levels)

    customer = {
        "budget": budget,
        "power_pref": power,
        "family_size": household,
        "body_pref": body,
        "fuel_pref": fuel,
        "patience": 4,
        "strictness": strictness,
    }
    return customer


# ============================================================
# 3. MATCH SCORE (BASE SCORE)
# ============================================================

def match_score(customer, car_vector, car_row):
    """Return the average of five match components between a customer and a car.

    Components (each contributes at most 1):
      * closeness of the car's horsepower (as a fraction of the maximum in
        ``df``) to ``customer["power_pref"]``
      * closeness of the car's price (as a fraction of the maximum in ``df``)
        to ``customer["budget"]``
      * 1 if the body type equals the preferred one, else 0
      * 1 if the fuel type equals the preferred one, else 0
      * 1 if the car has at least ``family_size`` seats, else 0

    ``car_vector`` is accepted for interface compatibility but is not used.
    """
    power_share = car_row["horsepower"] / df["horsepower"].max()
    price_share = car_row["price"] / df["price"].max()

    power_fit = 1 - abs(power_share - customer["power_pref"])
    price_fit = 1 - abs(price_share - customer["budget"])

    if car_row["body_type"] == customer["body_pref"]:
        body_fit = 1
    else:
        body_fit = 0

    if car_row["fuel_type"] == customer["fuel_pref"]:
        fuel_fit = 1
    else:
        fuel_fit = 0

    if car_row["seats"] >= customer["family_size"]:
        seat_fit = 1
    else:
        seat_fit = 0

    # Added left to right, starting from 0.
    total = 0 + power_fit + price_fit + body_fit + fuel_fit + seat_fit
    return total / 5


# ============================================================
# 4. REFINEMENT (TOP-N FILTERING)
# ============================================================

def refined_best_pick(customer, top=5):
    """Rank cars for ``customer`` by ``match_score`` and return the best ``top``.

    A car is eligible only if its normalised price (price / max price in ``df``)
    lies within 0.35 of ``customer["budget"]``. The band is symmetric: cars far
    *below* the budget are excluded as well as cars far above it. If no car is
    eligible, every car is ranked instead.

    Args:
        customer: customer dict as built by ``generate_customer``.
        top: maximum number of entries to return.

    Returns:
        A list of ``(row_position, score)`` tuples, best score first. Ties keep
        their original row order.
    """
    # Loop-invariant, so computed once rather than once per car.
    max_price = df["price"].max()

    scored_all = []
    scored_in_band = []
    for i in range(N_CARS):
        car = df.iloc[i]
        entry = (i, match_score(customer, X[i], car))
        scored_all.append(entry)

        # Same band test as before; written as a negation so the behaviour on
        # non-comparable (NaN) prices is unchanged.
        if not (abs((car["price"] / max_price) - customer["budget"]) > 0.35):
            scored_in_band.append(entry)

    # Fall back to the unconstrained ranking when the band filtered out everything.
    scores = scored_in_band if scored_in_band else scored_all

    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores[:top]


# ============================================================
# 5. EXPLANATION SYSTEM
# ============================================================

def explain_recommendation(customer, car):
    """Print a pass/fail line for each of five criteria of a recommended car.

    Criteria, in order: body type, fuel type, price (normalised price at most
    0.15 above the customer's budget), seat count, and horsepower (normalised
    value within 0.2 of the preferred level). Failed body, fuel, price and seat
    checks also print the customer's strictness level for that attribute.
    Returns nothing.
    """
    strict = customer["strictness"]
    print("\nüìò WHY THIS CAR:")

    # --- body type ---
    body_matches = car["body_type"] == customer["body_pref"]
    if not body_matches:
        print("‚úñ Body type mismatch (preference =", strict["body"], ")")
    else:
        print("‚úî Matches your preferred body type")

    # --- fuel type ---
    fuel_matches = car["fuel_type"] == customer["fuel_pref"]
    if not fuel_matches:
        print("‚úñ Fuel mismatch (preference =", strict["fuel"], ")")
    else:
        print("‚úî Fuel preference matched")

    # --- price relative to budget ---
    price_share = car["price"] / df["price"].max()
    price_ceiling = customer["budget"] + 0.15
    if price_share <= price_ceiling:
        print("‚úî Price aligns with your budget preference")
    else:
        print("‚úñ Price is higher than preferred (budget strict =", strict["budget"], ")")

    # --- seating ---
    has_enough_seats = car["seats"] >= customer["family_size"]
    if has_enough_seats:
        print("‚úî Enough seats for your family")
    else:
        print("‚úñ Not enough seats (strict =", strict["seats"], ")")

    # --- performance ---
    power_share = car["horsepower"] / df["horsepower"].max()
    power_gap = abs(power_share - customer["power_pref"])
    if power_gap < 0.2:
        print("‚úî Power output close to desired level")
    else:
        print("‚úñ Power output differs from preferred level")


# ============================================================
# 6. ENVIRONMENT WITH STRICTNESS + HARD BUDGET PENALTY
# ============================================================

class CarDealershipEnv(gym.Env):
    """Episodic environment where the agent proposes cars to one simulated customer.

    Observation (6 float32 values): budget, power preference, family size / 7,
    preferred body-type index / number of body types, preferred fuel-type
    index / number of fuel types, remaining patience / 4.

    Action: integer row position in ``df`` of the proposed car.

    An episode terminates when a proposal reaches at least 95% of the best
    match score achievable for the customer, or when the customer's patience
    drops to zero.

    Note: the customer's strictness levels affect the reward but are not part
    of the observation.
    """

    metadata = {"render_modes": []}

    # Penalty for proposing the wrong body/fuel type, keyed by strictness level.
    # Any level not listed here falls back to the "soft" penalty.
    _BODY_PENALTY = {"must": 1.0, "strong": 0.5}
    _BODY_SOFT_PENALTY = 0.2
    _FUEL_PENALTY = {"must": 1.0, "strong": 0.5}
    _FUEL_SOFT_PENALTY = 0.1

    def __init__(self):
        super().__init__()
        self.action_space = gym.spaces.Discrete(N_CARS)
        self.observation_space = gym.spaces.Box(0, 1, shape=(6,), dtype=np.float32)

        # Category orderings used to encode preferences; built once here instead
        # of being recomputed from ``df`` for every observation.
        self._body_types = list(df["body_type"].unique())
        self._fuel_types = list(df["fuel_type"].unique())

        self.reset()

    def _get_state(self):
        """Encode the current customer as the 6-element observation vector."""
        customer = self.customer
        body_idx = self._body_types.index(customer["body_pref"]) / len(self._body_types)
        fuel_idx = self._fuel_types.index(customer["fuel_pref"]) / len(self._fuel_types)

        return np.array([
            customer["budget"],
            customer["power_pref"],
            customer["family_size"] / 7,
            body_idx,
            fuel_idx,
            customer["patience"] / 4
        ], dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode with a freshly generated customer.

        Args:
            seed: optional seed. It is forwarded to ``gym.Env.reset`` and, because
                ``generate_customer`` draws from NumPy's *global* random state,
                it is also used to reseed that global state so the same seed
                yields the same customer. This affects other users of
                ``np.random`` in the process.
            options: accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            # np.random.seed only accepts values in [0, 2**32).
            np.random.seed(seed % (2 ** 32))
        self.customer = generate_customer()

        # Best score any car can reach for this customer; used to express each
        # proposal's score relative to the best achievable one.
        self.max_score = max(match_score(self.customer, X[i], df.iloc[i]) for i in range(N_CARS))
        return self._get_state(), {}

    def step(self, action):
        """Score the proposed car and update the customer's patience.

        Args:
            action: row position in ``df`` of the proposed car (an int or a
                zero-dimensional integer array).

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always False and ``info`` is an empty dict.
        """
        action = int(action)  # normalise NumPy integer / 0-d array actions
        car = df.iloc[action]
        c = self.customer
        s = c["strictness"]

        score = match_score(c, X[action], car)
        norm = score / self.max_score  # 1.0 means "best car for this customer"

        # Base reward scales with relative match quality; -0.1 is a per-step cost.
        reward = norm * 3 - 0.1
        done = False

        # Strictness penalties for categorical mismatches.
        if car["body_type"] != c["body_pref"]:
            reward -= self._BODY_PENALTY.get(s["body"], self._BODY_SOFT_PENALTY)

        if car["fuel_type"] != c["fuel_pref"]:
            reward -= self._FUEL_PENALTY.get(s["fuel"], self._FUEL_SOFT_PENALTY)

        # Budget band: penalise normalised prices more than 0.35 away from the
        # budget in either direction.
        price_norm = car["price"] / df["price"].max()
        if abs(price_norm - c["budget"]) > 0.35:
            reward -= 1.2

        # Success ends the episode with a bonus; otherwise the customer loses patience.
        if norm >= 0.95:
            reward += 6
            done = True
        else:
            c["patience"] -= 1

        # Customer walks away once patience is exhausted.
        if c["patience"] <= 0:
            reward -= 2
            done = True

        return self._get_state(), reward, done, False, {}


# ============================================================
# 7. TRAIN PPO
# ============================================================

def train_ppo(total_timesteps=300_000):
    """Train a PPO agent on ``CarDealershipEnv`` and save it to disk.

    Evaluation runs on its own environment instance. Passing the training env
    to ``EvalCallback`` would make each evaluation reset and step the very
    environment the rollout collector is in the middle of using.

    Args:
        total_timesteps: number of environment steps to train for.

    Returns:
        The trained ``PPO`` model. It is also saved as ``car_ppo_agent``; the
        best model seen during evaluation is saved under ``./ppo_best/``.
    """
    from stable_baselines3.common.monitor import Monitor

    env = CarDealershipEnv()
    # Separate, Monitor-wrapped env so evaluation episodes do not disturb
    # training state and episode statistics are recorded correctly.
    eval_env = Monitor(CarDealershipEnv())

    model = PPO(
        "MlpPolicy",
        env,
        learning_rate=3e-4,
        batch_size=256,
        n_steps=2048,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.01,
        verbose=1,
        tensorboard_log="./ppo_logs/",
    )

    callback = EvalCallback(eval_env, best_model_save_path="./ppo_best/")
    model.learn(total_timesteps=total_timesteps, callback=callback)
    model.save("car_ppo_agent")
    print("Training complete.")
    return model


# ============================================================
# 8. RECOMMENDATION (ONE CAR)
# ============================================================
def _print_refined_recommendation(customer):
    """Print the top-5 refined ranking, the final pick and its explanation."""
    ranked = refined_best_pick(customer, top=5)
    best_idx, _ = ranked[0]
    best_car = df.iloc[best_idx]

    print("\nüîé Top 5 refined:")
    for idx, score in ranked:
        candidate = df.iloc[idx]
        print(candidate["brand"], candidate["model"], "‚Äî score:", score)

    print("\nüéØ FINAL RECOMMENDATION:")
    print(best_car["brand"], best_car["model"], "‚Çπ", best_car["price"])

    explain_recommendation(customer, best_car)


def query_model(model,
                budget,
                power_pref,
                family_size,
                body_pref,
                fuel_pref,
                body_strict="strong",
                fuel_strict="strong",
                budget_strict="strong",
                power_strict="soft",
                seats_strict="strong"):
    """Print the PPO suggestion and the refined recommendation for a given customer.

    Args:
        model: trained policy exposing ``predict(obs, deterministic=True)``.
        budget: budget in the same units as the ``price`` column; it is divided
            by the maximum price in ``df``.
        power_pref: preferred horsepower as a fraction of the maximum in ``df``.
        family_size: number of seats needed.
        body_pref: preferred body type; must be a value present in ``df``.
        fuel_pref: preferred fuel type; must be a value present in ``df``.
        body_strict, fuel_strict, budget_strict, power_strict, seats_strict:
            strictness level recorded for each attribute.

    Raises:
        ValueError: if ``body_pref`` or ``fuel_pref`` does not occur in ``df``.
    """
    # Validate up front: otherwise an unknown value only fails later, while
    # encoding the observation, with an unhelpful "x is not in list" message.
    known_bodies = list(df["body_type"].unique())
    if body_pref not in known_bodies:
        raise ValueError(f"Unknown body_pref {body_pref!r}; expected one of {known_bodies}")
    known_fuels = list(df["fuel_type"].unique())
    if fuel_pref not in known_fuels:
        raise ValueError(f"Unknown fuel_pref {fuel_pref!r}; expected one of {known_fuels}")

    # Normalise the budget by the most expensive car in the inventory.
    budget_norm = budget / df["price"].max()

    customer = {
        "budget": budget_norm,
        "power_pref": power_pref,  # already expressed as a 0-1 fraction
        "family_size": family_size,
        "body_pref": body_pref,
        "fuel_pref": fuel_pref,
        "patience": 4,
        "strictness": {
            "body": body_strict,
            "fuel": fuel_strict,
            "budget": budget_strict,
            "power": power_strict,
            "seats": seats_strict
        }
    }

    # Create an env and replace its randomly generated customer with ours.
    env = CarDealershipEnv()
    env.customer = customer
    env.max_score = max(match_score(customer, X[i], df.iloc[i]) for i in range(N_CARS))

    print("\nüßç Custom Customer:", customer)
    print("Max possible score:", env.max_score)

    # PPO prediction. User-supplied values (e.g. a budget above the maximum
    # price) can push the observation outside the declared Box(0, 1) space, so
    # clip it back into that range. In-range inputs are unaffected.
    obs = np.clip(env._get_state(), 0.0, 1.0)
    ppo_action, _ = model.predict(obs, deterministic=True)
    ppo_car = df.iloc[int(ppo_action)]

    print("\nüî• PPO suggestion:", ppo_car['brand'], ppo_car['model'])

    # Refinement layer: ranking, final pick and explanation.
    _print_refined_recommendation(customer)


def recommend_one_car(model=None):
    """Print the PPO suggestion and the refined recommendation for a random customer.

    Args:
        model: trained policy; when None, the agent saved as ``car_ppo_agent``
            is loaded.
    """
    if model is None:
        model = PPO.load("car_ppo_agent")

    env = CarDealershipEnv()
    obs, _ = env.reset()
    customer = env.customer

    print("\nüßç Customer:", customer)
    print("Strictness:", customer["strictness"])
    print("Max Score:", env.max_score)

    # PPO pick
    ppo_action, _ = model.predict(obs, deterministic=True)
    ppo_car = df.iloc[int(ppo_action)]

    print("\nüî• PPO suggested:", ppo_car["brand"], ppo_car["model"])

    # Refinement layer: ranking, final pick and explanation.
    _print_refined_recommendation(customer)


# ============================================================
# MAIN
# ============================================================

if __name__ == "__main__":
    # Train the agent, then print a recommendation for one random customer.
    # `model` stays at module level; the later query_model(...) cell uses it.
    model = train_ppo()
    recommend_one_car(model)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_logs/PPO_2


  return datetime.utcnow().replace(tzinfo=utc)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.74     |
|    ep_rew_mean     | -0.41    |
| time/              |          |
|    fps             | 187      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.66        |
|    ep_rew_mean          | 0.727       |
| time/                   |             |
|    fps                  | 185         |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014793703 |
|    clip_fraction        | 0.095       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.84       |
|    explained_variance   | -0.00375    |
|    learning_rate        | 0.



New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.55     |
|    ep_rew_mean     | 1.94     |
| time/              |          |
|    fps             | 185      |
|    iterations      | 5        |
|    time_elapsed    | 55       |
|    total_timesteps | 10240    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.73        |
|    ep_rew_mean          | 0.534       |
| time/                   |             |
|    fps                  | 185         |
|    iterations           | 6           |
|    time_elapsed         | 66          |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.015408639 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.71       |
|    explained_variance   | 0.21        |
|    lea

In [13]:
# Ad-hoc query against the trained `model` from the training cell.
# budget is in the price column's units (query_model divides it by the maximum
# price); power_pref is a 0-1 fraction of the highest horsepower in the data.
query_model(
    model,
    budget=10000000,
    power_pref=0.9,
    family_size=4,
    body_pref="sedan",
    fuel_pref="petrol"
)



üßç Custom Customer: {'budget': 0.625, 'power_pref': 0.9, 'family_size': 4, 'body_pref': 'sedan', 'fuel_pref': 'petrol', 'patience': 4, 'strictness': {'body': 'strong', 'fuel': 'strong', 'budget': 'strong', 'power': 'soft', 'seats': 'strong'}}
Max possible score: 0.9142164179104478

üî• PPO suggestion: Kia Stinger GT

üîé Top 5 refined:
Kia K9 ‚Äî score: 0.9142164179104478
Kia Stinger GT ‚Äî score: 0.8961940298507463
Audi A8 ‚Äî score: 0.8949999999999999
BMW 5 Series ‚Äî score: 0.8750373134328358
BMW 3 Series ‚Äî score: 0.8551492537313432

üéØ FINAL RECOMMENDATION:
Kia K9 ‚Çπ 9000000

üìò WHY THIS CAR:
‚úî Matches your preferred body type
‚úî Fuel preference matched
‚úî Price aligns with your budget preference
‚úî Enough seats for your family
‚úñ Power output differs from preferred level
