In [8]:
# Cell 1: Imports & Global Config
import os, random, math, sys, time, copy
from collections import deque

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Reproducibility
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.keras.utils.set_random_seed(SEED)

print("TensorFlow:", tf.__version__)


TensorFlow: 2.19.0


In [9]:
# Cell 2: Load CSV (replace path if needed)
FILE_PATH = "customer_movements data set.csv"  # update if different

df = pd.read_csv(FILE_PATH)
print("Loaded shape:", df.shape)
print(df.head(5).to_string(index=False))


Loaded shape: (8438, 6)
Customer_ID      Timestamp  Latitude  Longitude  Location_Name Event
       C001 8/17/2025 8:00 30.989552  28.693400 Sahel Seashell Enter
       C068 8/17/2025 8:00 30.989552  28.693400 Sahel Seashell Enter
       C067 8/17/2025 8:00 30.989552  28.693400 Sahel Seashell Enter
       C066 8/17/2025 8:00 30.989552  28.693400 Sahel Seashell Enter
       C065 8/17/2025 8:00 30.032746  31.458287         Tagamo Enter


In [11]:
# Cell 3: Preprocessing
# Convert Timestamp to datetime and extract features
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors='coerce')

# Time-derived features (you can add others)
df["Hour"] = df["Timestamp"].dt.hour.fillna(0).astype(int)
df["Day"]  = df["Timestamp"].dt.day.fillna(0).astype(int)
df["Month"]= df["Timestamp"].dt.month.fillna(0).astype(int)
df["Weekday"] = df["Timestamp"].dt.weekday.fillna(0).astype(int)

# Select features and target
# Drop Customer_ID and raw Timestamp; keep Latitude/Longitude, Location_Name, Hour, Day, Month, Weekday
features = ["Latitude", "Longitude", "Location_Name", "Hour", "Day", "Month", "Weekday"]
target_col = "Event"

# Basic cleaning: drop rows with missing event
df = df.dropna(subset=[target_col]).reset_index(drop=True)

X_df = df[features].copy()
y_ser = df[target_col].copy()

# Encode target as binary: 'Enter' -> 1 else 0
y = (y_ser == "Enter").astype(int).values

# Build preprocessing pipeline
categorical_cols = ["Location_Name"]
numeric_cols = ["Latitude", "Longitude", "Hour", "Day", "Month", "Weekday"]

preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
], remainder='drop')

X_processed = preprocessor.fit_transform(X_df)  # numpy array (sparse=False)
print("Processed feature shape:", X_processed.shape)

# Train/Val/Test split: we will create x_tr, x_val, x_te and corresponding y arrays as in your original code
X_trainval, X_test, y_trainval, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=SEED, stratify=y)
VAL_FRAC = 0.2
n_val = int(len(X_trainval) * VAL_FRAC)
# shuffle before split
perm = np.arange(len(X_trainval))
np.random.shuffle(perm)
X_trainval = X_trainval[perm]
y_trainval = y_trainval[perm]

x_val = X_trainval[:n_val]
y_val = y_trainval[:n_val]
x_tr  = X_trainval[n_val:]
y_tr  = y_trainval[n_val:]

x_te = X_test
y_te = y_test

print("Shapes: x_tr, x_val, x_te ->", x_tr.shape, x_val.shape, x_te.shape)


Processed feature shape: (8438, 26)
Shapes: x_tr, x_val, x_te -> (5400, 26) (1350, 26) (1688, 26)


In [12]:
# Cell 4: Search space + helpers (adapted from your snippet)
SEARCH_EPOCHS = 3     # keep small for search
PATIENCE = 1
VERBOSE_FIT = 0
SEED = 42

RETRAIN_FINAL = True
FINAL_EPOCHS = 8
FINAL_PATIENCE = 2

# Replace embedding/LSTM hyperparams with DNN hyperparams
SPACE = {
    "H1_UNITS":   {"type": "int",   "min": 32,  "max": 256, "step": 32},
    "H2_UNITS":   {"type": "int",   "min": 16,  "max": 256, "step": 16},
    "DROPOUT":    {"type": "float", "min": 0.0, "max": 0.6, "step": 0.05},
    "LEARNING_RATE":{"type":"float","min":1e-5, "max":1e-2, "log": True},
    "BATCH_SIZE": {"type":"cat", "choices":[16, 32, 64, 128]}
}

START = {
    "H1_UNITS": 128,
    "H2_UNITS": 64,
    "DROPOUT": 0.25,
    "LEARNING_RATE": 1e-3,
    "BATCH_SIZE": 32,
}

HYPERPARAM_SPACE = SPACE  # reuse for _clip_round

def _clip_round(name, val):
    spec = HYPERPARAM_SPACE[name]
    if spec["type"] == "int":
        v = int(round(val))
        return int(max(spec["min"], min(spec["max"], v)))
    if spec["type"] == "float":
        v = float(max(spec["min"], min(spec["max"], float(val))))
        step = spec.get("step", None)
        if step is not None and step > 0:
            v = round(v / step) * step
            v = float(max(spec["min"], min(spec["max"], v)))
        else:
            precision = spec.get("precision", 6)
            v = round(v, precision)
        return v
    if spec["type"] == "cat":
        if val not in spec["choices"]:
            return spec["choices"][0]
        return val
    raise ValueError(f"Unknown spec type for {name}: {spec['type']}")

def _random_neighbor(cfg):
    # change exactly one hyperparam
    name = random.choice(list(SPACE.keys()))
    spec = SPACE[name]
    new_cfg = cfg.copy()
    if spec["type"] == "int":
        step = spec["step"]
        delta = random.choice([-step, step])
        new_cfg[name] = _clip_round(name, cfg[name] + delta)
    elif spec["type"] == "float" and not spec.get("log", False):
        step = spec["step"]
        delta = random.choice([-step, step])
        new_cfg[name] = _clip_round(name, cfg[name] + delta)
    elif spec["type"] == "float" and spec.get("log", False):
        factor = random.choice([0.5, 0.67, 1.5, 2.0])
        new_cfg[name] = _clip_round(name, cfg[name] * factor)
    elif spec["type"] == "cat":
        choices = spec["choices"]
        idx = choices.index(cfg[name])
        new_idx = (idx + random.choice([-1, 1])) % len(choices)
        new_cfg[name] = choices[new_idx]
    return new_cfg, name

def _cfg_key(cfg):
    return (
        int(cfg["H1_UNITS"]),
        int(cfg["H2_UNITS"]),
        float(cfg["DROPOUT"]),
        float(cfg["LEARNING_RATE"]),
        int(cfg["BATCH_SIZE"]),
    )

# Simple cache
_SCORE_CACHE = {}


In [13]:
# Cell 5: build_and_score
def build_and_score(cfg, search_epochs=SEARCH_EPOCHS, verbose_fit=VERBOSE_FIT):
    key = _cfg_key(cfg)
    if key in _SCORE_CACHE:
        return _SCORE_CACHE[key]

    tf.keras.utils.set_random_seed(SEED)
    # Build a simple DNN suited for tabular data
    model = keras.Sequential([
        layers.Input(shape=(x_tr.shape[1],)),
        layers.Dense(cfg["H1_UNITS"], activation="relu"),
        layers.Dropout(cfg["DROPOUT"]),
        layers.Dense(cfg["H2_UNITS"], activation="relu"),
        layers.Dropout(cfg["DROPOUT"]),
        layers.Dense(1, activation="sigmoid")
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=cfg["LEARNING_RATE"]),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)
    ]

    hist = model.fit(
        x_tr, y_tr,
        validation_data=(x_val, y_val),
        epochs=search_epochs,
        batch_size=cfg["BATCH_SIZE"],
        callbacks=callbacks,
        verbose=verbose_fit
    )

    val_acc = float(max(hist.history.get("val_accuracy", [0.0])))
    _SCORE_CACHE[key] = val_acc
    return val_acc


In [14]:
# Cell 7: Run searches
HC_ITERS = 12
SA_ITERS = 20
TABU_ITERS = 15

print("\n=== Running Hill Climbing ===")
hc_hist, hc_best = hill_climb(START, iters=HC_ITERS)

print("\n=== Running Simulated Annealing ===")
sa_hist, sa_best = simulated_annealing(START, iters=SA_ITERS, T0=1.0, alpha=0.90)

print("\n=== Running Tabu Search ===")
tb_hist, tb_best = tabu_search(START, iters=TABU_ITERS, neighborhood_size=6, tabu_tenure=3, aspiration=True)

# Combine & show
all_hist = pd.DataFrame(hc_hist + sa_hist + tb_hist)
display(all_hist.head(10))

# Plot
plt.figure(figsize=(8,5))
for name, sub in all_hist.groupby("algorithm"):
    plt.plot(sub["iteration"], sub["val_accuracy"], marker='o', linestyle='-', label=name)
plt.title("Validation Accuracy vs. Iteration (search)")
plt.xlabel("Iteration")
plt.ylabel("Validation Accuracy")
plt.legend()
plt.tight_layout()
plt.show()

# Best-of summary
summary = pd.DataFrame([hc_best, sa_best, tb_best]).sort_values("val_accuracy", ascending=False).reset_index(drop=True)
print("\n=== Best configuration from each algorithm ===")
print(summary[["algorithm","val_accuracy","H1_UNITS","H2_UNITS","DROPOUT","LEARNING_RATE","BATCH_SIZE"]].to_string(index=False))
display(summary)



=== Running Hill Climbing ===


NameError: name 'hill_climb' is not defined

In [15]:
# Cell 8: Retrain selected best configs with fuller training and evaluate on test set
def retrain_and_test(cfg, epochs=FINAL_EPOCHS, patience=FINAL_PATIENCE):
    tf.keras.utils.set_random_seed(SEED)
    model = keras.Sequential([
        layers.Input(shape=(x_tr.shape[1],)),
        layers.Dense(cfg["H1_UNITS"], activation="relu"),
        layers.Dropout(cfg["DROPOUT"]),
        layers.Dense(cfg["H2_UNITS"], activation="relu"),
        layers.Dropout(cfg["DROPOUT"]),
        layers.Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=cfg["LEARNING_RATE"]),
                  loss='binary_crossentropy', metrics=['accuracy'])
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)]
    hist = model.fit(x_tr, y_tr, validation_data=(x_val, y_val), epochs=epochs,
                     batch_size=cfg["BATCH_SIZE"], callbacks=callbacks, verbose=1)
    test_loss, test_acc = model.evaluate(x_te, y_te, verbose=0)
    return model, test_acc, hist

if RETRAIN_FINAL:
    final_rows = []
    final_models = {}
    for row in [hc_best, sa_best, tb_best]:
        cfg = {k: row[k] for k in ["H1_UNITS","H2_UNITS","DROPOUT","LEARNING_RATE","BATCH_SIZE"]}
        print(f"\n[FINAL] Retraining best from {row['algorithm']} | cfg={cfg}")
        model_f, test_acc, hist = retrain_and_test(cfg)
        final_models[row['algorithm']] = model_f
        final_rows.append({"algorithm": row["algorithm"], "test_accuracy": float(test_acc), **cfg})
    final_df = pd.DataFrame(final_rows).sort_values("test_accuracy", ascending=False).reset_index(drop=True)
    print("\n=== Final Test Accuracies (retrained) ===")
    print(final_df.to_string(index=False))
    display(final_df)

    # Save the best retrained model
    best_alg = final_df.iloc[0]["algorithm"]
    best_model = final_models[best_alg]
    best_model.save(f"best_model_{best_alg.lower().replace(' ','_')}.keras")
    print(f"Saved best retrained model: best_model_{best_alg.lower().replace(' ','_')}.keras")


NameError: name 'hc_best' is not defined

In [16]:
# Cell 6: trace_row + Hill Climb / SA / Tabu (adapted)
def trace_row(alg, it, cfg, score, note):
    row = {
        "algorithm": alg,
        "iteration": it,
        "H1_UNITS": cfg["H1_UNITS"],
        "H2_UNITS": cfg["H2_UNITS"],
        "DROPOUT": cfg["DROPOUT"],
        "LEARNING_RATE": cfg["LEARNING_RATE"],
        "BATCH_SIZE": cfg["BATCH_SIZE"],
        "val_accuracy": score,
        "note": note,
    }
    return row

# Hill Climbing
def hill_climb(start_cfg, iters=15):
    current = start_cfg.copy()
    current_score = build_and_score(current)
    history = [trace_row("HillClimb", 0, current, current_score, "start")]
    print(f"[HC] iter 0 | score={current_score:.4f} | cfg={current}")

    for t in range(1, iters+1):
        neighbor, changed = _random_neighbor(current)
        neigh_score = build_and_score(neighbor)
        accepted = False
        if neigh_score >= current_score:
            current, current_score = neighbor, neigh_score
            accepted = True
        note = f"neighbor({changed}) {'ACCEPT' if accepted else 'reject'}"
        history.append(trace_row("HillClimb", t, current if accepted else neighbor, neigh_score, note))
        print(f"[HC] iter {t} | neigh={neigh_score:.4f} ({changed}) | best={current_score:.4f} | {'ACCEPT' if accepted else 'reject'}")
    best_idx = np.argmax([h["val_accuracy"] for h in history])
    return history, history[best_idx]

# Simulated Annealing
def simulated_annealing(start_cfg, iters=30, T0=1.0, alpha=0.90):
    current = start_cfg.copy()
    current_score = build_and_score(current)
    best_cfg = current.copy()
    best_score = current_score
    history = [trace_row("SimAnneal", 0, current, current_score, "start")]
    print(f"[SA] iter 0 | T={T0:.3f} | score={current_score:.4f} | cfg={current}")

    T = T0
    for t in range(1, iters+1):
        neighbor, changed = _random_neighbor(current)
        neigh_score = build_and_score(neighbor)
        delta = neigh_score - current_score
        accept_prob = 1.0 if delta >= 0 else math.exp(delta / max(T, 1e-8))
        accept = (delta >= 0) or (random.random() < accept_prob)

        if accept:
            current, current_score = neighbor, neigh_score

        if current_score > best_score:
            best_cfg, best_score = current.copy(), current_score

        note = f"T={T:.3f}; Î”={delta:.4f}; p={accept_prob:.3f}; {'ACCEPT' if accept else 'reject'}"
        history.append(trace_row("SimAnneal", t, current.copy(), current_score, note))
        print(f"[SA] iter {t} | T={T:.3f} | neigh={neigh_score:.4f} ({changed}) | cur={current_score:.4f} | best={best_score:.4f} | {note}")

        T *= alpha  # cool down
    best_idx = np.argmax([h["val_accuracy"] for h in history])
    return history, history[best_idx]

# Tabu Search
def tabu_search(start_cfg, iters=25, neighborhood_size=6, tabu_tenure=3, aspiration=True):
    current = start_cfg.copy()
    current_score = build_and_score(current)
    best_cfg = current.copy()
    best_score = current_score

    tabu = deque(maxlen=tabu_tenure)
    tabu.append(_cfg_key(current))

    history = [trace_row("Tabu", 0, current, current_score, f"start; tabu_size={len(tabu)}")]
    print(f"[Tabu] iter 0 | score={current_score:.4f} | cfg={current} | tabu={len(tabu)}")

    for t in range(1, iters + 1):
        candidates = []
        tried_keys = set()
        attempts = 0
        while len(candidates) < neighborhood_size and attempts < neighborhood_size * 4:
            neigh, changed = _random_neighbor(current)
            key = _cfg_key(neigh)
            if key in tried_keys:
                attempts += 1
                continue
            tried_keys.add(key)
            score = build_and_score(neigh)
            candidates.append((neigh, score, changed))
            attempts += 1

        candidates.sort(key=lambda x: x[1], reverse=True)
        chosen = None
        for neigh, score, changed in candidates:
            key = _cfg_key(neigh)
            if key not in tabu or (aspiration and score > best_score):
                chosen = (neigh, score, changed)
                break
        if chosen is None:
            chosen = candidates[0]

        neigh, neigh_score, changed = chosen
        current, current_score = neigh, neigh_score
        tabu.append(_cfg_key(current))

        if current_score > best_score:
            best_cfg, best_score = current.copy(), current_score

        note = f"move({changed}); tabu={len(tabu)}; best={best_score:.4f}"
        history.append(trace_row("Tabu", t, current.copy(), current_score, note))
        print(f"[Tabu] iter {t} | picked={neigh_score:.4f} ({changed}) | cur={current_score:.4f} | best={best_score:.4f} | tabu={len(tabu)}")

    best_idx = np.argmax([h["val_accuracy"] for h in history])
    return history, history[best_idx]
