In [None]:
# === Block 1: RAM logger ===
from google.colab import drive
drive.mount('/content/drive')

import os, signal, subprocess, time

LOG_PATH = "/content/process_ram_usage.txt"
PID_PATH = "/content/ram_logger.pid"

# Stop any logger started by an earlier run of this cell. The PID is read
# from PID_PATH; failure to stop it is reported but never aborts the cell.
if os.path.exists(PID_PATH):
    try:
        with open(PID_PATH, "r") as f:
            old_pid = int(f.read().strip())
        os.kill(old_pid, signal.SIGTERM)
        print(f"Stopped previous RAM logger (PID {old_pid})")
    except (ValueError, OSError) as e:
        # ValueError: PID file is empty or not a number.
        # OSError: file unreadable, process already gone, or kill not permitted.
        print(f"Could not stop previous logger: {e}")
    finally:
        # The PID file is stale either way; remove it so the next run starts clean.
        try:
            os.remove(PID_PATH)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Could not remove PID file {PID_PATH}: {e}")

# Fresh log file
if os.path.exists(LOG_PATH):
    os.remove(LOG_PATH)
    print("Cleared existing RAM log file")

# Start a simple logger: every 10 s append a UTC timestamp, the system memory
# figures reported by `free -b`, and the 19 largest processes by RSS (plus the
# ps header row) to the log file. The log path is passed in as "$1" so it
# always matches LOG_PATH instead of being hard-coded a second time.
bash_script = r'''
set -e
while true; do
  {
    echo "---------------"
    echo "TS $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo "SYSTEM_TOTAL $(free -b | awk '/^Mem:/{print $2}')"
    echo "SYSTEM_USED  $(free -b | awk '/^Mem:/{print $3}')"
    echo "SYSTEM_AVAILABLE $(free -b | awk '/^Mem:/{print $7}')"
    ps -eo pid,user,comm,rss --sort=-rss | head -20 | awk '{print "PROC",$1,$2,$3,$4}'
  } >> "$1"
  sleep 10
done
'''

# With `bash -c SCRIPT NAME ARG`, NAME becomes $0 and ARG becomes $1.
log_proc = subprocess.Popen(
    ["bash", "-c", bash_script, "ram_logger", LOG_PATH],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)
# Persist the PID so a later run of this cell (or the user) can stop the logger.
with open(PID_PATH, "w") as f:
    f.write(str(log_proc.pid))

print("Background RAM logging started")
print(f"Log file: {LOG_PATH}")
print(f"Logger PID: {log_proc.pid}")
print(f"Stop later with: !kill -9 $(cat {PID_PATH})")
# Short pause before the cell returns.
time.sleep(2)


In [None]:
# === Block 2: Training/eval scikit-learn Decision Tree ===
import os
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# --- Timing helpers (UTC) ---
data = []  # list of dict(process, start, end)


def utcnow():
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)


def log_start(name: str) -> int:
    """Open a timing record for step *name* and return its index in ``data``."""
    data.append(dict(process=name, start=utcnow(), end=None))
    return len(data) - 1


def log_end(idx: int):
    """Stamp the end time of record *idx*; a repeated call overwrites it."""
    data[idx].update(end=utcnow())


def as_timing_df():
    """Return all timing records as a DataFrame with a ``duration_secs`` column."""
    if len(data) == 0:
        # Same column layout as the populated case.
        return pd.DataFrame(columns=["process", "start", "end", "duration_secs"])
    frame = pd.DataFrame(data)
    elapsed = frame["end"] - frame["start"]
    frame["duration_secs"] = elapsed.dt.total_seconds()
    return frame


print("=== Workflow start (UTC) ===")

i_load_tr = log_start("load_train")
# 1. Load raw data (only finalized columns) -------------------------------
# onpromotion is read as text; it is mapped to integer codes later on.
dtypes = {'id':'int64', 'item_nbr':'int32', 'store_nbr':'int8', 'onpromotion':'str'}
train = pd.read_csv(
    '/content/drive/MyDrive/Dissertation/all files/train.csv',
    usecols=['id','date','store_nbr','item_nbr','unit_sales','onpromotion'],
    parse_dates=['date'],
    dtype=dtypes,
)

log_end(i_load_tr)

# Store attributes, joined on store_nbr.
stores = pd.read_csv(
    '/content/drive/MyDrive/Dissertation/all files/stores.csv',
    usecols=['store_nbr','city','state','type','cluster'],
)

# Item attributes, joined on item_nbr.
items = pd.read_csv(
    '/content/drive/MyDrive/Dissertation/all files/items.csv',
    usecols=['item_nbr','family','class','perishable'],
)

# Holiday/event calendar; `transferred` is kept as text and filtered later.
hol = pd.read_csv(
    '/content/drive/MyDrive/Dissertation/all files/holidays_events.csv',
    usecols=['date','locale','locale_name','type','transferred'],
    parse_dates=['date'],
    dtype={'transferred':'str'},
)

# Second log_end on the same record: it overwrites the end time stamped above,
# so "load_train" ends up covering the lookup tables as well.
log_end(i_load_tr)

# --- Paths & schema ---
base_path  = "/content/drive/MyDrive/Dissertation/final"
test_path  = f"{base_path}/df_test.csv"

# Load test
i_load_te = log_start("load_test")
# Fail with an explicit error rather than an assert, which is stripped under -O.
if not os.path.exists(test_path):
    raise FileNotFoundError(f"Test file not found: {test_path}")
print("Loading test data...")
DT_test = pd.read_csv(test_path, low_memory=False)
print(f"Test data loaded: {len(DT_test)} rows, {DT_test.shape[1]} columns")
log_end(i_load_te)



print("start join")
i_join = log_start("join")

# 2. Prepare holiday/event flags ------------------------------------------
# Drop transferred rows and compensatory work days, then derive two label
# columns from `type`: on_hol ('Holiday' or NaN) and on_evt ('Event' or NaN).
hol = (
    hol[~hol.transferred.str.lower().eq('true')]       # drop transferred
       .query("type!='Work Day'")                       # drop compensatory work days
       .assign(
           on_hol=lambda df: df.type.map({
               'Holiday':'Holiday','Bridge':'Holiday','Additional':'Holiday'
           }),
           on_evt=lambda df: df.type.map({'Event':'Event'})
       )
)

# Each lookup table is collapsed to ONE row per merge key. Without this, a
# date that carries several calendar rows (e.g. a holiday and an event) would
# multiply the matching training rows in the left merges that follow.
# groupby(...).first() keeps the first non-null flag per column.
flag_cols = ['on_hol', 'on_evt']

locL = (
    hol.query("locale=='Local'")
       .rename(columns={'locale_name':'city'})
       .groupby(['date','city'], as_index=False)[flag_cols]
       .first()
)
locR = (
    hol.query("locale=='Regional'")
       .rename(columns={'locale_name':'state'})
       .groupby(['date','state'], as_index=False)[flag_cols]
       .first()
)
locN = (
    hol.query("locale=='National'")
       .groupby(['date'], as_index=False)[flag_cols]
       .first()
)



# 3. Merge into single DataFrame -----------------------------------------
# The three holiday tables all carry on_hol/on_evt. Merged under the same
# names, pandas suffixes the colliding columns (_x/_y) and only the national
# flags keep the plain names, so local and regional holidays are lost. Each
# scope therefore gets its own column names, and the flags are combined
# afterwards (local first, then regional, then national).
df = (
    train
      .merge(stores, on='store_nbr', how='left')
      .merge(items,  on='item_nbr',  how='left')
      .merge(locL.rename(columns={'on_hol':'on_hol_local', 'on_evt':'on_evt_local'}),
             on=['date','city'],  how='left')
      .merge(locR.rename(columns={'on_hol':'on_hol_regional', 'on_evt':'on_evt_regional'}),
             on=['date','state'], how='left')
      .merge(locN.rename(columns={'on_hol':'on_hol_national', 'on_evt':'on_evt_national'}),
             on='date',           how='left')
)

for flag in ('on_hol', 'on_evt'):
    df[flag] = (
        df[f'{flag}_local']
          .combine_first(df[f'{flag}_regional'])
          .combine_first(df[f'{flag}_national'])
    )


# 4. Keep only final features --------------------------------------------
# The per-scope helper columns are dropped by this selection.
df = df[[
    'id','unit_sales','date','store_nbr','item_nbr',
    'city','state','type','cluster','family','class','perishable',
    'onpromotion','on_hol','on_evt'
]].copy()


# 5. Basic transformations -----------------------------------------------

# Negative unit_sales are floored at zero.
df['unit_sales'] = df['unit_sales'].clip(lower=0)


# calendar features
df['month']       = df['date'].dt.month
# NOTE(review): flags only the 15th and the 31st; if this is meant to mark a
# month-end payday, months shorter than 31 days never match - confirm intent.
df['wage']        = df['date'].dt.day.isin([15, 31]).astype(int)
df['is_weekend']  = df['date'].dt.dayofweek.ge(5).astype(int)


# promotions & perishability
# onpromotion: 'False' -> 0, 'True' -> 1, anything else (incl. missing) -> 2
promo_codes = {'False': 0, 'True': 1}
df['onpromotion'] = df['onpromotion'].map(promo_codes).fillna(2).astype(int)
# perishable: 0 -> 1.0, 1 -> 1.25, anything else -> 2
perishable_weights = {0: 1.0, 1: 1.25}
df['perishable']  = df['perishable'].map(perishable_weights).fillna(2)


# holiday/event flags: 1 when the label is present, -1 otherwise
df['on_hol'] = df['on_hol'].map({'Holiday': 1}).fillna(-1).astype(int)
df['on_evt'] = df['on_evt'].map({'Event':   1}).fillna(-1).astype(int)


# Re-coerce 'date' to datetime (unparseable values become NaT)
df['date'] = pd.to_datetime(df['date'], errors='coerce')


# Weekday name of each date
df['day'] = df['date'].dt.day_name()



# 7. One-hot encode categoricals (inside the model pipeline) ---------------
# The encoding is done by a ColumnTransformer step of the model instead of
# materialising a dense one-hot frame here:
#   * OneHotEncoder(sparse=...) is no longer accepted by current scikit-learn;
#     the default (sparse) output is used instead.
#   * The raw categorical columns stay in `df`, so the column selection below
#     no longer fails on columns that an in-place encoding had dropped.
#   * The tree never receives raw string columns.
# Consequence for the timings: the encoder is now fitted during "train_model"
# rather than during "join".
categorical_cols = [
    'store_nbr','item_nbr','city','state',
    'type','cluster','family','class', 'day'
]


# Create df_final with only the required columns
df_final = df[[
    'id',
    'unit_sales',
    'date',
    'perishable',
    'onpromotion',
    'on_hol',
    'on_evt',
    'month',
    'wage',
    'is_weekend',
    'day',
    'store_nbr',
    'item_nbr',
    'city',
    'state',
    'type',
    'cluster',
    'family',
    'class'
]]


# Model inputs: numeric columns are passed through, categorical_cols are
# one-hot encoded by the pipeline.
features = [
  "perishable","onpromotion","on_hol","month","is_weekend","day",
  "store_nbr","item_nbr","city","state","type","cluster","family","class"
]
target = "unit_sales"
log_end(i_join)

# --- Prepare data ---
i_prep = log_start("prepare_data")
print("Preparing data...")


X_train = df_final[features].copy()
y_train = df_final[target].copy()
# NOTE(review): the evaluation set is the training data itself; DT_test is
# loaded earlier but not used here - confirm whether that is intended.
X_test  = df_final[features].copy()
y_test  = df_final[target].copy()


log_end(i_prep)


# --- Model ---
# `model` keeps the fit/predict interface used by the following cells.
model = Pipeline(steps=[
    ("encode", ColumnTransformer(
        transformers=[
            ("ohe",
             OneHotEncoder(handle_unknown='ignore', dtype=np.uint8),
             categorical_cols),
        ],
        remainder='passthrough',   # numeric features are forwarded unchanged
    )),
    ("tree", DecisionTreeRegressor(
        max_depth=7,
        min_samples_split=1000
    )),
])


# --- Train ---
# The fit is timed twice: as the "train_model" record and as a printed
# wall-clock figure.
i_train = log_start("train_model")
print("Training model...")
t0 = utcnow()
model.fit(X_train, y_train)
t1 = utcnow()
train_secs = (t1 - t0).total_seconds()
print(f"Model trained in {train_secs:.1f} sec")
log_end(i_train)


# Predict
i_pred = log_start("predict")
print("Predicting...")
t0 = utcnow()
y_pred = model.predict(X_test)
t1 = utcnow()
predict_secs = (t1 - t0).total_seconds()
print(f"Prediction done in {predict_secs:.1f} sec")
log_end(i_pred)



# Idle pause so the background logger can record samples before evaluation.
print("Buffering 10s before evaluation to capture RAM samples")
time.sleep(10)


# Evaluate
i_eval = log_start("evaluate")
print("Evaluating...")
# Score only rows where both the target and the prediction are present. The
# mask is converted to a plain NumPy array so it indexes the target values
# and the prediction array positionally and consistently.
valid = (y_test.notna() & pd.notna(y_pred)).to_numpy()
y_true = y_test.to_numpy()[valid]
y_hat  = np.asarray(y_pred)[valid]
if len(y_true) == 0:
    print("No valid rows to evaluate (missing target).")
    mse = rmse = mae = r2 = float("nan")
else:
    mse  = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    mae  = mean_absolute_error(y_true, y_hat)
    r2   = r2_score(y_true, y_hat)


# The pause is inside the "evaluate" record on purpose, so the step lasts long
# enough for the background logger to record samples.
print("Buffering 10s after evaluation to capture RAM samples")
time.sleep(10)
log_end(i_eval)


# Printed metrics for manual note-taking (two decimals each)
print("\n=== EVALUATION RESULTS ===")
for metric_label, metric_value in (("MSE ", mse), ("RMSE", rmse), ("MAE ", mae)):
    print(f"{metric_label}: {metric_value:.2f}")


# One row per timed step, without the index column
print("\n=== STEP TIMINGS (pretty) ===")
timing_table = as_timing_df()
print(timing_table.to_string(index=False))
print("\n=== Workflow end ===")



In [None]:
# === Block 3: Export timing to CSV (UTC ISO-8601) ===
# Produces one row per (process, phase), where phase in {"start","end"}.
# Columns: process, phase, time

import pandas as pd

# Abort early when Block 2 has not run (no `data` global) or recorded nothing.
if not globals().get("data"):
    raise RuntimeError("No timing data found. Run Block 2 first.")

def rows_from_step(step):
    """Expand one timing record into its "start" and "end" export rows.

    Each row holds the process name, the phase, and that phase's timestamp
    formatted as ``YYYY-MM-DDTHH:MM:SSZ`` (sub-second precision is dropped).
    """
    iso_fmt = "%Y-%m-%dT%H:%M:%SZ"
    return [
        {"process": step["process"], "phase": phase, "time": step[phase].strftime(iso_fmt)}
        for phase in ("start", "end")
    ]

# Only steps that have both a start and an end are exported.
save_rows = [
    row
    for step in data
    if step["start"] is not None and step["end"] is not None
    for row in rows_from_step(step)
]

df_out = pd.DataFrame(save_rows)
df_out.to_csv("/content/data.csv", index=False)
print("Saved timing data to /content/data.csv")


In [None]:
# === Block 4: RAM plots actual + normalized time ===
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# ---- Inputs ----
# Both files are produced by earlier cells and must exist before this cell runs.
TIMING_CSV = "/content/data.csv"               # written by Block 3
LOG_PATH   = "/content/process_ram_usage.txt"  # written by Block 1


# ---- Output base (per-run folder) ----
# Every run gets its own sub-folder below RUNS_BASE (created further down);
# the base folder itself is created here if it is missing.
RUNS_BASE = "/content/drive/MyDrive/Dissertation/runs"
os.makedirs(RUNS_BASE, exist_ok=True)


# ---- Fixed segment colors (blue, green, yellow/gold, red) ----
COLOR_LOADING, COLOR_JOIN, COLOR_TRAINING, COLOR_EVAL = (
    "#1f77b4",
    "#2ca02c",
    "#ffd700",
    "#d62728",
)


# ---- Segment layout for normalized plot ----
# SEG_ORDER fixes the left-to-right order; widths and colors are keyed by it.
SEG_ORDER = ["loading", "join", "training", "evaluate"]
SEG_WIDTHS = dict(zip(SEG_ORDER, (0.20, 0.20, 0.40, 0.20)))
SEG_COLORS = dict(zip(SEG_ORDER, (COLOR_LOADING, COLOR_JOIN, COLOR_TRAINING, COLOR_EVAL)))


# 1) Load workflow events (expects: process, phase, time)
events = pd.read_csv(TIMING_CSV)
events["time"] = pd.to_datetime(events["time"], utc=True)


# The earliest "start" event anchors the run.
is_start = events["phase"] == "start"
first_start = events.loc[is_start, "time"].min()
if pd.isna(first_start):
    raise RuntimeError("No start event found in timing CSV.")


# Stable run_id per run, derived from the first start time
run_id = "run_" + first_start.strftime("%Y%m%dT%H%M%SZ")
run_dir = os.path.join(RUNS_BASE, run_id)
os.makedirs(run_dir, exist_ok=True)


# Milestones required by segments: the run start plus the end of each mapped
# step (steps missing from the CSV are skipped).
label_map = {
    "load_test": "after_load_test",
    "join": "after_join",
    "train_model": "after_train",
    "evaluate": "after_eval",
}
end_events = events[events["phase"] == "end"]
markers = [{"label": "before_load", "time": first_start}]
for proc, lab in label_map.items():
    end_times = end_events.loc[end_events["process"] == proc, "time"]
    if len(end_times) > 0:
        markers.append({"label": lab, "time": pd.to_datetime(end_times.iloc[0], utc=True)})
markers_df = pd.DataFrame(markers).sort_values("time").reset_index(drop=True)


# 2) Parse RAM log (SYSTEM_USED bytes to GB)
def parse_ram_log(path=None):
    """Parse the RAM log into a time-sorted DataFrame of used memory.

    Every ``SYSTEM_USED <bytes>`` line is paired with the most recent
    ``TS <timestamp>`` line before it. Lines that cannot be parsed are skipped.

    Args:
        path: Log file to read. Defaults to the module-level ``LOG_PATH``,
            looked up when the function is called.

    Returns:
        DataFrame with columns ``time`` (UTC timestamps) and ``ram_gb``
        (bytes / 1024**3), sorted by ``time``.

    Raises:
        FileNotFoundError: *path* does not exist.
        RuntimeError: the log holds no usable sample.
    """
    if path is None:
        path = LOG_PATH
    if not os.path.exists(path):
        raise FileNotFoundError(f"RAM log not found: {path}")
    ts = None
    rows = []
    with open(path, "r") as f:
        for line in f:
            s = line.strip()
            if s.startswith("TS "):
                try:
                    ts = pd.to_datetime(s.split()[1], utc=True)
                except (IndexError, ValueError):
                    # Unreadable timestamp (e.g. a half-written line): forget it
                    # so the next sample is not attached to a stale time.
                    ts = None
            elif s.startswith("SYSTEM_USED"):
                try:
                    used_bytes = int(s.split()[1])
                except (IndexError, ValueError):
                    continue
                if ts is not None:
                    rows.append({"time": ts, "ram_gb": used_bytes / (1024**3)})
    # Check before building the frame: an empty frame has no "time" column,
    # so sorting it would fail with a KeyError instead of this message.
    if not rows:
        raise RuntimeError("Parsed RAM log is empty. Has the logger run long enough?")
    return pd.DataFrame(rows).sort_values("time").reset_index(drop=True)


ram = parse_ram_log()
ram["time"] = pd.to_datetime(ram["time"], utc=True)


# 3) Align milestones to nearest sample at-or-before
# Each milestone takes the time and RAM value of the last logger sample taken
# at or before it; milestones that precede every sample are dropped.
markers_for_merge = markers_df.rename(columns={"time": "event_time"}).sort_values("event_time")
aligned_df = pd.merge_asof(
    markers_for_merge,
    ram.sort_values("time"),
    left_on="event_time",
    right_on="time",
    direction="backward",
)
aligned_df = aligned_df.rename(columns={"time": "plot_time"})
aligned_df = aligned_df[["label", "event_time", "plot_time", "ram_gb"]]
aligned_df = aligned_df.dropna(subset=["plot_time"]).reset_index(drop=True)
# label -> aligned sample time / RAM value
m_time = dict(zip(aligned_df["label"], aligned_df["plot_time"]))
m_y    = dict(zip(aligned_df["label"], aligned_df["ram_gb"]))


# 4) Consolidated segments (4 with join): (name, start marker, end marker)
SEGMENTS = [
    ("loading",  "before_load",    "after_load_test"),
    ("join",     "after_load_test","after_join"),
    ("training", "after_join",     "after_train"),
    ("evaluate", "after_train",    "after_eval"),
]


# Helpers
def interp_ram_at(t, times, values):
    """Linearly interpolate *values* at time *t*.

    *times* and *values* are parallel Series, with *times* ascending. Outside
    the sampled range the first/last value is returned unchanged.
    """
    first_time, last_time = times.iloc[0], times.iloc[-1]
    if t <= first_time:
        return float(values.iloc[0])
    if t >= last_time:
        return float(values.iloc[-1])
    # upper is the first sample at or after t; the one before it is strictly
    # earlier, so the bracketing interval never has zero length.
    upper = times.searchsorted(t, side="left")
    lower = upper - 1
    t_lo, t_hi = times.iloc[lower], times.iloc[upper]
    y_lo, y_hi = values.iloc[lower], values.iloc[upper]
    weight = (t - t_lo) / (t_hi - t_lo)
    return float(y_lo + weight * (y_hi - y_lo))


def build_segment_series(t_start, t_end, ram_df):
    """Return the RAM samples of ``[t_start, t_end]`` with interpolated end points.

    The result has columns ``time`` and ``ram_gb``: one interpolated row at
    *t_start*, every sample strictly between the two bounds, and one
    interpolated row at *t_end*. An empty frame with those columns is returned
    when *t_end* is not after *t_start*.
    """
    ordered = ram_df.sort_values("time").reset_index(drop=True)
    if t_end <= t_start:
        return pd.DataFrame(columns=["time","ram_gb"])
    sample_times, sample_values = ordered["time"], ordered["ram_gb"]
    within = sample_times.gt(t_start) & sample_times.lt(t_end)
    interior = ordered.loc[within, ["time","ram_gb"]].copy()
    first_row = pd.DataFrame(
        [{"time": t_start, "ram_gb": interp_ram_at(t_start, sample_times, sample_values)}]
    )
    last_row = pd.DataFrame(
        [{"time": t_end, "ram_gb": interp_ram_at(t_end, sample_times, sample_values)}]
    )
    stitched = pd.concat([first_row, interior, last_row], ignore_index=True)
    return stitched.sort_values("time").reset_index(drop=True)


def segment_stats(df):
    """Summarise one RAM series (columns ``time`` and ``ram_gb``).

    Returns a dict with:
        mean: time-weighted mean of ``ram_gb`` (trapezoidal area / duration),
            NaN when there are fewer than two rows or the duration is zero.
        peak: maximum ``ram_gb`` (NaN for an empty frame).
        duration_secs: seconds between the first and the last row.
        samples: number of rows.
    """
    if len(df) < 2:
        return dict(mean=np.nan, peak=df["ram_gb"].max() if not df.empty else np.nan,
                    duration_secs=0.0, samples=len(df))
    # Seconds relative to the first row. Going through timedeltas keeps this
    # correct for any datetime resolution, unlike casting to int64 and
    # assuming nanoseconds.
    t = (df["time"] - df["time"].iloc[0]).dt.total_seconds().to_numpy()
    y = df["ram_gb"].to_numpy(dtype=float)
    duration = float(t[-1] - t[0])
    # np.trapezoid only exists in NumPy >= 2.0; older releases provide np.trapz.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    area = integrate(y, t)
    mean = float(area / duration) if duration > 0 else np.nan
    peak = float(np.max(y))
    return dict(mean=mean, peak=peak, duration_secs=duration, samples=len(df))


# 5) Build series & stats
# A segment is built only when both of its markers were aligned and its end
# lies strictly after its start.
seg_series, seg_stats = {}, {}
for seg_name, start_lab, end_lab in SEGMENTS:
    if start_lab not in m_time or end_lab not in m_time:
        continue
    t0, t1 = m_time[start_lab], m_time[end_lab]
    if not t1 > t0:
        continue
    s = build_segment_series(t0, t1, ram)
    seg_series[seg_name] = s
    seg_stats[seg_name] = segment_stats(s)


# Overall window: first to last aligned milestone; with fewer than two
# milestones fall back to (first start event, last RAM sample).
milestone_labels = ("before_load", "after_load_test", "after_join", "after_train", "after_eval")
avail_times = [m_time[l] for l in milestone_labels if l in m_time]
if len(avail_times) < 2:
    run_start = first_start
    run_end   = ram["time"].iloc[-1]
else:
    run_start, run_end = min(avail_times), max(avail_times)
run_series = build_segment_series(run_start, run_end, ram)
overall_stats = segment_stats(run_series)


# 6) Plot 1 : Actual time (UTC)
fig1, ax1 = plt.subplots(figsize=(14, 6))
ax1.plot(ram["time"], ram["ram_gb"], linewidth=2, alpha=0.95, zorder=3)
y_min = ram["ram_gb"].min()
y_max = ram["ram_gb"].max()
# Segment captions sit 5% above the lowest RAM value.
y_text = y_min + 0.05 * (y_max - y_min)
segment_captions = {"loading": "Loading", "join": "Join", "training": "Training"}
for seg_name, start_lab, end_lab in SEGMENTS:
    if seg_name not in seg_series:
        continue
    df = seg_series[seg_name]
    seg_color = SEG_COLORS[seg_name]
    # Shaded area under the curve for this segment
    ax1.fill_between(df["time"], 0, df["ram_gb"], color=seg_color, alpha=0.28, zorder=1)
    # Dots on the segment's first and last point
    x0, x1 = df["time"].iloc[0], df["time"].iloc[-1]
    y0, y1 = df["ram_gb"].iloc[0], df["ram_gb"].iloc[-1]
    ax1.scatter([x0, x1], [y0, y1], s=80, zorder=4, color=seg_color, edgecolors="black")
    # Any name outside the three listed ones is captioned "Evaluate".
    label = segment_captions.get(seg_name, "Evaluate")
    x_mid = x0 + (x1 - x0) / 2
    ax1.text(
        x_mid, y_text, label,
        ha="center", va="center",
        fontsize=10, fontweight="bold",
        bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.6),
        zorder=5,
    )
ax1.set_title("Memory Profile: SCIKIT LEARN (DecisionTreeRegressor) — Join of Tables in Memory (Full Dataset)", fontweight="bold")
ax1.set_xlabel("Time (UTC)")
ax1.set_ylabel("Memory Usage (GB)")
ax1.grid(True, alpha=0.3, linestyle="--")
ax1.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M:%S", tz=mdates.UTC))
fig1.autofmt_xdate()
plt.tight_layout()
plt.show()


# 7) Plot 2 : Normalized time
# Each segment is assigned a fixed slice of the [0, 1] axis, laid out in
# SEG_ORDER with the widths from SEG_WIDTHS.
seg_spans, cursor = {}, 0.0
for name in SEG_ORDER:
    w = SEG_WIDTHS[name]
    seg_spans[name] = (cursor, cursor + w)
    cursor += w


NORM_COLUMNS = ["run_id", "segment", "x_segment_norm", "x_global_norm", "ram_gb", "time_utc"]
norm_rows = []
for seg_name in SEG_ORDER:
    if seg_name not in seg_series:
        continue
    df = seg_series[seg_name]
    t0, t1 = df["time"].iloc[0], df["time"].iloc[-1]
    if t1 <= t0:
        continue
    # Position inside the segment as a fraction of its duration. Dividing
    # timedeltas is independent of the datetime resolution of the column.
    x_seg = ((df["time"] - t0) / (t1 - t0)).to_numpy(dtype=float)  # [0,1] within segment
    span0, span1 = seg_spans[seg_name]
    x_global = span0 + x_seg * (span1 - span0)       # [0,1] across all segments
    for xs, xg, y, tt in zip(x_seg, x_global, df["ram_gb"].to_numpy(), df["time"]):
        norm_rows.append({
            "run_id": run_id, "segment": seg_name,
            "x_segment_norm": float(xs), "x_global_norm": float(xg),
            "ram_gb": float(y), "time_utc": tt.isoformat()
        })
# Explicit columns keep the frame (and the CSV header) well-formed when no
# segment produced a row; without them sort_values raises KeyError. The stable
# sort keeps segment order for the boundary points that neighbouring segments
# share.
norm_df = (
    pd.DataFrame(norm_rows, columns=NORM_COLUMNS)
      .sort_values(["x_global_norm"], kind="stable")
      .reset_index(drop=True)
)


# Save minimal per-run data for super plot
norm_csv_path = os.path.join(run_dir, "ram_profile_normalized.csv")
norm_df.to_csv(norm_csv_path, index=False)


# Plot 2: Normalized RAM
fig2, ax2 = plt.subplots(figsize=(14, 6))
if not norm_df.empty:
    ax2.plot(norm_df["x_global_norm"], norm_df["ram_gb"], linewidth=2, alpha=0.95, zorder=3)
    # Caption height is the same for every segment: 5% above the lowest value.
    y_min2, y_max2 = norm_df["ram_gb"].min(), norm_df["ram_gb"].max()
    y_text2 = y_min2 + 0.05 * (y_max2 - y_min2)
    segment_captions = {"loading": "Loading", "join": "Join", "training": "Training"}
    for seg_name in SEG_ORDER:
        span = seg_spans[seg_name]
        df_seg = norm_df[norm_df["segment"] == seg_name]
        if df_seg.empty:
            continue
        seg_color = SEG_COLORS[seg_name]
        ax2.fill_between(df_seg["x_global_norm"], 0, df_seg["ram_gb"],
                         color=seg_color, alpha=0.28, zorder=1)
        x0, x1 = span
        # RAM value of the segment's point closest to each span edge
        x_positions = df_seg["x_global_norm"]
        y0 = df_seg.iloc[(x_positions - x0).abs().argmin()]["ram_gb"]
        y1 = df_seg.iloc[(x_positions - x1).abs().argmin()]["ram_gb"]
        ax2.scatter([x0, x1], [y0, y1], s=80, zorder=4, color=seg_color, edgecolors="black")
        # Any name outside the three listed ones is captioned "Evaluate".
        label = segment_captions.get(seg_name, "Evaluate")
        x_mid = x0 + (x1 - x0) / 2
        ax2.text(
            x_mid, y_text2, label,
            ha="center", va="center",
            fontsize=10, fontweight="bold",
            bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.6),
            zorder=5,
        )
ax2.set_xlim(0.0, 1.0)
ax2.set_title("Memory Profile: SCIKIT LEARN (DecisionTreeRegressor) — Join of Tables in Memory (Full Dataset)", fontweight="bold")
ax2.set_xlabel("Normalized Time")
ax2.set_ylabel("Memory Usage (GB)")
ax2.grid(True, alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()


# 8) Print and save summary stats (overall + per segment)
def pretty_stats(name, st):
    """Print one summary line: right-aligned *name*, then mean, peak, duration and sample count from *st*."""
    parts = (
        f"mean={st['mean']:.3f} GB",
        f"peak={st['peak']:.3f} GB",
        f"duration={st['duration_secs']:.1f}s",
        f"samples={st['samples']}",
    )
    print(f"{name:>10}: " + " | ".join(parts))
print("\n=== RAM SUMMARY (time-weighted) ===")
pretty_stats("OVERALL", overall_stats)
# Any segment name outside the three listed ones is titled "EVALUATE".
segment_titles = {"loading": "LOADING", "join": "JOIN", "training": "TRAINING"}
for seg_name in SEG_ORDER:
    if seg_name not in seg_stats:
        continue
    title = segment_titles.get(seg_name, "EVALUATE")
    pretty_stats(title, seg_stats[seg_name])


def _summary_row(scope, st):
    """Build one run_summary.csv row for *scope* from a stats dict."""
    return {
        "run_id": run_id, "scope": scope,
        "mean_ram_gb": st["mean"], "peak_ram_gb": st["peak"],
        "duration_secs": st["duration_secs"], "samples": st["samples"]
    }


# One row for the whole run, then one per segment that produced stats.
summary_rows = [_summary_row("overall", overall_stats)]
for seg_name in SEG_ORDER:
    if seg_name in seg_stats:
        summary_rows.append(_summary_row(seg_name, seg_stats[seg_name]))
summary_csv_path = os.path.join(run_dir, "run_summary.csv")
pd.DataFrame(summary_rows).to_csv(summary_csv_path, index=False)
print(f"\n Saved normalized plot data: {norm_csv_path}")
print(f" Saved summary stats:       {summary_csv_path}")
