In [None]:
# This script creates a complete starter project folder with code files for the
# "Laptop Battery Life Predictor" end-to-end ML project.
#
# You'll get:
# - data logger (collector.py)
# - shared sensor utilities (sensors.py)
# - model training & evaluation (train.py, evaluate.py)
# - live prediction CLI (predict_now.py)
# - optional Streamlit app (app.py)
# - a quick simulator to generate synthetic data for testing (simulate_data.py)
# - requirements.txt and a README.md
# - A zipped archive you can download
#
# After this cell runs, you'll see a download link at the end.

import os, json, textwrap, zipfile, pathlib

# Output location for the generated project plus the sub-folders the generated
# scripts read from / write to (logged data, saved models, evaluation reports).
root = "/mnt/data/battery_ml"
for project_dir in (root, f"{root}/data", f"{root}/models", f"{root}/reports"):
    os.makedirs(project_dir, exist_ok=True)

# sensors.py: shared psutil-based sampling helpers, the optional brightness reader, and the
# FEATURE_COLUMNS list imported by the training, evaluation and prediction scripts.
# The text lives in a raw string so backslashes reach the generated file unchanged.
sensors_py = r'''#!/usr/bin/env python3
"""
Shared utilities to read system sensors and build a feature row.
"""
import time
import psutil
from typing import Optional, Dict, Any, Tuple
try:
    import screen_brightness_control as sbc  # optional; supports Windows/Linux
except Exception:  # pragma: no cover
    sbc = None


def _get_brightness() -> Optional[float]:
    """Return screen brightness in [0, 100] if available, else None."""
    try:
        if sbc:
            val = sbc.get_brightness()
            # sbc.get_brightness() may return list of displays on some systems
            if isinstance(val, list):
                # Skip displays that could not be read so one unknown entry
                # does not discard the readings of the others.
                readings = [v for v in val if v is not None]
                if not readings:
                    return None
                return float(sum(readings) / len(readings))
            return float(val)
    except Exception:
        # Best effort: brightness is optional, any backend failure means "unknown".
        pass
    return None


def _current_username() -> Optional[str]:
    """Return the owner of this process as psutil reports it, or None if unavailable."""
    try:
        return psutil.Process().username()
    except (psutil.Error, OSError):
        return None


def _get_active_apps_count(threshold_cpu: float = 0.5, threshold_mem_mb: float = 100.0) -> int:
    """
    Approximate "apps open" by counting processes owned by the current user that are
    either using noticeable CPU or memory. This is cross-platform and avoids GUI APIs.

    Args:
        threshold_cpu: minimum per-process CPU percent to count a process.
        threshold_mem_mb: minimum resident memory (MiB) to count a process.

    Returns:
        Number of matching processes. If the current user cannot be determined,
        processes of all users are considered.
    """
    me = _current_username()
    count = 0
    for proc in psutil.process_iter(['cpu_percent', 'memory_info', 'username']):
        try:
            info = proc.info
            # Compare against psutil's own username format so the check works the
            # same way on every platform.
            if me is not None and info.get('username') != me:
                continue
            cpu = info.get('cpu_percent') or 0.0
            mem_info = info.get('memory_info')
            mem = mem_info.rss / (1024 * 1024) if mem_info else 0.0
            if cpu >= threshold_cpu or mem >= threshold_mem_mb:
                count += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue
    return int(count)


FEATURE_COLUMNS = [
    'battery_percent',
    'cpu_percent',
    'cpu_freq_mhz',
    'ram_percent',
    'disk_read_kb_s',
    'disk_write_kb_s',
    'net_sent_kb_s',
    'net_recv_kb_s',
    'brightness',
    'screen_on',
    'active_apps_count'
]


def _read_io_counters() -> Dict[str, Optional[int]]:
    """Return cumulative disk/network byte counters; values are None when psutil has none."""
    disk = psutil.disk_io_counters()
    net = psutil.net_io_counters()
    return {
        'disk_read_bytes': disk.read_bytes if disk is not None else None,
        'disk_write_bytes': disk.write_bytes if disk is not None else None,
        'net_bytes_sent': net.bytes_sent if net is not None else None,
        'net_bytes_recv': net.bytes_recv if net is not None else None,
    }


def _rate_kb_s(current: Optional[float], previous: Optional[float],
               dt: Optional[float]) -> Optional[float]:
    """Return the KiB/s rate between two cumulative byte counts, or None if any input is unknown."""
    if current is None or previous is None or dt is None:
        return None
    # Clamp at zero: counters can reset or wrap, which would give a negative delta.
    return max(0.0, (current - previous) / 1024.0 / dt)


def sample_features(prev_state: Optional[Dict[str, Any]] = None, cpu_interval: float = 1.0
                   ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Collect a single row of features and target (if available) from the current system.

    Args:
        prev_state: second element returned by the previous call, or None on the first
            call. It carries the timestamp and cumulative I/O counters needed to turn
            counters into per-second rates.
        cpu_interval: seconds psutil.cpu_percent() blocks while measuring CPU load.

    Returns:
        (row_dict, new_prev_state). Rate features are None on the first call or when a
        counter is unavailable. 'screen_on' is None when brightness is unknown.
        The row includes the 'hours_left' target when known.
    """
    now = time.time()

    # Battery
    battery = None
    try:
        battery = psutil.sensors_battery()
    except Exception:
        battery = None

    plugged = getattr(battery, "power_plugged", None) if battery else None
    percent = getattr(battery, "percent", None) if battery else None
    secsleft = getattr(battery, "secsleft", None) if battery else None

    # CPU / RAM
    cpu_percent = psutil.cpu_percent(interval=cpu_interval)
    try:
        cpu_freq = psutil.cpu_freq()
        cpu_freq_mhz = float(cpu_freq.current) if cpu_freq else None
    except Exception:
        cpu_freq_mhz = None
    ram_percent = float(psutil.virtual_memory().percent)

    # IO & Network (rates per second since prev_state)
    counters = _read_io_counters()
    previous = prev_state if prev_state is not None else {}
    dt = max(1e-9, now - prev_state['t']) if prev_state is not None else None
    disk_read_kb_s = _rate_kb_s(counters['disk_read_bytes'], previous.get('disk_read_bytes'), dt)
    disk_write_kb_s = _rate_kb_s(counters['disk_write_bytes'], previous.get('disk_write_bytes'), dt)
    net_sent_kb_s = _rate_kb_s(counters['net_bytes_sent'], previous.get('net_bytes_sent'), dt)
    net_recv_kb_s = _rate_kb_s(counters['net_bytes_recv'], previous.get('net_bytes_recv'), dt)

    # Brightness & apps
    brightness = _get_brightness()
    # Unknown brightness must stay unknown (None) instead of being reported as "screen off".
    screen_on = None if brightness is None else float(float(brightness) > 0.0)
    active_apps_count = _get_active_apps_count()

    # Target
    hours_left = None
    try:
        if battery and not plugged and secsleft not in (psutil.POWER_TIME_UNKNOWN, psutil.POWER_TIME_UNLIMITED):
            hours_left = float(secsleft) / 3600.0
    except Exception:
        hours_left = None

    row = {
        'battery_percent': percent,
        'cpu_percent': cpu_percent,
        'cpu_freq_mhz': cpu_freq_mhz,
        'ram_percent': ram_percent,
        'disk_read_kb_s': disk_read_kb_s,
        'disk_write_kb_s': disk_write_kb_s,
        'net_sent_kb_s': net_sent_kb_s,
        'net_recv_kb_s': net_recv_kb_s,
        'brightness': brightness,
        'screen_on': screen_on,
        'active_apps_count': active_apps_count,
        'on_ac_power': plugged,
        'secsleft_raw': secsleft,
        'hours_left': hours_left,
    }

    new_prev = {'t': now}
    new_prev.update(counters)
    return row, new_prev
'''
# Persist the sensors module text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/sensors.py").write_text(sensors_py, encoding="utf-8")

# collector.py: continuous logger that appends one sampled row per interval to a CSV.
# The text lives in a raw string so the "\n" escape below is written to the generated
# file verbatim and only interpreted when collector.py itself runs.
collector_py = r'''#!/usr/bin/env python3
"""
Collects laptop telemetry every N seconds and appends to data/battery_log.csv
Stop with Ctrl+C. You can set env vars:
  BATTERY_LOG_PATH (default: data/battery_log.csv)
  BATTERY_LOG_INTERVAL (seconds, default: 60)
"""
import os
import time
import pandas as pd
from sensors import sample_features

LOG_PATH = os.environ.get("BATTERY_LOG_PATH", "data/battery_log.csv")
INTERVAL = float(os.environ.get("BATTERY_LOG_INTERVAL", "60"))

def ensure_parent(path: str):
    """Create the parent directory of `path` if it has one and it does not exist yet."""
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)

def main():
    """Sample features forever, appending one CSV row per interval until Ctrl+C."""
    ensure_parent(LOG_PATH)
    prev = None
    print(f"[collector] Logging to {LOG_PATH} every {INTERVAL:.0f}s. Press Ctrl+C to stop.")
    try:
        while True:
            started = time.monotonic()
            row, prev = sample_features(prev)
            row['timestamp'] = pd.Timestamp.now().isoformat()
            df = pd.DataFrame([row])
            # Write the header only when starting a new (or empty) log file.
            header = not os.path.exists(LOG_PATH) or os.path.getsize(LOG_PATH) == 0
            df.to_csv(LOG_PATH, mode='a', index=False, header=header)
            print(f"[collector] wrote row @ {row['timestamp']} | target hours_left={row['hours_left']}")
            # Sleep only for what is left of the interval. Measuring the time actually
            # spent sampling and writing keeps the logging period from drifting.
            elapsed = time.monotonic() - started
            time.sleep(max(0.0, INTERVAL - elapsed))
    except KeyboardInterrupt:
        print("\n[collector] Stopped. Goodbye!")

if __name__ == "__main__":
    main()
'''
# Persist the collector script text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/collector.py").write_text(collector_py, encoding="utf-8")

# train.py: trains three regressors on the logged CSV, keeps the one with the lowest
# test MAE, and saves it together with a metrics.json next to the model file.
train_py = r'''#!/usr/bin/env python3
"""
Train a regression model to predict remaining battery hours.
Usage:
  python train.py --csv data/battery_log.csv --out models/best_model.joblib
If you have no real data yet, first run simulate_data.py to generate data.
"""
import argparse
import json
import os
import pandas as pd
import numpy as np
from typing import List
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
from sensors import FEATURE_COLUMNS

def load_and_clean(csv_path: str) -> pd.DataFrame:
    """Read the log CSV and keep only on-battery rows with a plausible 'hours_left' target."""
    df = pd.read_csv(csv_path)
    # ensure timestamp is datetime if present
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Filter to rows with target present and meaningful
    df = df[df['hours_left'].notna()]
    # Use only when on battery (not plugged in)
    if 'on_ac_power' in df.columns:
        df = df[df['on_ac_power'] == False]  # noqa: E712

    # Some OSes sometimes report crazy secsleft; clamp target to a sensible range (0 - 15 hours)
    df = df[(df['hours_left'] > 0) & (df['hours_left'] <= 15)]
    df = df.drop_duplicates().reset_index(drop=True)
    return df

def build_pipelines(numeric_features: List[str]):
    """Return {model name: Pipeline} where each pipeline imputes, scales, then fits one regressor."""
    pre = ColumnTransformer([
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_features),
    ])

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(
            n_estimators=300, max_depth=None, min_samples_leaf=2, n_jobs=-1, random_state=42
        ),
        "GradientBoosting": GradientBoostingRegressor(random_state=42)
    }

    pipelines = {name: Pipeline([("pre", pre), ("model", model)]) for name, model in models.items()}
    return pipelines

def main():
    """Parse CLI args, train every candidate model, and save the best one plus its metrics."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default="data/battery_log.csv")
    ap.add_argument("--out", default="models/best_model.joblib")
    args = ap.parse_args()

    if not os.path.exists(args.csv):
        raise SystemExit(f"Data file not found: {args.csv}. Run collector.py or simulate_data.py first.")

    df = load_and_clean(args.csv)

    # An 80/20 split needs at least one row on each side; fail with a clear message
    # instead of an obscure scikit-learn error.
    if len(df) < 2:
        raise SystemExit(
            f"[train] Only {len(df)} usable on-battery rows in {args.csv}; "
            "need at least 2 to make a train/test split."
        )
    if len(df) < 80:
        print(f"[train] WARNING: only {len(df)} usable rows. More data will improve accuracy.")

    # time-aware split if timestamp exists; else random split
    use_time_split = 'timestamp' in df.columns and df['timestamp'].notna().any()
    if use_time_split:
        df = df.sort_values('timestamp')
    X = df[FEATURE_COLUMNS]
    y = df['hours_left']

    if use_time_split:
        # use last 20% as test
        split_idx = int(0.8 * len(df))
        X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
        y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipelines = build_pipelines(FEATURE_COLUMNS)
    results = []

    for name, pipe in pipelines.items():
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        mae = float(mean_absolute_error(y_test, preds))
        # Take the square root ourselves: the `squared=` keyword of mean_squared_error
        # is not accepted by current scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
        r2 = float(r2_score(y_test, preds))
        results.append({"name": name, "mae": mae, "rmse": rmse, "r2": r2})
        print(f"[train] {name:16s} | MAE={mae:.3f} | RMSE={rmse:.3f} | R2={r2:.3f}")

    # pick best by MAE
    results_sorted = sorted(results, key=lambda d: d["mae"])
    best_name = results_sorted[0]["name"]
    best_pipe = pipelines[best_name]
    # --out may be a bare filename, in which case there is no directory to create.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(best_pipe, args.out)

    # Save metrics
    metrics_path = os.path.join(out_dir, "metrics.json")
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump({"results": results_sorted, "n_train": len(X_train), "n_test": len(X_test)}, f, indent=2)

    print(f"[train] Saved best model: {args.out} ({best_name})")
    print(f"[train] Metrics saved to: {metrics_path}")

if __name__ == "__main__":
    main()
'''
# Persist the training script text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/train.py").write_text(train_py, encoding="utf-8")

# evaluate.py: scores a saved model on the last 20% of the log and saves a
# permutation-importance bar chart.
evaluate_py = r'''#!/usr/bin/env python3
"""
Evaluate a saved model and produce a feature importance plot via permutation importance.
Usage:
  python evaluate.py --csv data/battery_log.csv --model models/best_model.joblib --out reports/feature_importance.png
"""
import argparse
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sensors import FEATURE_COLUMNS
import matplotlib.pyplot as plt

def load_data(csv_path: str) -> pd.DataFrame:
    """Read the log CSV, sort by time when possible, and keep plausible on-battery rows."""
    df = pd.read_csv(csv_path)
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        df = df.sort_values('timestamp')
    df = df[df['hours_left'].notna()]
    if 'on_ac_power' in df.columns:
        df = df[df['on_ac_power'] == False]  # noqa: E712
    df = df[(df['hours_left'] > 0) & (df['hours_left'] <= 15)]
    # Same de-duplication as train.py so the "last 20%" slice lines up with its test split.
    df = df.drop_duplicates()
    return df.reset_index(drop=True)

def main():
    """Parse CLI args, report test metrics, and save the feature-importance plot."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default="data/battery_log.csv")
    ap.add_argument("--model", default="models/best_model.joblib")
    ap.add_argument("--out", default="reports/feature_importance.png")
    args = ap.parse_args()

    if not os.path.exists(args.model):
        raise SystemExit(f"Model file not found: {args.model}. Train it first with train.py.")
    if not os.path.exists(args.csv):
        raise SystemExit(f"Data file not found: {args.csv}. Run collector.py or simulate_data.py first.")

    model = joblib.load(args.model)
    df = load_data(args.csv)
    if df.empty:
        raise SystemExit(f"[evaluate] No usable on-battery rows in {args.csv}; nothing to evaluate.")
    if len(df) < 30:
        print("[evaluate] WARNING: dataset is quite small; importance may be noisy.")

    # Use last 20% as test
    split_idx = int(0.8 * len(df))
    X_test = df[FEATURE_COLUMNS].iloc[split_idx:]
    y_test = df['hours_left'].iloc[split_idx:]

    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    # Take the square root ourselves: the `squared=` keyword of mean_squared_error
    # is not accepted by current scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)
    print(f"[evaluate] Test MAE={mae:.3f} | RMSE={rmse:.3f} | R2={r2:.3f}")

    # Permutation importance on test set
    result = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1)
    importances = result.importances_mean
    stds = result.importances_std

    # Most important feature first
    order = np.argsort(importances)[::-1]
    names = np.array(FEATURE_COLUMNS)[order]
    imps = importances[order]
    errs = stds[order]

    # Plot (no custom colors per instructions)
    plt.figure(figsize=(8, 5))
    plt.title("Permutation Feature Importance")
    plt.bar(range(len(names)), imps, yerr=errs)
    plt.xticks(range(len(names)), names, rotation=45, ha='right')
    plt.tight_layout()
    # --out may be a bare filename, in which case there is no directory to create.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(args.out, dpi=160)
    print(f"[evaluate] Saved feature importance plot to {args.out}")

if __name__ == "__main__":
    main()
'''
# Persist the evaluation script text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/evaluate.py").write_text(evaluate_py, encoding="utf-8")

# predict_now.py: live prediction using current sensors.
# The generated CLI loads the saved model, calls sample_features() twice (the first call
# only seeds the I/O counters so the second can report rates), and prints the predicted
# remaining hours together with the feature row that was used.
predict_now_py = r'''#!/usr/bin/env python3
"""
Loads the trained model and predicts remaining hours for your current system state.
Usage:
  python predict_now.py --model models/best_model.joblib
"""
import argparse
import joblib
import pandas as pd
from sensors import sample_features, FEATURE_COLUMNS

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="models/best_model.joblib")
    args = ap.parse_args()

    model = joblib.load(args.model)

    row, prev = sample_features(prev_state=None)  # initial row has rate features = None
    # Take a second sample to compute IO/network rates
    row, prev = sample_features(prev_state=prev)

    X = pd.DataFrame([{k: row.get(k) for k in FEATURE_COLUMNS}])
    pred_hours = float(model.predict(X)[0])
    pred_mins = int(pred_hours * 60)
    print(f"[predict_now] Predicted remaining time: {pred_hours:.2f} hours (~{pred_mins} minutes)")
    print("[predict_now] Features used:")
    print(X.to_string(index=False))

if __name__ == "__main__":
    main()
'''
# Persist the live-prediction CLI text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/predict_now.py").write_text(predict_now_py, encoding="utf-8")

# app.py: simple Streamlit app showing a live prediction and the most recent log rows.
# The generated file's header uses "#" comments rather than a module docstring so no
# bare string literal sits at the top level of the Streamlit script.
app_py = r'''#!/usr/bin/env python3
# Streamlit front-end for the battery predictor.
# Run with: streamlit run app.py
# Env vars:
#   MODEL_PATH (default: models/best_model.joblib)
#   BATTERY_LOG_PATH (default: data/battery_log.csv)
import time
import os
import pandas as pd
import streamlit as st
import joblib
from sensors import sample_features, FEATURE_COLUMNS

MODEL_PATH = os.environ.get("MODEL_PATH", "models/best_model.joblib")
LOG_PATH = os.environ.get("BATTERY_LOG_PATH", "data/battery_log.csv")

st.set_page_config(page_title="Laptop Battery Life Predictor", layout="centered")

st.title("🔋 Laptop Battery Life Predictor")

if not os.path.exists(MODEL_PATH):
    st.warning("Model not found. Train it first with `python train.py`.")
    st.stop()

model = joblib.load(MODEL_PATH)

# Live prediction block
st.subheader("Live Prediction")
with st.spinner("Sampling system metrics..."):
    # The first sample only seeds the I/O counters; the second one yields per-second rates.
    row, prev = sample_features(None)
    time.sleep(1.0)  # ensure rates have time to change
    row, _ = sample_features(prev)

X = pd.DataFrame([{k: row.get(k) for k in FEATURE_COLUMNS}])
pred_hours = float(model.predict(X)[0])
st.metric("Predicted Remaining Time", f"{pred_hours:.2f} hours", help="Based on current CPU, RAM, brightness, I/O and network activity")

st.write("Current features:")
st.dataframe(X)

# Historical log viewer
st.subheader("Recent Log (if available)")
if os.path.exists(LOG_PATH):
    df = pd.read_csv(LOG_PATH)
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        df = df.sort_values('timestamp').tail(240)  # last ~4 hours if 1 min interval
    st.dataframe(df.tail(50))
else:
    st.info("No log file found yet. Start the collector with `python collector.py`.")
'''
# Persist the Streamlit app text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/app.py").write_text(app_py, encoding="utf-8")

# simulate_data.py: bootstrap synthetic dataset for demo/training.
# The generated script draws N=1200 one-minute rows from a seeded NumPy generator, builds
# 'hours_left' as 12 - 0.08*drain_factor plus N(0, 0.5) noise clipped to [0.2, 12], and
# writes the table to data/simulated_battery_log.csv with the same columns the collector logs.
# NOTE(review): with the coefficients below drain_factor looks to be on the order of 1, so
# the 0.08*drain_factor term is much smaller than the 0.5-sigma noise — the synthetic
# target may carry very little signal from the features. Confirm the intended scale.
simulate_py = r'''#!/usr/bin/env python3
"""
Generate a synthetic battery log to let you train/evaluate the pipeline
before you have real data. Writes to data/simulated_battery_log.csv
"""
import numpy as np
import pandas as pd
import os
rng = np.random.default_rng(42)

N = 1200  # ~20 hours of minute-level data
timestamps = pd.date_range("2025-01-01", periods=N, freq="1min")

battery_percent = np.clip(100 - np.linspace(0, 60, N) + rng.normal(0, 1.8, N), 5, 100)
cpu_percent = np.clip(rng.normal(22, 12, N) + 10*np.sin(np.linspace(0, 12, N)), 1, 98)
cpu_freq_mhz = np.clip(1200 + 800*(cpu_percent/100) + rng.normal(0, 80, N), 800, 4000)
ram_percent = np.clip(rng.normal(55, 10, N) + 0.2*cpu_percent, 10, 98)
disk_read_kb_s  = np.clip(np.abs(rng.normal(50, 80, N)), 0, 5000)
disk_write_kb_s = np.clip(np.abs(rng.normal(40, 60, N)), 0, 5000)
net_sent_kb_s   = np.clip(np.abs(rng.normal(20, 40, N)), 0, 4000)
net_recv_kb_s   = np.clip(np.abs(rng.normal(30, 60, N)), 0, 4000)
brightness = np.clip(rng.normal(60, 25, N), 0, 100)
screen_on = (brightness > 0).astype(float)
active_apps_count = np.clip((cpu_percent/15 + rng.normal(3, 2, N)).round().astype(int), 1, 50)

# A simple "physics-inspired" target: higher cpu/brightness/apps/io drain faster -> fewer hours left
drain_factor = (
    0.012*cpu_percent +
    0.010*ram_percent +
    0.006*brightness +
    0.002*active_apps_count +
    0.00015*(disk_read_kb_s + disk_write_kb_s) +
    0.00012*(net_sent_kb_s + net_recv_kb_s)
)
# Convert drain factor to remaining hours: smaller drain -> longer hours (bounded 0-12)
hours_left = np.clip(12 - 0.08*drain_factor + rng.normal(0, 0.5, N), 0.2, 12)

df = pd.DataFrame({
    'timestamp': timestamps,
    'battery_percent': battery_percent,
    'cpu_percent': cpu_percent,
    'cpu_freq_mhz': cpu_freq_mhz,
    'ram_percent': ram_percent,
    'disk_read_kb_s': disk_read_kb_s,
    'disk_write_kb_s': disk_write_kb_s,
    'net_sent_kb_s': net_sent_kb_s,
    'net_recv_kb_s': net_recv_kb_s,
    'brightness': brightness,
    'screen_on': screen_on,
    'active_apps_count': active_apps_count,
    'on_ac_power': False,
    'secsleft_raw': (hours_left * 3600).round(),
    'hours_left': hours_left
})

os.makedirs("data", exist_ok=True)
out = "data/simulated_battery_log.csv"
df.to_csv(out, index=False)
print(f"[simulate_data] Wrote {len(df)} rows to {out}")
'''
# Persist the data simulator text into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/simulate_data.py").write_text(simulate_py, encoding="utf-8")

# requirements.txt: the third-party packages imported by the generated scripts, one per
# line and without version pins. The backslash right after the opening quotes keeps the
# file from starting with an empty line.
requirements = """\
pandas
numpy
psutil
scikit-learn
joblib
matplotlib
screen_brightness_control
streamlit
"""
# Persist the dependency list into the generated project folder (UTF-8, overwrite).
pathlib.Path(f"{root}/requirements.txt").write_text(requirements, encoding="utf-8")

# README.md
readme = r'''# 🔋 Laptop Battery Life Predictor (End-to-End ML Project)

**Goal:** Predict remaining battery time (hours) using real-time laptop telemetry: CPU load, RAM, I/O, network, screen brightness, and active app count.

This project is designed to be:
- **Beginner-friendly:** minimal tools, simple Python scripts.
- **End-to-end:** data collection → modeling → evaluation → live prediction / app.
- **Recruiter-friendly:** real hardware + ML + small app.

---

## 📦 Project Structure



SyntaxError: incomplete input (ipython-input-994384677.py, line 577)