In [2]:
# import sagemaker
# import boto3
import pandas as pd
import numpy as np
# from sagemaker.session import Session
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
# from sagemaker.sklearn.processing import SKLearnProcessor
# from sagemaker.xgboost.estimator import XGBoost
# from sagemaker.serializers import CSVSerializer
import xgboost as xgb
import joblib 
  
# from sagemaker.inputs import TrainingInput

In [3]:
def generate_synthetic_data(n_samples=1000):
    data = {
        # Device IDs (from 'snn' examples)
        "snn": [f"snn:dev:geyser:v1:{np.random.randint(10000000, 99999999)}GEYSER:{np.random.randint(1000, 99999)}"
                 for _ in range(n_samples)],
        
        # Numeric features (random values within observed ranges)
        "waterTemperatureInternal": np.random.normal(50, 15, n_samples).clip(20, 80),
        "ambientTemperature": np.random.normal(25, 5, n_samples),
        "signalRssi": np.random.randint(10, 30, n_samples),
        "energy": np.random.lognormal(12, 1, n_samples),
        
        # Categorical features (from sample values)
        "firmwareVersion": np.random.choice(["N.5.6.E_a", "N.5.6.D_a", "N.6.0.E_a"], n_samples),
        "provider": np.random.choice(["nedbank", "connected-home", "nedbank-telesure"], n_samples),
        
        # Timestamps (random dates)
        "time": pd.date_range("2025-01-01", periods=n_samples, freq="5min")
    }
    df = pd.DataFrame(data)
    
    # Add a synthetic regression target (e.g., water usage)
    df['target'] = 0.5 * df['waterTemperatureInternal'] + \
                   0.3 * df['ambientTemperature'] + \
                   0.1 * df['signalRssi'] + \
                   0.05 * np.log(df['energy']) + \
                   np.random.normal(0, 2, n_samples)
    return df

df = generate_synthetic_data()

df.head()

Unnamed: 0,snn,waterTemperatureInternal,ambientTemperature,signalRssi,energy,firmwareVersion,provider,time,target
0,snn:dev:geyser:v1:96978431GEYSER:83850,34.333538,32.770192,26,23275.251804,N.5.6.E_a,nedbank-telesure,2025-01-01 00:00:00,27.350921
1,snn:dev:geyser:v1:61406249GEYSER:2714,49.769185,28.245773,22,239274.752781,N.5.6.D_a,nedbank,2025-01-01 00:05:00,36.833245
2,snn:dev:geyser:v1:40687426GEYSER:16330,30.814313,24.462445,14,93782.608128,N.5.6.E_a,nedbank,2025-01-01 00:10:00,23.037828
3,snn:dev:geyser:v1:40799605GEYSER:67576,69.480506,15.780658,10,38327.809415,N.5.6.D_a,connected-home,2025-01-01 00:15:00,40.75791
4,snn:dev:geyser:v1:20392077GEYSER:61838,39.153278,26.584576,10,174870.252904,N.5.6.E_a,nedbank,2025-01-01 00:20:00,26.059117


In [4]:
#Preprocess

X = df.drop(columns=["target", "time", "snn"])  # drop non-numeric/non-useful features
y = df["target"]
#One hot encode categoricals
X = pd.get_dummies(X, columns=["firmwareVersion", "provider"], drop_first=True)


In [5]:
# --------------------------
# Step 3: Train/Test split
# --------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# --------------------------
# Step 4: Train XGBoost
# --------------------------
model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [7]:
# --------------------------
# Step 5: Save the model
# --------------------------
joblib.dump(model, "local_xgb_model.json")
print("✅ Model saved as local_xgb_model.json")

✅ Model saved as local_xgb_model.json


In [8]:
# --------------------------
# Step 6: Reload & predict
# --------------------------
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
loaded_model = joblib.load("local_xgb_model.pkl")
y_pred = loaded_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE={mae:.3f} | RMSE={rmse:.3f} | R2={r2:.3f}")
print("Predictions:", y_pred)

MAE=1.927 | RMSE=2.420 | R2=0.896
Predictions: [39.197506 46.764595 30.360186 39.747635 30.187353 37.795643 37.603035
 42.0429   25.39605  52.529205 25.7607   44.117764 19.977472 39.207573
 30.381824 31.579971 24.501749 35.600384 39.73668  32.54823  38.096386
 24.43567  29.46285  40.70621  39.35314  36.766186 22.485916 36.243904
 26.530804 40.878727 42.0135   40.899494 47.70624  48.187016 44.84664
 37.154472 39.199635 30.850517 35.634727 50.701546 35.174805 30.24588
 39.951744 34.29016  32.99795  27.216269 36.01433  21.72089  32.352592
 49.4176   35.31024  33.15479  42.83571  25.24902  33.991463 47.5583
 47.744324 40.624187 27.155954 45.7986   40.65526  19.22104  34.124096
 40.568325 38.545708 26.250097 33.43295  45.26311  29.613968 40.303913
 35.511406 34.449066 29.028635 37.496723 35.695526 26.846884 28.58929
 33.63956  28.901978 25.687815 36.470375 40.078896 40.621925 41.53995
 37.446354 32.37514  31.41533  28.389997 29.93822  27.228224 31.884747
 30.790337 28.98971  28.252005 34.31