In [1]:
# !pip install -r requirements.txt

In [2]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import StructType, StructField, FloatType
from snowflake.snowpark import Session
import os
import json
import pandas as pd
import numpy as np

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import xgboost as xgb

In [3]:
connection_parameters = {
    "account": os.getenv("SNOWFLAKE_ACCOUNT"),
    "user": os.getenv("SNOWFLAKE_USER"),
    "password": os.getenv("SNOWFLAKE_PASSWORD"),
    "schema": "SENSOR",
    "database": "THINGSBOARD",
    "role": os.getenv("SNOWFLAKE_ROLE"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
}

session = Session.builder.configs(connection_parameters).create()

In [4]:
environmental_df = session.table("ENVIRONMENTAL")

In [5]:
environmental_df.show(5)

-------------------------------------------------------------------------------------------
|"TS"                 |"CO"  |"HUMIDITY"  |"LIGHT"  |"LPG"  |"MOTION"  |"SMOKE"  |"TEMP"  |
-------------------------------------------------------------------------------------------
|2024-01-02 12:30:00  |5.0   |51.1        |False    |7.7    |False     |20.6     |22.7    |
|2024-01-02 12:20:00  |2.7   |75.6        |False    |5.0    |False     |12.9     |19.8    |
|2024-01-02 12:10:00  |5.0   |51.1        |False    |7.7    |False     |20.5     |22.6    |
|2024-01-02 12:00:00  |2.8   |75.8        |False    |5.1    |False     |13.1     |19.8    |
|2024-01-02 11:50:00  |5.0   |51.1        |False    |7.7    |False     |20.5     |22.7    |
-------------------------------------------------------------------------------------------



In [6]:
environmental_df = environmental_df.drop(["HUMIDITY", "LIGHT", "LPG", "MOTION", "SMOKE", "TEMP"])

In [7]:
df =environmental_df.toPandas()

In [8]:
df.head()

Unnamed: 0,TS,CO
0,2024-01-02 12:30:00,5.0
1,2024-01-02 12:20:00,2.7
2,2024-01-02 12:10:00,5.0
3,2024-01-02 12:00:00,2.8
4,2024-01-02 11:50:00,5.0


In [9]:
df = df.sort_values('TS').reset_index(drop=True)

In [10]:

for lag in range(1, 4):  # Lag 1, 2, 3
    df[f'CO_lag_{lag}'] = df['CO'].shift(lag)

In [11]:
df['CO_roll_mean_3'] = df['CO'].rolling(window=3).mean()
df['CO_roll_std_3'] = df['CO'].rolling(window=3).std()

In [12]:
# Extract hour and minute
df['hour'] = df['TS'].dt.hour
df['minute'] = df['TS'].dt.minute

# Cyclical encoding
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)


In [13]:
df = df.dropna().reset_index(drop=True)

In [14]:
df.head()

Unnamed: 0,TS,CO,CO_lag_1,CO_lag_2,CO_lag_3,CO_roll_mean_3,CO_roll_std_3,hour,minute,hour_sin,hour_cos,minute_sin,minute_cos
0,2024-01-01 00:30:00,4.4,5.0,2.8,5.0,4.066667,1.137248,0,30,0.0,1.0,5.665539e-16,-1.0
1,2024-01-01 00:40:00,5.0,4.4,5.0,2.8,4.8,0.34641,0,40,0.0,1.0,-0.8660254,-0.5
2,2024-01-01 00:50:00,4.4,5.0,4.4,5.0,4.6,0.34641,0,50,0.0,1.0,-0.8660254,0.5
3,2024-01-01 01:00:00,5.0,4.4,5.0,4.4,4.8,0.34641,1,0,0.258819,0.965926,0.0,1.0
4,2024-01-01 01:10:00,2.9,5.0,4.4,5.0,4.1,1.081665,1,10,0.258819,0.965926,0.8660254,0.5


In [15]:
# Define feature columns
feature_cols = [
    'CO_lag_1', 'CO_lag_2', 'CO_lag_3',
    'CO_roll_mean_3', 'CO_roll_std_3',
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos'
]

X = df[feature_cols]
y = df['CO']



In [16]:
import mlflow

mlflow.set_tracking_uri("https://legendary-dollop-7vr49x7r7v9pcxwrw-5000.app.github.dev/")


In [17]:
mlflow.set_experiment("Time_Series_CO_Prediction")


2024/11/23 10:36:33 INFO mlflow.tracking.fluent: Experiment with name 'Time_Series_CO_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/artifacts/2', creation_time=1732358193397, experiment_id='2', last_update_time=1732358193397, lifecycle_stage='active', name='Time_Series_CO_Prediction', tags={}>

In [22]:
tscv = TimeSeriesSplit(n_splits=3)

# Start MLflow Run
with mlflow.start_run(run_name="RandomForest_Regressor_Run") as run:
    # Define and Train the Model
    rf = RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)
    rf.fit(X, y)

    # Predictions (Using the entire dataset for simplicity; consider using a separate test set)
    predictions = rf.predict(X)

    # Calculate Metrics
    mae = mean_absolute_error(y, predictions)
    rmse = np.sqrt(mean_squared_error(y, predictions))

    # Log Parameters
    mlflow.log_param("model", "RandomForestRegressor")
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("min_samples_split", 2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)

    # Log the Model
    mlflow.sklearn.log_model(rf, "model")

    print(f"Run ID: {run.info.run_id}")
    print(f"Logged MAE: {mae:.2f}, RMSE: {rmse:.2f}")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Run ID: b49c98dd8b5147f1a2efc8602930faa0
Logged MAE: 0.02, RMSE: 0.05
🏃 View run RandomForest_Regressor_Run at: https://legendary-dollop-7vr49x7r7v9pcxwrw-5000.app.github.dev/#/experiments/2/runs/b49c98dd8b5147f1a2efc8602930faa0
🧪 View experiment at: https://legendary-dollop-7vr49x7r7v9pcxwrw-5000.app.github.dev/#/experiments/2


In [None]:
# Get the Run ID from the previous step
run_id = run.info.run_id

# Register the model
model_name = "Time_Series_CO_Model"
result = mlflow.register_model(
    f"runs:/{run_id}/model",
    model_name
)

print(f"Registered Model: {result.name}, Version: {result.version}")


In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Transition the model to 'Staging'
client.transition_model_version_stage(
    name=model_name,
    version=result.version,
    stage="Staging",
    archive_existing_versions=True
)

print(f"Model {model_name} version {result.version} transitioned to Staging.")


In [None]:
import mlflow
import mlflow.sklearn

model_name = "Time_Series_CO_Model"
model_stage = "Staging"

# Load the model from the registry
model = mlflow.sklearn.load_model(
    model_uri=f"models:/{model_name}/{model_stage}"
)

print("Model loaded successfully from MLflow Model Registry.")


In [None]:

ts= [1672531200000, 1672531260000, 1672531320000, 1672531380000 ,  1672531440000 ]
df=pd.DataFrame(column='TS', value=ts)

In [None]:
# Convert 'TS' from milliseconds to datetime
df['TS'] = pd.to_datetime(df['TS'], unit='ms')

# Sort the dataframe by timestamp
df = df.sort_values('TS').reset_index(drop=True)

print("\nData after converting TS to datetime:")
print(df.head())
