In [4]:
import pandas as pd

def loadData(path) : 
    return pd.read_csv(path)

import pandas as pd
import numpy as np

def split_dataset_pandas(df, train_ratio=0.7, test_ratio=0.2, valid_ratio=0.1, random_state=42):
  """
  Splits a pandas DataFrame into train, test, and validation sets.

  Args:
    df: The input pandas DataFrame.
    train_ratio: Proportion of data for the training set (default: 0.7).
    test_ratio: Proportion of data for the test set (default: 0.2).
    valid_ratio: Proportion of data for the validation set (default: 0.1).
    random_state: Seed for random number generator (default: 42).

  Returns:
    A tuple containing the training, test, and validation DataFrames.
  """

  if not 0 <= train_ratio <= 1 or not 0 <= test_ratio <= 1 or not 0 <= valid_ratio <= 1:
    raise ValueError("Ratios must be between 0 and 1.")

  if abs(train_ratio + test_ratio + valid_ratio - 1) > 1e-6:
    raise ValueError("Ratios must sum to 1.")

  # Shuffle the DataFrame
  df = df.sample(frac=1, random_state=random_state)

  total_size = len(df)
  train_size = int(train_ratio * total_size)
  test_size = int(test_ratio * total_size)

  train_df = df.iloc[:train_size].to_csv('/home/dikidwidasa/mlflow/data/train.csv',index = False)
  test_df = df.iloc[train_size:train_size + test_size].to_csv('/home/dikidwidasa/mlflow/data/test.csv', index = False)
  valid_df = df.iloc[train_size + test_size:].to_csv('/home/dikidwidasa/mlflow/data/valid.csv', index = False)

  return None

    
df = loadData('/home/dikidwidasa/mlflow/data/dummy_data.csv').reset_index(drop=True)
split_dataset_pandas(df)
df.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Industrial,26390,96,5,21.63,Weekday,6691.1
1,Residential,25369,13,38,29.76,Weekday,6140.03
2,Industrial,17881,84,46,27.6,Weekday,5990.69
3,Industrial,14587,43,4,14.85,Weekend,6181.95
4,Residential,33264,37,28,22.76,Weekday,1480.3


In [47]:
def mapping(df, colsname, map_var):
    df[colsname] = df[colsname].map(map_var)
    return df


df.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,1,24563,15,4,28.52,1,2865.57
1,2,27583,56,23,23.07,0,4283.8
2,2,45313,4,44,33.56,1,5067.83
3,1,41625,84,17,27.39,0,4624.3
4,1,36720,58,47,17.08,1,4820.59


In [39]:
import numpy as np

def feature_selection(df): 
    x = df.iloc[:,:-1].values
    y = df.iloc[:,-1].values
    return x,y

x,y = feature_selection(df)
data = {
    "lenx" : len(x),
    "leny" : len(y),
    "typex" : type(x),
    "typey" : type(y)
}
print(data)
print(type(x),type(y))

{'lenx': 100, 'leny': 100, 'typex': <class 'numpy.ndarray'>, 'typey': <class 'numpy.ndarray'>}
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [40]:
import numpy as np


def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    """
    Splits data into training and testing sets.

    Args:
        X: Features (array-like).
        y: Target variable (array-like).
        test_size: Proportion of data to include in the testing set (default: 0.2).
        random_state: Controls the randomness of the split (default: None).

    Returns:
        X_train: Features for the training set.
        X_test: Features for the testing set.
        y_train: Target variable for the training set.
        y_test: Target variable for the testing set.
    """

    if random_state is not None:
        np.random.seed(random_state)

    indices = np.arange(len(X))
    np.random.shuffle(indices)

    test_size = int(len(X) * test_size)
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]

    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]

    return X_train, X_test, y_train, y_test

# Example usage:
# Assuming X and y are your feature matrix and target variable 
X_train, X_test, y_train, y_test = custom_train_test_split(x, y, test_size=0.25, random_state=42)

data = {
    "lenx_train" : len(X_train),
    "lenx_test" : len(X_test),
    "leny_train" : len(y_train),
    "leny_test" : len(y_test),
    "typex_train" : type(X_train),
    "typex_test" : type(X_test),
    "typey_train" : type(y_train),
    "typey_test" : type(y_test)
}
print(data)

{'lenx_train': 75, 'lenx_test': 25, 'leny_train': 75, 'leny_test': 25, 'typex_train': <class 'numpy.ndarray'>, 'typex_test': <class 'numpy.ndarray'>, 'typey_train': <class 'numpy.ndarray'>, 'typey_test': <class 'numpy.ndarray'>}


In [50]:
from datetime import datetime

def get_datetime_string():
  """
  Gets the current datetime as a string in the format 'YYYYMMDDHHMMSS'.

  Returns:
    str: The current datetime as a string.
  """
  return datetime.now().strftime("%Y%m%d%H%M%S")

print(get_datetime_string())

20250107115612


In [52]:
import mlflow 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


mlflow.autolog()
mlflow.set_tracking_uri('http://localhost:5000')

df = loadData('/home/dikidwidasa/mlflow/data/test_energy_data.csv').reset_index(drop=True)

map_building = {
    'Residential' : 1,
    'Commercial' : 2,
    "Industrial" : 3
}

day_of_week_map = {"Weekday": 1, "Weekend": 0}

cols_to_map = ['Building Type', 'Day of Week']
var_to_map = [map_building, day_of_week_map]

for col, map_dict in zip(cols_to_map, var_to_map):
    df = mapping(df, col, map_dict) 



df.head()

x,y = feature_selection(df)

X_train, X_test, y_train, y_test = custom_train_test_split(x, y, test_size=0.25, random_state=42)

with mlflow.start_run():
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)
    mlflow.log_artifact(local_path="/home/dikidwidasa/mlflow/data/test_energy_data.csv")

    mlflow.sklearn.log_model(model, "model_awal")

model_uri = f"runs:/{mlflow.active_run().info.run_id}/outputs/model_awal"
versi = get_datetime_string()
registered_model_name = f"model_awal {versi} "
registered_model_version = mlflow.register_model(model_uri, registered_model_name)


2025/01/07 11:57:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run defiant-croc-734 at: http://localhost:5000/#/experiments/0/runs/e5981f2ca1194f8ca73f48fcf3e6f9d9
🧪 View experiment at: http://localhost:5000/#/experiments/0


AttributeError: 'NoneType' object has no attribute 'info'

In [54]:
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

# Set up MLflow
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.autolog()

# Helper Functions
def load_and_preprocess_data(filepath):
    """Load data and preprocess it with mappings."""
    df = loadData(filepath).reset_index(drop=True)

    # Define mapping dictionaries
    mappings = {
        'Building Type': {'Residential': 1, 'Commercial': 2, 'Industrial': 3},
        'Day of Week': {"Weekday": 1, "Weekend": 0}
    }

    # Apply mappings
    for col, map_dict in mappings.items():
        df = mapping(df, col, map_dict)
    
    return df

def feature_engineering_and_split(df, test_size=0.25, random_state=42):
    """Perform feature selection and train-test split."""
    x, y = feature_selection(df)
    return custom_train_test_split(x, y, test_size=test_size, random_state=random_state)

def get_datetime_string():
    """Get current date and time as a formatted string."""
    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Main MLflow Workflow
def train_and_log_model(df, test_data_path, model_name="model_awal"):
    """Train the model, log metrics and artifacts, and register the model."""
    # Train-Test Split
    X_train, X_test, y_train, y_test = feature_engineering_and_split(df)

    # Start MLflow run
    with mlflow.start_run() as run:
        # Model Training
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Model Predictions
        y_pred = model.predict(X_test)

        # Log Metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)

        # Log Data Artifact
        mlflow.log_artifact(local_path=test_data_path)

        # Log Model
        mlflow.sklearn.log_model(model, model_name)

        # Save run ID
        run_id = run.info.run_id

    # Register the Model
    model_uri = f"runs:/{run_id}/{model_name}"
    version_timestamp = get_datetime_string()
    registered_model_name = f"{model_name} {version_timestamp}"
    registered_model_version = mlflow.register_model(model_uri, registered_model_name)

    print(f"Model registered: {registered_model_name}, version: {registered_model_version.version}")
    return registered_model_name, registered_model_version.version

# Execution
if __name__ == "__main__":
    # Paths
    test_data_path = "/home/dikidwidasa/mlflow/data/test_energy_data.csv"

    # Load and preprocess data
    df = load_and_preprocess_data(test_data_path)

    # Train, log, and register model
    train_and_log_model(df, test_data_path)


2025/01/07 12:01:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
Successfully registered model 'model_awal 2025-01-07_12-01-55'.
2025/01/07 12:01:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: model_awal 2025-01-07_12-01-55, version 1


🏃 View run indecisive-jay-316 at: http://localhost:5000/#/experiments/0/runs/437e359be7a64d089782f1fa976791e4
🧪 View experiment at: http://localhost:5000/#/experiments/0
Model registered: model_awal 2025-01-07_12-01-55, version: 1


Created version '1' of model 'model_awal 2025-01-07_12-01-55'.
