In [5]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
import xgboost
import sys
import warnings
from itertools import cycle
warnings.filterwarnings("ignore")
np.random.seed(40)
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

X_train = train.drop('log_total_users', axis=1)
y_train = train.log_total_users

X_test = test.drop('log_total_users', axis=1)
y_test = test.log_total_users

# Load preprocessed data
with open('preprocess.pkl', 'rb') as file:
    pre = pickle.load(file)

def eval_metrics(actual, pred):
    mse = mean_squared_error(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    return rmse, mae

# Define the experiment name
experiment_name = "end-to-end-mlops"

# Check if the experiment exists
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    # If the experiment does not exist, create it
    mlflow.create_experiment(experiment_name)

# Set the experiment
mlflow.set_experiment(experiment_name)

n_samples = len(test)
split_size = n_samples // 4

# Split X_test into 4 parts
test1 = test.iloc[:split_size, :]
test2 = test.iloc[split_size:2 * split_size, :]
test3 = test.iloc[2 * split_size:3 * split_size, :]
test4 = test.iloc[3 * split_size:, :]

X_test1 = test1.drop('log_total_users',axis=1)
X_test2 = test2.drop('log_total_users',axis=1)
X_test3 = test3.drop('log_total_users',axis=1)
X_test4 = test4.drop('log_total_users',axis=1)

y_test1 = test1.log_total_users
y_test2 = test2.log_total_users
y_test3 = test3.log_total_users
y_test4 = test4.log_total_users

with open('preprocess.pkl', 'rb') as file:
    pre = pickle.load(file)
with open('dt.pkl', 'rb') as file:
    dt_model = pickle.load(file)
with open('gb.pkl', 'rb') as file:
    gb_model = pickle.load(file)
with open('xg.pkl', 'rb') as file:
    xg_model = pickle.load(file)
# Create an empty DataFrame to store the outputs
columns = ['Subset', 'Model', 'MSE', 'MAPE']
outputs_data = []

X_test_list = [X_test1,X_test2,X_test3,X_test4]
y_test_list = [y_test1,y_test2,y_test3,y_test4]
models = [('Decision Tree', dt_model), ('Gradient Boosting', gb_model), ('XGBoost', xg_model)]


for i, (X_test_part, y_test_part) in enumerate(zip(X_test_list, y_test_list), start=1):
    for model_name, model in models:
        # Apply preprocessing
        X_test_part_tf = pre.transform(X_test_part)

        # Make predictions using the model
        predictions = model.predict(X_test_part_tf)

        # Calculate MSE and MAPE
        mse = mean_squared_error(predictions, y_test_part)
        mape = mean_absolute_percentage_error(predictions, y_test_part)
        

        # Start MLflow run
        with mlflow.start_run(run_name=f"Subset_{i}_{model_name}"):
            # Log metrics
            mlflow.log_metric("MSE", mse)
            mlflow.log_metric("MAPE", mape)

            # Infer model signature
            signature = infer_signature(X_test_part, predictions)

            # Log model
            mlflow.sklearn.log_model(model, model_name, signature=signature)

            # Append results to the list
            outputs_data.append({
                'Subset': i,
                'Model': model_name,
                'MSE': mse,
                'MAPE': mape
            })

# Create DataFrame from the list
outputs_df = pd.DataFrame(outputs_data)

# Display the DataFrame
print(outputs_df)


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



    Subset              Model       MSE      MAPE
0        1      Decision Tree  0.199105  0.048796
1        1  Gradient Boosting  0.141893  0.040976
2        1            XGBoost  0.140899  0.041963
3        2      Decision Tree  0.214218  0.048812
4        2  Gradient Boosting  0.153273  0.042986
5        2            XGBoost  0.151427  0.042203
6        3      Decision Tree  0.225972  0.053039
7        3  Gradient Boosting  0.136316  0.040928
8        3            XGBoost  0.141446  0.041556
9        4      Decision Tree  0.206729  0.050413
10       4  Gradient Boosting  0.143841  0.042932
11       4            XGBoost  0.137602  0.041002
