<a href="https://colab.research.google.com/github/DS-Amdari/Jewelry-Optimisation/blob/main/jewwlry-predv1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install mlflow --quiet

In [2]:
!pip install pyngrok --quiet

In [19]:
# Step 1: Import Required Libraries
# Explanation: Import necessary Python libraries for data manipulation, machine learning, and MLflow tracking.
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [4]:
data = pd.read_csv('Jewelry_Dataset.csv')

In [5]:
# Step 4: Display Dataset Info
# Explanation: Perform some initial exploratory steps to understand the data structure.
print("Dataset Sample:")
print(data.sample(2))  # Display a few rows
print("Dataset Info:")
print(data.info())

# Step 5: Rename Columns
# Explanation: Rename the columns for clarity and consistency.
data.columns = [
    "Order datetime", "Order ID", "Purchased ID", "Quantity of SKU",
    "Category ID", "Category alias", "Brand ID", "Price in USD",
    "User ID", "gender", "Main Color", "Main metal", "Main gem"
]

# Step 6: Drop Missing Values
# Explanation: Clean the dataset by removing rows with missing values.
data.dropna(inplace=True)
print("Dataset cleaned. Remaining rows:", len(data))

# Step 7: Feature and Target Separation
# Explanation: Separate the dataset into features (`x`) and target (`y`).
x = data.drop(columns=["Price in USD", "Order ID", "Purchased ID", "Order datetime", "Category ID"], axis=1)
y = data["Price in USD"]

# Step 8: Identify Column Indices for Preprocessing
# Explanation: Identify categorical features for encoding and other preprocessing steps.
cols = list(x.columns)
col_indices = [cols.index(c) for c in cols]
cat_feat = list(x.dtypes[x.dtypes == "object"].index)
cat_indices = [cols.index(c) for c in cat_feat]


Dataset Sample:
       2018-12-01 11:40:29 UTC  1924719191579951782  1842195256808833386  1  \
76763  2021-09-09 05:58:58 UTC  2658745118852383196  1515966223038609348  1   
67333  2021-07-30 08:22:44 UTC  2629101675272471311  1923198279655358487  1   

       1806829201890738522 jewelry.earring    0  561.51  1515915625207851155  \
76763         1.806829e+18    jewelry.ring  1.0  342.33         1.515916e+18   
67333         1.806829e+18    jewelry.ring  1.0  225.89         1.515916e+18   

      Unnamed: 9  red  gold   diamond  
76763          f  red  gold   diamond  
67333          f  red  gold  amethyst  
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95910 entries, 0 to 95909
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   2018-12-01 11:40:29 UTC  95910 non-null  object 
 1   1924719191579951782      95910 non-null  int64  
 2   1842195256808833386      95910 non-null  

In [6]:
#data.head()

In [7]:
# Step 9: Define Preprocessing Pipeline
# Explanation: Create a reusable function for building pipelines that include preprocessing and a model.
def make_pipe(model, col_indices, cat_indices, fill_missing=True, one_hot=True, label_encode=False):
    first_transform = ColumnTransformer(
        transformers=[("imputer", SimpleImputer(strategy="most_frequent"), col_indices)],
        remainder="passthrough"
    )
    sec_transform = ColumnTransformer(
        transformers=[("one_hot", OneHotEncoder(), cat_indices)],
        remainder="passthrough"
    )
    third_transform = ColumnTransformer(
        transformers=[("ordinal", OrdinalEncoder(), cat_indices)],
        remainder="passthrough"
    )
    steps = []
    if fill_missing:
        steps.append(("first", first_transform))
    if one_hot:
        steps.append(("second", sec_transform))
    if label_encode:
        steps.append(("third", third_transform))
    steps.append(("model", model))
    return Pipeline(steps=steps)


In [8]:
# Step 10: Train-Test Split
# Explanation: Split the dataset into training and test sets for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Step 11: Initialize and Fit a Model Pipeline
# Explanation: Create a linear regression model pipeline and train it on the data.
lin_pipe = make_pipe(
    LinearRegression(),
    col_indices,
    cat_indices,
    fill_missing=True,
    one_hot=True
)

lin_pipe.fit(x_train, y_train)

In [9]:
def log_model_with_mlflow(model, model_name, x_test, y_test):
   try:
       with mlflow.start_run(run_name=model_name):
           # Log model parameters
           if hasattr(model.named_steps['model'], "get_params"):
               params = model.named_steps['model'].get_params()
               mlflow.log_params(params)


           # Log metrics
           preds = model.predict(x_test)
           mae = mean_absolute_error(y_test, preds)
           mse = mean_squared_error(y_test, preds)
           mlflow.log_metric("MAE", mae)
           mlflow.log_metric("MSE", mse)


           # Log the model
           mlflow.sklearn.log_model(model, model_name)
           print(f"Model '{model_name}' logged successfully.")
   except mlflow.exceptions.MlflowException as e:
       print(f"Error logging model with MLflow: {e}")


In [15]:
log_model_with_mlflow(lin_pipe, "Linear_Regression", x_test, y_test)



Model 'Linear_Regression' logged successfully.


In [17]:
Random_reg_pipe = make_pipe(
    RandomForestRegressor(),
    col_indices,
    cat_indices,
    fill_missing=True,
    one_hot=True
)

Random_reg_pipe.fit(x_train, y_train)

In [18]:
log_model_with_mlflow(Random_reg_pipe,"Random Forest",x_test,y_test)



Model 'Random Forest' logged successfully.


In [20]:
Dec_tree_pipe = make_pipe(
    DecisionTreeRegressor(),
    col_indices,
    cat_indices,
    fill_missing=True,
    one_hot=True
)

Dec_tree_pipe.fit(x_train, y_train)

In [21]:
log_model_with_mlflow(Dec_tree_pipe,"Decision Tree",x_test,y_test)



Model 'Decision Tree' logged successfully.


In [10]:
mlflow.set_tracking_uri(uri="/content/mlruns")

In [11]:
mlflow.set_experiment("Jewelry_Experiment")

2025/02/14 15:09:46 INFO mlflow.tracking.fluent: Experiment with name 'Jewelry_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/content/mlruns/303929524424097035', creation_time=1739545786392, experiment_id='303929524424097035', last_update_time=1739545786392, lifecycle_stage='active', name='Jewelry_Experiment', tags={}>

In [12]:
from pyngrok import ngrok

In [13]:
get_ipython().system_raw("mlflow ui --port 5000 &")


In [14]:
ngrok.kill()


# Setting the authtoken (optional)
# Get your authtoken from https://dashboard.ngrok.com/auth
NGROK_AUTH_TOKEN = "2sig79yJVADDUu2FplDOmP1oRhO_6DsZFTRif4NDCYLgLc3mk"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)


# Open an HTTPs tunnel on port 5000 for http://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)


MLflow Tracking UI: https://98a0-34-85-135-116.ngrok-free.app
