In [1]:
# --- Imports (single cell: standard library first, then third-party) ---

# Standard library
import random
from pathlib import Path

# Data handling, sampling distributions and experiment tracking
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform

# scikit-learn: preprocessing, model selection and metrics
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, RobustScaler, StandardScaler

# scikit-learn: regressors
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge

print("Libraries imported!")

  import pkg_resources  # noqa: TID251


Libraries imported!


In [2]:
# --- Load the training data ---
# The CSV is resolved relative to the notebook's working directory instead of a
# user-specific absolute path, so the notebook runs on any checkout of the
# project. This is the same "../" convention the model-download cell uses for
# the models folder.
DATA_DIR = Path("../data")
TRAIN_CSV = DATA_DIR / "train_set.csv"

# NOTE: the file carries a saved index column ('Unnamed: 0'); it is not listed
# in FEATURES, so it never reaches the model.
df = pd.read_csv(TRAIN_CSV)
print("Data loaded!")

Data loaded!


In [3]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,area,price_level,latitude,longitude,category,dine_in,takeout,delivery,reservable,serves_breakfast,...,restroom,parking_free_lot,parking_free_street,accepts_debit_cards,accepts_cash_only,wheelchair_accessible,is_open_24_7,open_after_midnight,is_closed_any_day,rating
0,Gulshan-e-Iqbal,PRICE_LEVEL_INEXPENSIVE,24.863728,67.153379,Restaurant,True,True,True,False,True,...,True,True,True,False,True,False,True,False,False,4.2
1,Surjani Town,PRICE_LEVEL_INEXPENSIVE,25.011558,67.057304,Restaurant,True,True,True,False,True,...,False,False,True,False,True,False,True,False,False,4.2
2,PECHS,PRICE_LEVEL_MODERATE,24.868304,67.057431,Bakery,True,True,True,False,True,...,True,False,True,True,False,False,False,False,False,4.5
3,PECHS,PRICE_LEVEL_MODERATE,24.866186,67.077431,Restaurant,True,True,True,True,True,...,True,True,True,True,False,False,False,True,False,4.5
4,Gulshan-e-Iqbal,PRICE_LEVEL_MODERATE,24.904308,67.077439,Restaurant,True,True,True,True,False,...,True,True,True,True,False,True,True,False,False,3.9


In [4]:
# --- 1. Column groups consumed by the preprocessing pipeline ---

# String-valued columns: one-hot encoded
CATEGORICAL_COLS = ['area', 'price_level', 'category']

# Floating-point coordinates: standardised
NUMERIC_COLS = ['latitude', 'longitude']

# True/False flags: handed to the model unchanged
BINARY_COLS = [
    'dine_in', 'takeout', 'delivery', 'reservable',
    'serves_breakfast', 'serves_lunch', 'serves_dinner',
    'serves_coffee', 'serves_dessert',
    'outdoor_seating', 'live_music',
    'good_for_children', 'good_for_groups', 'good_for_watching_sports',
    'restroom',
    'parking_free_lot', 'parking_free_street',
    'accepts_debit_cards', 'accepts_cash_only',
    'wheelchair_accessible',
    'is_open_24_7', 'open_after_midnight', 'is_closed_any_day',
]

# Full model input (categorical, then numeric, then binary) and the label column
FEATURES = [*CATEGORICAL_COLS, *NUMERIC_COLS, *BINARY_COLS]
TARGET = 'rating'

In [5]:
# --- 2. Per-type preprocessing pipelines ---

# Numeric columns: StandardScaler puts them on a common scale.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical columns: one-hot encode into indicator columns.
# handle_unknown="ignore" means a category that was absent at fit time does not
# raise when it shows up in new data.
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

In [6]:
# --- 3. Combine the per-type steps into one ColumnTransformer ---

# Each entry is (step name, transformer, columns it applies to).
# 'passthrough' forwards the binary flags without modifying them.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, CATEGORICAL_COLS),
    ('num', numeric_transformer, NUMERIC_COLS),
    ('bin', 'passthrough', BINARY_COLS),
])

print("Preprocessing pipeline created successfully!")
print(preprocessor)

Preprocessing pipeline created successfully!
ColumnTransformer(transformers=[('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['area', 'price_level', 'category']),
                                ('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['latitude', 'longitude']),
                                ('bin', 'passthrough',
                                 ['dine_in', 'takeout', 'delivery',
                                  'reservable', 'serves_breakfast',
                                  'serves_lunch', 'serves_dinner',
                                  'serves_coffee', 'serves_dessert',
                                  'outdoor_seating', 'live_music',
                                  'good_for_children', 'good_for_groups',
                                  'good_f

In [7]:
# --- Create X and y, then split the data ---
# FEATURES and TARGET come from the column-definition cell. They are not
# redefined here, so the feature set has a single source of truth.
# The ColumnTransformer picks columns by name, so the column order of X does
# not affect what the models see.
X = df[FEATURES]
y = df[TARGET]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: the split must partition the frame exactly
assert len(X_train) + len(X_test) == len(df), "train/test split lost or duplicated rows"

print(f"Data is ready. Training with {len(X_train)} rows.")

Data is ready. Training with 582 rows.


In [8]:
# --- MLflow experiment setup ---
# Every run below is logged to this tracking server, under this experiment.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Taste Karachi Rating Predictor Experiments"

mlflow.set_tracking_uri(TRACKING_URI)
# Left as the cell's last expression so the Experiment object is displayed
mlflow.set_experiment(EXPERIMENT_NAME)

2025/10/30 06:01:59 INFO mlflow.tracking.fluent: Experiment with name 'Taste Karachi Rating Predictor Experiments' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/606966764099264858', creation_time=1761786119461, experiment_id='606966764099264858', last_update_time=1761786119461, lifecycle_stage='active', name='Taste Karachi Rating Predictor Experiments', tags={}>

In [9]:
# Baseline: plain linear regression on the preprocessed features

with mlflow.start_run(run_name="Linear Regression Model") as run:
    # Preprocessing and estimator live in one pipeline, so the logged model
    # takes the raw feature columns as input
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    # Fit on the training split, score on the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Linear Regression MAE: {mae}")
    print(f"Linear Regression R2: {r2}")

    mlflow.log_param("model_type", "Linear Regression")
    # Metric keys are lower-case like in every other run of this notebook, so
    # the baseline lands in the same "mae" / "r2" columns when runs are compared
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted pipeline as a run artifact
    mlflow.sklearn.log_model(model, "linear_regression_model")

    print(f"Run ID: {run.info.run_id} logged to MLflow.")

Linear Regression MAE: 0.18510684945236694
Linear Regression R2: 0.07474764230234732
Run ID: ccdeec20a7a14e0c8af293cff209b369 logged to MLflow.


In [10]:
# --- 2. Define Parameter Sampling Functions ---
# We'll use these to get random values for each run

def get_rf_params(rng=None):
    """Sample one random hyperparameter set for a random forest regressor.

    Args:
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used, exactly as before;
            call ``random.seed(...)`` first if the draws must be repeatable.

    Returns:
        dict: Keyword arguments with the keys ``n_estimators``, ``max_depth``,
        ``min_samples_leaf``, ``min_samples_split`` and ``max_features``.
    """
    rng = random if rng is None else rng
    # randint() is inclusive on both ends. The draw order is fixed so a given
    # seed always yields the same dictionary.
    return {
        'n_estimators': rng.randint(50, 500),
        'max_depth': rng.choice([3, 5, 7, 10, 12, 14, 20, None]),
        'min_samples_leaf': rng.randint(1, 7),
        'min_samples_split': rng.randint(2, 11),
        'max_features': rng.choice(['sqrt', 'log2', None])
    }

def get_gb_params(rng=None):
    """Sample one random hyperparameter set for a gradient boosting regressor.

    Args:
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used, exactly as before;
            call ``random.seed(...)`` first if the draws must be repeatable.

    Returns:
        dict: Keyword arguments with the keys ``n_estimators``,
        ``learning_rate``, ``max_depth``, ``subsample``, ``min_samples_leaf``
        and ``max_features``.
    """
    rng = random if rng is None else rng
    # randint() is inclusive on both ends; uniform(a, b) draws a float between
    # a and b. The draw order is fixed so a given seed always yields the same
    # dictionary.
    return {
        'n_estimators': rng.randint(100, 800),
        'learning_rate': rng.uniform(0.01, 0.3),
        'max_depth': rng.choice([3, 5, 7, 9, 10, 12]),
        'subsample': rng.uniform(0.7, 1.0),
        'min_samples_leaf': rng.randint(1, 5),
        'max_features': rng.choice(['sqrt', 'log2', None])
    }

# --- 3. Run manual random-search experiments (Random Forest, then Gradient Boosting) ---
N_RUNS = 30
MANUAL_SEARCH_SEED = 42


def run_manual_experiments(label, model_type, estimator_cls, sample_params, n_runs=N_RUNS):
    """Train, evaluate and log ``n_runs`` randomly configured models to MLflow.

    Every iteration opens its own MLflow run, draws hyperparameters with
    ``sample_params()``, fits ``preprocessor`` plus the estimator on the
    training split, scores it on the held-out split, and logs the parameters,
    the ``mae`` / ``r2`` metrics and the fitted pipeline.

    Reads ``preprocessor``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    from the notebook namespace.

    Args:
        label: Short tag used in the progress message (e.g. ``"RF"``).
        model_type: Value logged as the ``model_type`` param; also the prefix
            of each run name (``"<model_type>_Run_<k>"``).
        estimator_cls: Regressor class; instantiated with ``random_state=42``
            and the sampled hyperparameters.
        sample_params: Zero-argument callable returning a dict of keyword
            arguments for ``estimator_cls``.
        n_runs: Number of runs to execute.
    """
    for i in range(n_runs):
        # One independent MLflow run per sampled configuration
        with mlflow.start_run(run_name=f"{model_type}_Run_{i+1}"):
            params = sample_params()
            model_pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', estimator_cls(random_state=42, **params))
            ])

            mlflow.log_param("model_type", model_type)
            mlflow.log_params(params)

            model_pipeline.fit(X_train, y_train)

            y_pred = model_pipeline.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            # Store the fitted pipeline as a run artifact
            mlflow.sklearn.log_model(model_pipeline, artifact_path="model")

        print(f"Completed {label} Run {i+1}/{n_runs} (MAE: {mae:.3f})")


# The samplers draw from the module-level `random` generator; seeding it makes
# the sampled configurations repeatable across kernel restarts.
random.seed(MANUAL_SEARCH_SEED)

print(f"--- Starting {N_RUNS} Random Forest Runs ---")
run_manual_experiments("RF", "RandomForest", RandomForestRegressor, get_rf_params)

print(f"\n--- Starting {N_RUNS} Gradient Boosting Runs ---")
run_manual_experiments("GB", "GradientBoosting", GradientBoostingRegressor, get_gb_params)

print(f"\nAll {2 * N_RUNS} manual experiments complete!")

--- Starting 30 Random Forest Runs ---




Completed RF Run 1/30 (MAE: 0.184)




Completed RF Run 2/30 (MAE: 0.183)




Completed RF Run 3/30 (MAE: 0.182)




Completed RF Run 4/30 (MAE: 0.178)




Completed RF Run 5/30 (MAE: 0.184)




Completed RF Run 6/30 (MAE: 0.181)




Completed RF Run 7/30 (MAE: 0.179)




Completed RF Run 8/30 (MAE: 0.182)




Completed RF Run 9/30 (MAE: 0.183)




Completed RF Run 10/30 (MAE: 0.185)




Completed RF Run 11/30 (MAE: 0.183)




Completed RF Run 12/30 (MAE: 0.181)




Completed RF Run 13/30 (MAE: 0.185)




Completed RF Run 14/30 (MAE: 0.179)




Completed RF Run 15/30 (MAE: 0.180)




Completed RF Run 16/30 (MAE: 0.185)




Completed RF Run 17/30 (MAE: 0.182)




Completed RF Run 18/30 (MAE: 0.177)




Completed RF Run 19/30 (MAE: 0.183)




Completed RF Run 20/30 (MAE: 0.183)




Completed RF Run 21/30 (MAE: 0.183)




Completed RF Run 22/30 (MAE: 0.178)




Completed RF Run 23/30 (MAE: 0.181)




Completed RF Run 24/30 (MAE: 0.184)




Completed RF Run 25/30 (MAE: 0.184)




Completed RF Run 26/30 (MAE: 0.181)




Completed RF Run 27/30 (MAE: 0.182)




Completed RF Run 28/30 (MAE: 0.178)




Completed RF Run 29/30 (MAE: 0.181)




Completed RF Run 30/30 (MAE: 0.179)

--- Starting 30 Gradient Boosting Runs ---




Completed GB Run 1/30 (MAE: 0.216)




Completed GB Run 2/30 (MAE: 0.218)




Completed GB Run 3/30 (MAE: 0.230)




Completed GB Run 4/30 (MAE: 0.203)




Completed GB Run 5/30 (MAE: 0.189)




Completed GB Run 6/30 (MAE: 0.206)




Completed GB Run 7/30 (MAE: 0.215)




Completed GB Run 8/30 (MAE: 0.182)




Completed GB Run 9/30 (MAE: 0.214)




Completed GB Run 10/30 (MAE: 0.216)




Completed GB Run 11/30 (MAE: 0.206)




Completed GB Run 12/30 (MAE: 0.203)




Completed GB Run 13/30 (MAE: 0.196)




Completed GB Run 14/30 (MAE: 0.219)




Completed GB Run 15/30 (MAE: 0.212)




Completed GB Run 16/30 (MAE: 0.222)




Completed GB Run 17/30 (MAE: 0.181)




Completed GB Run 18/30 (MAE: 0.194)




Completed GB Run 19/30 (MAE: 0.226)




Completed GB Run 20/30 (MAE: 0.218)




Completed GB Run 21/30 (MAE: 0.216)




Completed GB Run 22/30 (MAE: 0.182)




Completed GB Run 23/30 (MAE: 0.208)




Completed GB Run 24/30 (MAE: 0.225)




Completed GB Run 25/30 (MAE: 0.222)




Completed GB Run 26/30 (MAE: 0.198)




Completed GB Run 27/30 (MAE: 0.209)




Completed GB Run 28/30 (MAE: 0.209)




Completed GB Run 29/30 (MAE: 0.205)
Completed GB Run 30/30 (MAE: 0.202)

All 60 manual experiments complete!




In [11]:
# Randomised hyperparameter search (5-fold CV, 30 candidates) for gradient boosting
gb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# Search space. The "regressor__" prefix routes each entry to the pipeline's
# final step. scipy conventions: randint(low, high) never returns `high`;
# uniform(loc, scale) spans loc .. loc + scale.
param_dist = dict(
    regressor__n_estimators=randint(50, 500),       # 50 .. 499
    regressor__learning_rate=uniform(0.01, 0.29),   # 0.01 .. 0.30
    regressor__max_depth=randint(3, 12),            # 3 .. 11
    regressor__subsample=uniform(0.7, 0.3),         # 0.7 .. 1.0
    regressor__min_samples_leaf=randint(1, 5),      # 1 .. 4
    regressor__max_features=['sqrt', 'log2', None],
)

random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# The scorer is the negated MAE, so flip the sign before reporting
print("Best parameters found: ", random_search.best_params_)
print("Best MAE found: ", -random_search.best_score_)

Best parameters found:  {'regressor__learning_rate': 0.01596950334578271, 'regressor__max_depth': 4, 'regressor__max_features': 'log2', 'regressor__min_samples_leaf': 2, 'regressor__n_estimators': 435, 'regressor__subsample': 0.7545474901621302}
Best MAE found:  0.1823960881371908


In [12]:
# Evaluate the best gradient boosting pipeline from the search on the held-out
# split and log it to MLflow
best_gb_model = random_search.best_estimator_

with mlflow.start_run(run_name="Gradient_Boosting_from_RandomizedSearchCV_1") as run:
    y_pred = best_gb_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Best Gradient Boosting MAE: {mae}")
    print(f"Best Gradient Boosting R2: {r2}")

    # Same model_type label as the manual gradient boosting runs, so one filter
    # in MLflow finds every run of this model family
    mlflow.log_param("model_type", "GradientBoosting")
    # Drop the pipeline prefix ("regressor__") so the parameter names match the
    # ones logged by the manual runs (n_estimators, max_depth, ...)
    mlflow.log_params({
        name.split("__", 1)[-1]: value
        for name, value in random_search.best_params_.items()
    })
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted pipeline as a run artifact
    mlflow.sklearn.log_model(best_gb_model, "best_gradient_boosting_model")

    print(f"Run ID: {run.info.run_id} logged to MLflow.")

Best Gradient Boosting MAE: 0.17815564097660533
Best Gradient Boosting R2: 0.0893525440915629
Run ID: 6dc2ce69fa104f2eb9371b67ec04a74e logged to MLflow.




In [13]:
# Randomised hyperparameter search (5-fold CV, 40 candidates) for random forest
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Search space. The "regressor__" prefix routes each entry to the pipeline's
# final step. scipy's randint(low, high) never returns `high`.
param_dist = dict(
    regressor__n_estimators=randint(50, 500),      # 50 .. 499
    regressor__max_depth=randint(3, 20),           # 3 .. 19
    regressor__min_samples_leaf=randint(1, 7),     # 1 .. 6
    regressor__min_samples_split=randint(2, 11),   # 2 .. 10
    regressor__max_features=['sqrt', 'log2', None],
)

random_search_rf = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=40,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# The scorer is the negated MAE, so flip the sign before reporting
print("Best parameters found: ", random_search_rf.best_params_)
print("Best MAE found: ", -random_search_rf.best_score_)

Best parameters found:  {'regressor__max_depth': 4, 'regressor__max_features': None, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 8, 'regressor__n_estimators': 477}
Best MAE found:  0.1812820101832942


In [14]:
# Evaluate the best random forest pipeline from the search on the held-out
# split and log it to MLflow
best_rf_model = random_search_rf.best_estimator_

with mlflow.start_run(run_name="Random_Forest_from_RandomizedSearchCV_1") as run:
    y_pred = best_rf_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Best Random Forest MAE: {mae}")
    print(f"Best Random Forest R2: {r2}")

    # Same model_type label as the manual random forest runs, so one filter in
    # MLflow finds every run of this model family
    mlflow.log_param("model_type", "RandomForest")
    # Drop the pipeline prefix ("regressor__") so the parameter names match the
    # ones logged by the manual runs (n_estimators, max_depth, ...)
    mlflow.log_params({
        name.split("__", 1)[-1]: value
        for name, value in random_search_rf.best_params_.items()
    })
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted pipeline as a run artifact
    mlflow.sklearn.log_model(best_rf_model, "best_random_forest_model")

    print(f"Run ID: {run.info.run_id} logged to MLflow.")

Best Random Forest MAE: 0.1801411440161394
Best Random Forest R2: 0.057764340900299915
Run ID: 523acdece41f4c61b9fc29a098a4e138 logged to MLflow.




In [16]:
# --- Download a registered model from the MLflow Model Registry ---
# NOTE(review): no cell in this notebook registers this model, so version 1 is
# presumably registered elsewhere (MLflow UI or a removed cell) - confirm
# before relying on Restart & Run All.
model_name = "Restaurant_rating_prediction_regression"
model_version = 1
model_uri = f"models:/{model_name}/{model_version}"

# Local destination folder for the downloaded artifacts
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

print(f"Downloading model: {model_name} version {model_version}")
print(f"Model URI: {model_uri}")

# Fetch the registered model's artifacts into models_dir
download_path = mlflow.artifacts.download_artifacts(
    artifact_uri=model_uri,
    dst_path=str(models_dir)
)

# Report only the path MLflow actually returned. The previous message
# advertised a models/<name>/<version>/ sub-folder that the download does not
# create.
print("\n✅ Model downloaded successfully!")
print(f"📁 Location: {download_path}")

Downloading model: Restaurant_rating_prediction_regression version 1
Model URI: models:/Restaurant_rating_prediction_regression/1

✅ Model downloaded successfully!
📁 Location: /Users/salikali/VSCodeProjects/Uni/MLOps/Project/Taste-Karachi/models/

You can now find your model at:
   models/Restaurant_rating_prediction_regression/1/
