In [95]:
# imports
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
import random
from pathlib import Path
import boto3, os

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Progress marker: reaching this line means every import above succeeded.
print("Libraries imported!")

Libraries imported!


In [96]:
# data loading
# The CSV location can be overridden with the TASTE_KARACHI_TRAIN_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original absolute path is used unchanged.
TRAIN_SET_PATH = Path(
    os.environ.get(
        "TASTE_KARACHI_TRAIN_CSV",
        '/Users/salikali/VSCodeProjects/Uni/MLOps/Project/Taste-Karachi/data/train_set.csv',
    )
)
df = pd.read_csv(TRAIN_SET_PATH)
print("Data loaded!")

Data loaded!


In [97]:
# Preview the first five rows to eyeball column names and value types.
df.head(n=5)

Unnamed: 0,area,price_level,latitude,longitude,category,dine_in,takeout,delivery,reservable,serves_breakfast,...,restroom,parking_free_lot,parking_free_street,accepts_debit_cards,accepts_cash_only,wheelchair_accessible,is_open_24_7,open_after_midnight,is_closed_any_day,rating
0,Gulshan-e-Iqbal,PRICE_LEVEL_INEXPENSIVE,24.863728,67.153379,Restaurant,True,True,True,False,True,...,True,True,True,False,True,False,True,False,False,4.2
1,Surjani Town,PRICE_LEVEL_INEXPENSIVE,25.011558,67.057304,Restaurant,True,True,True,False,True,...,False,False,True,False,True,False,True,False,False,4.2
2,PECHS,PRICE_LEVEL_MODERATE,24.868304,67.057431,Bakery,True,True,True,False,True,...,True,False,True,True,False,False,False,False,False,4.5
3,PECHS,PRICE_LEVEL_MODERATE,24.866186,67.077431,Restaurant,True,True,True,True,True,...,True,True,True,True,False,False,False,True,False,4.5
4,Gulshan-e-Iqbal,PRICE_LEVEL_MODERATE,24.904308,67.077439,Restaurant,True,True,True,True,False,...,True,True,True,True,False,True,True,False,False,3.9


In [98]:
# --- 1. Define Column Lists ---

# String-valued ('object') columns, meant to be encoded.
CATEGORICAL_COLS = ['area', 'price_level', 'category']

# Floating-point ('float64') columns, meant to be scaled.
NUMERIC_COLS = ['latitude', 'longitude']

# Boolean flag columns, meant to be used as-is. One name per line so that
# additions and removals show up as single-line diffs.
BINARY_COLS = [
    # service options
    'dine_in',
    'takeout',
    'delivery',
    'reservable',
    # what is served
    'serves_breakfast',
    'serves_lunch',
    'serves_dinner',
    'serves_coffee',
    'serves_dessert',
    # atmosphere / audience
    'outdoor_seating',
    'live_music',
    'good_for_children',
    'good_for_groups',
    'good_for_watching_sports',
    # facilities
    'restroom',
    'parking_free_lot',
    'parking_free_street',
    # payment
    'accepts_debit_cards',
    'accepts_cash_only',
    # accessibility and opening hours
    'wheelchair_accessible',
    'is_open_24_7',
    'open_after_midnight',
    'is_closed_any_day',
]

# Every model input (categorical, then numeric, then binary) and the column to predict.
FEATURES = [*CATEGORICAL_COLS, *NUMERIC_COLS, *BINARY_COLS]
TARGET = 'rating'

In [99]:
# --- 2. Create Preprocessing Pipelines ---

# Categorical branch: a single one-hot encoding step that expands each
# category value into its own indicator column. handle_unknown='ignore'
# keeps transform() from raising when it meets a category value that was
# not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: a single standardisation step so the numeric features
# end up on a comparable scale.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

In [100]:
# --- 3. Build the Master ColumnTransformer ---

# Route each column group to its own branch:
#   'cat' -> one-hot encoding pipeline
#   'num' -> scaling pipeline
#   'bin' -> 'passthrough', i.e. the boolean columns are forwarded unchanged
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, CATEGORICAL_COLS),
    ('num', numeric_transformer, NUMERIC_COLS),
    ('bin', 'passthrough', BINARY_COLS),
])

# Confirmation message on the first line, the transformer's repr after it.
print("Preprocessing pipeline created successfully!", preprocessor, sep="\n")

Preprocessing pipeline created successfully!
ColumnTransformer(transformers=[('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['area', 'price_level', 'category']),
                                ('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['latitude', 'longitude']),
                                ('bin', 'passthrough',
                                 ['dine_in', 'takeout', 'delivery',
                                  'reservable', 'serves_breakfast',
                                  'serves_lunch', 'serves_dinner',
                                  'serves_coffee', 'serves_dessert',
                                  'outdoor_seating', 'live_music',
                                  'good_for_children', 'good_for_groups',
                                  'good_f

In [101]:
# --- 4. Create X and y, then Split Data ---

# FEATURES is reassigned here with the numeric columns first; this is the
# column order X (and therefore X_train / X_test) ends up with.
FEATURES = [*NUMERIC_COLS, *CATEGORICAL_COLS, *BINARY_COLS]

X = df[FEATURES]
y = df[TARGET]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Data is ready. Training with {len(X_train)} rows.")

Data is ready. Training with 582 rows.


In [102]:
# Address of the MLflow tracking server. A MLFLOW_TRACKING_URI value already
# present in the environment takes precedence, so the notebook can be pointed
# at another server without editing this cell. The original address is the
# fallback when the variable is unset.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://54.196.196.185:5000")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

# Intended to force artifact proxying through the server.
# NOTE(review): MLFLOW_ARTIFACTS_DESTINATION appears to be a server-side option
# (the storage location behind `mlflow server --artifacts-destination`), so
# setting it in the client process to the tracking URL may have no effect.
# Kept unchanged; confirm whether it is actually needed.
os.environ['MLFLOW_ARTIFACTS_DESTINATION'] = MLFLOW_TRACKING_URI

MLflow Tracking URI: http://54.196.196.185:5000


In [103]:
# Make this the active experiment for all runs started below; the returned
# Experiment object is left as the cell's displayed value.
experiment_name = "Taste-Karachi-Rating-Regressor"
mlflow.set_experiment(experiment_name=experiment_name)

2025/10/31 23:36:48 INFO mlflow.tracking.fluent: Experiment with name 'Taste-Karachi-Rating-Regressor' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/15', creation_time=1761935809127, experiment_id='15', last_update_time=1761935809127, lifecycle_stage='active', name='Taste-Karachi-Rating-Regressor', tags={}>

In [104]:
# Baseline: preprocessing + ordinary least squares, tracked as one MLflow run.
with mlflow.start_run(run_name="linear_regression_simple") as lr_run:

    # Not consumed by anything in this run (LinearRegression() is built with
    # its defaults); the assignment is retained so the name keeps its value.
    random_state = 42

    # Run-level parameters
    mlflow.log_params({
        "model_type": "LinearRegression",
        "test_size": 0.2,
    })

    # --------------------------------------------
    # Full pipeline = shared preprocessor + model
    # --------------------------------------------
    print("Creating full pipeline...")
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression()),
    ])

    # Fit on the training split, then score on the held-out split
    print("Training model...")
    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_test)

    # Regression metrics (RMSE is derived from MSE)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Record each metric under the same key the other runs use
    for metric_name, metric_value in (
        ("mae", mae),
        ("mse", mse),
        ("rmse", rmse),
        ("r2_score", r2),
    ):
        mlflow.log_metric(metric_name, metric_value)

    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

    # --------------------------------------------
    # Store the fitted pipeline as a run artifact
    # --------------------------------------------
    print("Logging model to MLflow...")
    mlflow.sklearn.log_model(full_pipeline, "linear_regression_model")

    print("✅ Model logged successfully!")
    print(f"Run ID: {lr_run.info.run_id}")

# Closing banner, printed once the run context has exited
print("\n" + "=" * 50)
print("Experiment completed!")
print(f"View results at: {MLFLOW_TRACKING_URI}")
print("=" * 50)

Creating full pipeline...
Training model...
MAE: 0.1851
RMSE: 0.2561
R² Score: 0.0747
Logging model to MLflow...




✅ Model logged successfully!
Run ID: e9cb9560ae4c4179b3a2d03a2633dc9c

Experiment completed!
View results at: http://54.196.196.185:5000


In [105]:
# --- 2. Define Parameter Sampling Functions ---
# We'll use these to get random values for each run

def get_rf_params(rng=None):
    """Draw one random hyperparameter set for a Random Forest regressor.

    Args:
        rng: Optional ``random.Random`` instance to draw from. Passing a
            seeded instance makes the sample reproducible. When omitted,
            the module-level ``random`` generator is used, exactly as before.

    Returns:
        dict with the keys ``n_estimators`` (int in 50..500),
        ``max_depth`` (one of 3, 5, 7, 10, 12, 14, 20 or None),
        ``min_samples_leaf`` (int in 1..7), ``min_samples_split``
        (int in 2..11) and ``max_features`` ('sqrt', 'log2' or None).
        Integer ranges are inclusive on both ends.
    """
    source = random if rng is None else rng
    return {
        'n_estimators': source.randint(50, 500),
        'max_depth': source.choice([3, 5, 7, 10, 12, 14, 20, None]),
        'min_samples_leaf': source.randint(1, 7),
        'min_samples_split': source.randint(2, 11),
        'max_features': source.choice(['sqrt', 'log2', None])
    }

def get_gb_params(rng=None):
    """Draw one random hyperparameter set for a Gradient Boosting regressor.

    Args:
        rng: Optional ``random.Random`` instance to draw from. Passing a
            seeded instance makes the sample reproducible. When omitted,
            the module-level ``random`` generator is used, exactly as before.

    Returns:
        dict with the keys ``n_estimators`` (int in 100..800, inclusive),
        ``learning_rate`` (float drawn from 0.01 to 0.3), ``max_depth``
        (one of 3, 5, 7, 9, 10, 12), ``subsample`` (float drawn from 0.7
        to 1.0), ``min_samples_leaf`` (int in 1..5, inclusive) and
        ``max_features`` ('sqrt', 'log2' or None).
    """
    source = random if rng is None else rng
    return {
        'n_estimators': source.randint(100, 800),
        'learning_rate': source.uniform(0.01, 0.3),
        'max_depth': source.choice([3, 5, 7, 9, 10, 12]),
        'subsample': source.uniform(0.7, 1.0),
        'min_samples_leaf': source.randint(1, 5),
        'max_features': source.choice(['sqrt', 'log2', None])
    }

# --- 3. Run Manual Random Forest Experiments ---
N_RUNS = 10

# Seed Python's `random` module, which the parameter samplers draw from by
# default, so re-running this cell reproduces the same hyperparameter sets.
random.seed(42)


def _fit_evaluate_and_log(pipeline, model_type, params):
    """Fit `pipeline` and record params, metrics and the model in the active MLflow run.

    Must be called inside a `with mlflow.start_run(...)` block. Uses the
    notebook-level X_train / y_train / X_test / y_test splits.

    Args:
        pipeline: Unfitted preprocessing + regressor pipeline; fitted in place.
        model_type: Value logged under the "model_type" param.
        params: Hyperparameter dict logged with mlflow.log_params.

    Returns:
        (predictions, scores): test-set predictions and a dict with the keys
        "mae", "mse", "rmse" and "r2_score".
    """
    mlflow.log_param("model_type", model_type)
    mlflow.log_params(params)

    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)
    mae_value = mean_absolute_error(y_test, predictions)
    mse_value = mean_squared_error(y_test, predictions)
    scores = {
        "mae": mae_value,
        "mse": mse_value,
        "rmse": np.sqrt(mse_value),
        "r2_score": r2_score(y_test, predictions),
    }
    for metric_name, metric_value in scores.items():
        mlflow.log_metric(metric_name, metric_value)

    # Log the model file itself (no registration)
    mlflow.sklearn.log_model(pipeline, artifact_path="model")
    return predictions, scores


print(f"--- Starting {N_RUNS} Random Forest Runs ---")

for i in range(N_RUNS):

    # Start a new, independent run for each loop
    with mlflow.start_run(run_name=f"RandomForest_Run_{i+1}"):
        rf_params = get_rf_params()
        model = RandomForestRegressor(random_state=42, **rf_params)
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        y_pred, scores = _fit_evaluate_and_log(model_pipeline, "RandomForest", rf_params)

    # Keep the notebook-level metric names populated, as before
    mae, mse, rmse, r2 = (scores[key] for key in ("mae", "mse", "rmse", "r2_score"))
    print(f"Completed RF Run {i+1}/{N_RUNS} (MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f})")

print(f"\n--- Starting {N_RUNS} Gradient Boosting Runs ---")

# --- 4. Run Manual Gradient Boosting Experiments ---
for i in range(N_RUNS):

    with mlflow.start_run(run_name=f"GradientBoosting_Run_{i+1}"):
        gb_params = get_gb_params()
        model = GradientBoostingRegressor(random_state=42, **gb_params)
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        y_pred, scores = _fit_evaluate_and_log(model_pipeline, "GradientBoosting", gb_params)

    mae, mse, rmse, r2 = (scores[key] for key in ("mae", "mse", "rmse", "r2_score"))
    print(f"Completed GB Run {i+1}/{N_RUNS} (MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f})")

print(f"\nAll {2 * N_RUNS} manual experiments complete!")

--- Starting 10 Random Forest Runs ---




Completed RF Run 1/10 (MAE: 0.184, RMSE: 0.260, R²: 0.044)




Completed RF Run 2/10 (MAE: 0.181, RMSE: 0.259, R²: 0.056)




Completed RF Run 3/10 (MAE: 0.180, RMSE: 0.260, R²: 0.046)




Completed RF Run 4/10 (MAE: 0.179, RMSE: 0.257, R²: 0.066)




Completed RF Run 5/10 (MAE: 0.179, RMSE: 0.260, R²: 0.046)




Completed RF Run 6/10 (MAE: 0.184, RMSE: 0.261, R²: 0.040)




Completed RF Run 7/10 (MAE: 0.181, RMSE: 0.260, R²: 0.049)




Completed RF Run 8/10 (MAE: 0.179, RMSE: 0.258, R²: 0.059)




Completed RF Run 9/10 (MAE: 0.181, RMSE: 0.260, R²: 0.050)




Completed RF Run 10/10 (MAE: 0.179, RMSE: 0.259, R²: 0.051)

--- Starting 10 Gradient Boosting Runs ---




Completed GB Run 1/10 (MAE: 0.183, RMSE: 0.258, R²: 0.061)




Completed GB Run 2/10 (MAE: 0.218, RMSE: 0.294, R²: -0.223)




Completed GB Run 3/10 (MAE: 0.219, RMSE: 0.295, R²: -0.225)




Completed GB Run 4/10 (MAE: 0.227, RMSE: 0.294, R²: -0.223)




Completed GB Run 5/10 (MAE: 0.198, RMSE: 0.267, R²: -0.006)




Completed GB Run 6/10 (MAE: 0.193, RMSE: 0.270, R²: -0.030)




Completed GB Run 7/10 (MAE: 0.188, RMSE: 0.257, R²: 0.069)




Completed GB Run 8/10 (MAE: 0.201, RMSE: 0.278, R²: -0.088)




Completed GB Run 9/10 (MAE: 0.198, RMSE: 0.267, R²: -0.006)




Completed GB Run 10/10 (MAE: 0.218, RMSE: 0.285, R²: -0.146)

All 20 manual experiments complete!


In [106]:
# randomised search cv to find best hyperparameters for gradient boosting regressor
gb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Search space. scipy conventions: randint(low, high) excludes `high`, and
# uniform(loc, scale) spans [loc, loc + scale]. So learning_rate covers
# 0.01-0.30, subsample 0.7-1.0, max_depth 3-11 and min_samples_leaf 1-4.
param_dist = {
    'regressor__n_estimators': randint(50, 500),
    'regressor__learning_rate': uniform(0.01, 0.29),
    'regressor__max_depth': randint(3, 12),
    'regressor__subsample': uniform(0.7, 0.3),
    'regressor__min_samples_leaf': randint(1, 5),
    'regressor__max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(
    gb_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
    n_jobs=-1
)

print("Running RandomizedSearchCV for Gradient Boosting...")
random_search.fit(X_train, y_train)

# The scorer is negated MAE, so flip the sign to report a plain MAE.
cv_best_mae = -random_search.best_score_

print("Best parameters found: ", random_search.best_params_)
print("Best MAE found: ", cv_best_mae)

# Log to MLflow
with mlflow.start_run(run_name="GradientBoosting_RandomizedSearchCV"):

    # Pipeline with the best parameter set found by the search
    best_gb_model = random_search.best_estimator_

    # Log model type and search settings. cv / n_iter are read back from the
    # search object so the logged values cannot drift from what was run.
    mlflow.log_param("model_type", "GradientBoosting")
    mlflow.log_param("search_method", "RandomizedSearchCV")
    mlflow.log_param("cv_folds", random_search.cv)
    mlflow.log_param("n_iter", random_search.n_iter)

    # Log best parameters
    mlflow.log_params(random_search.best_params_)

    # Make predictions on test set
    y_pred = best_gb_model.predict(X_test)

    # Calculate all metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Log all metrics
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("cv_best_mae", cv_best_mae)  # Cross-validation MAE

    print(f"\nTest Set Results:")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Log the best model
    mlflow.sklearn.log_model(best_gb_model, "best_gradient_boosting_model")

    print(f"✅ Model logged to MLflow!")
    print(f"Run ID: {mlflow.active_run().info.run_id}")

Running RandomizedSearchCV for Gradient Boosting...
Best parameters found:  {'regressor__learning_rate': 0.01596950334578271, 'regressor__max_depth': 4, 'regressor__max_features': 'log2', 'regressor__min_samples_leaf': 2, 'regressor__n_estimators': 435, 'regressor__subsample': 0.7545474901621302}
Best MAE found:  0.1823960881371908

Test Set Results:
MAE: 0.1782
RMSE: 0.2540
R² Score: 0.0894




✅ Model logged to MLflow!
Run ID: f61f1125f39c43b491c6ec12f13c3112


In [107]:
# randomised search cv to find best hyperparameters for random forest regressor
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Search space. scipy's randint(low, high) excludes `high`, so max_depth
# covers 3-19, min_samples_leaf 1-6 and min_samples_split 2-10.
param_dist = {
    'regressor__n_estimators': randint(50, 500),
    'regressor__max_depth': randint(3, 20),
    'regressor__min_samples_leaf': randint(1, 7),
    'regressor__min_samples_split': randint(2, 11),
    'regressor__max_features': ['sqrt', 'log2', None]
}

random_search_rf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=40,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
    n_jobs=-1
)

random_search_rf.fit(X_train, y_train)
print("Best parameters found: ", random_search_rf.best_params_)
print("Best MAE found: ", -random_search_rf.best_score_)

# run best model from randomised search cv and log to mlflow
best_rf_model = random_search_rf.best_estimator_

with mlflow.start_run(run_name="Random_Forest_from_RandomizedSearchCV_1") as run:
    # Make predictions
    y_pred = best_rf_model.predict(X_test)

    # Calculate ALL metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Best Random Forest MAE: {mae:.4f}")
    print(f"Best Random Forest MSE: {mse:.4f}")
    print(f"Best Random Forest RMSE: {rmse:.4f}")
    print(f"Best Random Forest R²: {r2:.4f}")

    # Log parameters to MLflow. model_type uses the same "RandomForest"
    # spelling as the manual Random Forest runs, so filtering runs by
    # model_type returns all of them together. The search settings are read
    # back from the search object rather than repeated as literals.
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("search_method", "RandomizedSearchCV")
    mlflow.log_param("cv_folds", random_search_rf.cv)
    mlflow.log_param("n_iter", random_search_rf.n_iter)
    mlflow.log_params(random_search_rf.best_params_)

    # Log ALL metrics, plus the cross-validation MAE for comparison with the
    # held-out test MAE (the scorer is negated, hence the sign flip).
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("cv_best_mae", -random_search_rf.best_score_)

    # Log the model itself (no registration)
    mlflow.sklearn.log_model(best_rf_model, "best_random_forest_model")

    print(f"✅ Run ID: {run.info.run_id} logged to MLflow.")

Best parameters found:  {'regressor__max_depth': 4, 'regressor__max_features': None, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 8, 'regressor__n_estimators': 477}
Best MAE found:  0.1812820101832942
Best Random Forest MAE: 0.1801
Best Random Forest MSE: 0.0668
Best Random Forest RMSE: 0.2584
Best Random Forest R²: 0.0578




✅ Run ID: 2bcd65c00cfc4af786a11f76f2affad1 logged to MLflow.
