In [1]:
from pycaret.regression import *
import pandas as pd

In [2]:
# Load the cleaned dataset
df = pd.read_csv('cleaned_melbourne_data.csv')

In [3]:

# Display dataset information
print("Dataset Overview:")
print(df.info())
print("\nFirst 5 Rows:")
print(df.head())


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  int64  
 5   Method         13580 non-null  object 
 6   Seller         13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  int64  
 10  Bedroom2       13580 non-null  int64  
 11  Bathroom       13580 non-null  int64  
 12  Car            13580 non-null  float64
 13  Landsize       13580 non-null  int64  
 14  BuildingArea   13580 non-null  float64
 15  YearBuilt      13580 non-null  float64
 16  Lattitude      13580 non-null  float64
 17  Longtitude     13580 non-null  f

In [4]:
# Set up the PyCaret regression environment
print("\nInitializing PyCaret Setup...")
exp = setup(df, target='Price', session_id=42, normalize=True, remove_multicollinearity=True, train_size=0.8)


Initializing PyCaret Setup...


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Price
2,Target type,Regression
3,Original data shape,"(13580, 20)"
4,Transformed data shape,"(13580, 32)"
5,Transformed train set shape,"(10864, 32)"
6,Transformed test set shape,"(2716, 32)"
7,Numeric features,12
8,Categorical features,7
9,Preprocess,True


In [5]:
# Compare multiple regression models and select top-performing ones
print("\nComparing Models...")
best_models = compare_models(n_select=5, sort='R2')
print("\nTop 5 Selected Models:")
print(best_models)


Comparing Models...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,247539.7003,190508792480.3145,434303.9433,0.539,0.2887,0.2195,0.129
en,Elastic Net,318695.8622,236198565886.7352,482432.8748,0.4313,0.3852,0.3568,0.098
ada,AdaBoost Regressor,432860.3436,374502444890.7216,610684.7921,0.0857,0.5135,0.4995,0.36
xgboost,Extreme Gradient Boosting,444159.1888,389636233631.4344,622921.0693,0.0493,0.5257,0.5176,0.165
et,Extra Trees Regressor,448992.8581,390889133479.94,623826.6762,0.0468,0.5319,0.5316,0.808
catboost,CatBoost Regressor,447304.6106,391692860849.6026,624601.4562,0.0441,0.5307,0.5272,1.875
lightgbm,Light Gradient Boosting Machine,446533.672,394437864350.7677,626722.3875,0.0377,0.5298,0.5232,0.635
gbr,Gradient Boosting Regressor,454689.2814,402241418975.8039,632972.5773,0.0183,0.5381,0.5363,0.678
ridge,Ridge Regression,453717.0679,402626712774.6196,633287.5593,0.0173,0.5376,0.5348,0.097
br,Bayesian Ridge,453804.4947,402756920817.9928,633390.3782,0.017,0.5377,0.5349,0.093



Top 5 Selected Models:
[KNeighborsRegressor(n_jobs=-1), ElasticNet(random_state=42), AdaBoostRegressor(random_state=42), XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=-1,
             num_parallel_tree=None, random_state=42, ...), ExtraTreesRegressor(n_jobs=-1, random_state=42)]


In [6]:
# Tune hyperparameters of selected models
print("\nTuning Selected Models...")
tuned_models = []
for model in best_models:
    print(f"Tuning {model}")
    tuned_model = tune_model(model, optimize='R2')
    tuned_models.append(tuned_model)
    print(f"Best Tuned Model Parameters: {tuned_model}")


Tuning Selected Models...
Tuning KNeighborsRegressor(n_jobs=-1)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,225778.3306,180254842836.7042,424564.2976,0.5932,0.2755,0.216
1,231901.8896,162480326137.8351,403088.4843,0.6093,0.2643,0.2042
2,245862.9524,185812279918.8212,431059.4854,0.5704,0.2857,0.2132
3,249470.2914,274746424130.7029,524162.5932,0.4913,0.2858,0.2042
4,235225.1122,143184769549.8417,378397.6342,0.5973,0.2767,0.2148
5,211768.1133,142189501205.9386,377080.2318,0.6139,0.2584,0.1949
6,228924.6356,153261956738.3112,391486.8539,0.6049,0.258,0.1931
7,218494.8381,140064444710.2166,374251.8466,0.6271,0.2639,0.2061
8,229944.5536,150074227488.9901,387394.15,0.6175,0.2703,0.2077
9,228734.1081,144412338259.3431,380016.2342,0.6234,0.2565,0.1943


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Tuned Model Parameters: KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=8)
Tuning ElasticNet(random_state=42)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,323968.8186,261756586275.8743,511621.5264,0.4093,0.4101,0.3878
1,329480.1415,238031498816.0538,487884.7188,0.4277,0.3939,0.3711
2,339704.8956,252128984085.9687,502124.4707,0.417,0.399,0.3683
3,351896.7901,441343596419.2439,664336.96,0.1828,0.4132,0.3726
4,316537.5511,196130713807.9116,442866.474,0.4484,0.3843,0.3591
5,316539.3476,206799384482.3213,454752.0033,0.4385,0.392,0.3732
6,315829.0747,216468179175.6889,465261.4095,0.4419,0.3727,0.3369
7,321154.3305,208810342112.9072,456957.7028,0.4441,0.4033,0.386
8,328639.2858,217114448452.0553,465955.4147,0.4466,0.3956,0.3731
9,316975.105,207284062822.4517,455284.5954,0.4595,0.3699,0.3391


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Best Tuned Model Parameters: ElasticNet(random_state=42)
Tuning AdaBoostRegressor(random_state=42)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,437340.1713,404500924113.5918,636003.8711,0.0871,0.5307,0.525
1,442414.0047,379784766123.6364,616266.7978,0.0869,0.5228,0.5263
2,445984.2068,398512227383.4718,631278.2488,0.0786,0.5206,0.4935
3,443271.9451,490779840135.4322,700556.8072,0.0913,0.519,0.497
4,424531.4052,316843027410.6908,562888.1127,0.1089,0.5045,0.5058
5,390567.593,313889357654.9099,560258.2955,0.1477,0.4743,0.4335
6,396789.8786,327616844400.4602,572378.2354,0.1553,0.4655,0.4257
7,440465.9747,348346371943.4126,590208.7529,0.0727,0.5405,0.5551
8,453879.1208,366705808583.984,605562.3903,0.0654,0.5362,0.5489
9,433291.6082,331781705737.3621,576004.9529,0.1348,0.5011,0.5047


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Tuned Model Parameters: AdaBoostRegressor(learning_rate=0.4, loss='square', n_estimators=210,
                  random_state=42)
Tuning XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=-1,
             num_parallel_tree=None, random_state=42, ...)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,341705.3264,256868391887.1218,506821.8542,0.4203,0.4284,0.416
1,344904.1126,238444597881.7114,488307.8925,0.4267,0.4147,0.4011
2,343127.0878,244588847212.638,494559.2454,0.4345,0.4102,0.3817
3,354268.9117,351679403237.4014,593025.6346,0.3488,0.4215,0.3879
4,323136.3886,193929066836.6337,440373.7808,0.4546,0.3919,0.3697
5,331045.6282,208402868673.197,456511.6304,0.4342,0.4121,0.4018
6,327252.4744,227373146342.0446,476836.6034,0.4138,0.3853,0.3508
7,334755.6386,208681724010.3892,456816.948,0.4445,0.4269,0.4188
8,333686.9466,211861455691.6114,460284.1032,0.46,0.4073,0.3887
9,335400.8031,213638209975.6103,462210.1362,0.4429,0.3941,0.3719


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Tuned Model Parameters: XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.5, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=8, max_leaves=None,
             min_child_weight=4, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=130, n_jobs=-1,
             num_parallel_tree=None, random_state=42, ...)
Tuning ExtraTreesRegressor(n_jobs=-1, random_state=42)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,275409.0133,200050631517.5729,447270.1997,0.5485,0.3541,0.3252
1,265140.7856,172166841984.5543,414929.9242,0.586,0.3199,0.284
2,279191.8828,182218911881.5366,426871.0717,0.5787,0.3355,0.2973
3,291391.3185,299203646934.819,546995.1069,0.446,0.3449,0.2874
4,263329.0592,140926182772.6367,375401.3622,0.6037,0.3267,0.2939
5,259257.4866,150856707414.2747,388402.7644,0.5904,0.3288,0.2968
6,264203.586,165061604864.2511,406277.7435,0.5744,0.3126,0.2701
7,261073.069,150257547594.2747,387630.6845,0.6,0.3309,0.299
8,267585.8335,156219507112.4622,395246.1348,0.6018,0.3304,0.2967
9,261392.4236,152672897193.8663,390733.7933,0.6019,0.3076,0.2675


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Tuned Model Parameters: ExtraTreesRegressor(max_depth=11, max_features='log2',
                    min_impurity_decrease=0.0001, min_samples_leaf=3,
                    min_samples_split=7, n_estimators=30, n_jobs=-1,
                    random_state=42)


In [7]:
# Evaluate each tuned model
print("\nEvaluating Models...")
for model in tuned_models:
    print(f"\nEvaluating {model}")
    evaluate_model(model)



Evaluating Models...

Evaluating KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=8)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…


Evaluating ElasticNet(random_state=42)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…


Evaluating AdaBoostRegressor(learning_rate=0.4, loss='square', n_estimators=210,
                  random_state=42)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…


Evaluating XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.5, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=8, max_leaves=None,
             min_child_weight=4, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=130, n_jobs=-1,
             num_parallel_tree=None, random_state=42, ...)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…


Evaluating ExtraTreesRegressor(max_depth=11, max_features='log2',
                    min_impurity_decrease=0.0001, min_samples_leaf=3,
                    min_samples_split=7, n_estimators=30, n_jobs=-1,
                    random_state=42)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
# Select the best model based on R2 score
best_model = tuned_models[0]  # The first model is the best one after sorting
print(f"\nFinalizing and Saving the Best Model: {best_model}")

# Finalize the best model
final_best_model = finalize_model(best_model)

# Save only the best model
save_model(final_best_model, 'C:/Users/srira/OneDrive/MLOPS/individual/models')

print("\n✅ Best Model Saved Successfully!")



Finalizing and Saving the Best Model: KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=8)
Transformation Pipeline and Model Successfully Saved

✅ Best Model Saved Successfully!


In [None]:
print("\nModel training and saving completed successfully.")


In [2]:
from pycaret.regression import load_model

model = load_model("C:/Users/srira/OneDrive/MLOPS/individual/model/final_model_1")
print(model)


Transformation Pipeline and Model Successfully Loaded
Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['Rooms', 'Distance', 'Postcode',
                                             'Bedroom2', 'Bathroom', 'Car',
                                             'Landsize', 'BuildingArea',
                                             'YearBuilt', 'Lattitude',
                                             'Longtitude', 'Propertycount'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=['Suburb', 'Address', 'Type',
                                             'Method', 'S...
                                    transformer=TargetEncoder(cols=['Suburb',
                                                                    'Address',
                                                                    'Seller',
                 