In [14]:
from sklearn.pipeline import Pipeline
import pickle
import numpy as np

# Scraping the data preprocessing pipeline from the existing Pipeline
with open('PIPELINE.pkl' , 'rb') as file:
    pipeline = pickle.load(file)
with open('dataframe.pkl' , 'rb') as file:
    data = pickle.load(file)

data_pipeline = Pipeline(pipeline.steps[:-1])


In [15]:
data_pipeline

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7651 entries, 0 to 10664
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     7651 non-null   object 
 1   availability  7651 non-null   int64  
 2   location      7651 non-null   object 
 3   total_sqft    7651 non-null   float64
 4   bath          7651 non-null   float64
 5   price         7651 non-null   float64
 6   bhk           7651 non-null   int32  
dtypes: float64(3), int32(1), int64(1), object(2)
memory usage: 448.3+ KB


In [17]:
X = data.drop(columns = ['price'])
Y = data['price']

In [44]:
from sklearn.ensemble import GradientBoostingRegressor , AdaBoostRegressor , StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV

x_train , x_test , y_train , y_test = train_test_split(X , Y , shuffle = True , test_size = 0.2)

In [38]:
model1 = GradientBoostingRegressor()
model1_pipe = Pipeline([('Data_processing' , data_pipeline) , ('model1' , model1)])
parameters = {
    "model1__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "model1__max_depth":[3,5,8],
    "model1__max_features":["log2","sqrt"],
    "model1__criterion": ["friedman_mse"],
    "model1__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "model1__n_estimators":[50, 100, 150, 200]
}

gscv1 = GridSearchCV(model1_pipe , parameters , scoring='neg_mean_absolute_error' , cv = 5 , n_jobs=-1 , verbose = 3)
gscv1.fit(x_train , y_train)

Fitting 5 folds for each of 1176 candidates, totalling 5880 fits
[CV 5/5] END model1__criterion=friedman_mse, model1__learning_rate=0.01, model1__max_depth=3, model1__max_features=log2, model1__n_estimators=50, model1__subsample=0.5;, score=-47.027 total time=   0.1s
[CV 1/5] END model1__criterion=friedman_mse, model1__learning_rate=0.01, model1__max_depth=3, model1__max_features=log2, model1__n_estimators=50, model1__subsample=0.5;, score=-47.426 total time=   0.2s
[CV 4/5] END model1__criterion=friedman_mse, model1__learning_rate=0.01, model1__max_depth=3, model1__max_features=log2, model1__n_estimators=50, model1__subsample=0.618;, score=-48.581 total time=   0.1s
[CV 3/5] END model1__criterion=friedman_mse, model1__learning_rate=0.01, model1__max_depth=3, model1__max_features=log2, model1__n_estimators=50, model1__subsample=0.618;, score=-49.476 total time=   0.2s
[CV 3/5] END model1__criterion=friedman_mse, model1__learning_rate=0.01, model1__max_depth=3, model1__max_features=log2

In [39]:
print(gscv1.best_estimator_)
print('--'*10)
print(f'Best score :{gscv1.best_score_}')

Pipeline(steps=[('Data_processing',
                 Pipeline(steps=[('step1',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('col_tnf',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False),
                                                                   [0, 2])])),
                                 ('step2', StandardScaler())])),
                ('model1',
                 GradientBoostingRegressor(learning_rate=0.15, max_depth=8,
                                           max_features='sqrt',
                                           n_estimators=200, subsample=0.85))])
--------------------
Best score :-15.883028237152882


In [42]:
model2 = AdaBoostRegressor()
model2_pipe = Pipeline([('Data_processing' , data_pipeline) , ('model2' , model2)])
parameters = {
    "model2__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    'model2__loss' : ['linear', 'square', 'exponential'],
    "model2__n_estimators":[50, 100, 150, 200]
}

gscv2 = GridSearchCV(model2_pipe , parameters , scoring='neg_mean_absolute_error' , cv = 5 , n_jobs=-1 , verbose = 3)
gscv2.fit(x_train , y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits
[CV 2/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=50;, score=-26.372 total time=   2.1s
[CV 5/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=50;, score=-27.088 total time=   2.1s
[CV 3/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=50;, score=-26.715 total time=   2.1s
[CV 1/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=50;, score=-25.348 total time=   2.2s
[CV 4/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=50;, score=-25.086 total time=   2.2s
[CV 2/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=100;, score=-26.744 total time=   4.0s
[CV 1/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=100;, score=-25.348 total time=   4.2s
[CV 3/5] END model2__learning_rate=0.01, model2__loss=linear, model2__n_estimators=100;, 

In [43]:
print(gscv2.best_estimator_)
print('--'*10)
print(f'Best score :{gscv2.best_score_}')

Pipeline(steps=[('Data_processing',
                 Pipeline(steps=[('step1',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('col_tnf',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse_output=False),
                                                                   [0, 2])])),
                                 ('step2', StandardScaler())])),
                ('model2',
                 AdaBoostRegressor(learning_rate=0.025, loss='square'))])
--------------------
Best score :-25.98007300710056


In [47]:
mean_absolute_error(gscv1.best_estimator_.predict(x_test) , y_test) , mean_absolute_error(pipeline.predict(x_test) , y_test)

(12.311008461717156, 12.823689637660669)

In [48]:
# lets save the Gradient boosting model
with open('GradientBoostingRegressor.pkl' , 'wb') as file:
    pickle.dump(gscv1.best_estimator_ , file)

In [105]:
# Now as Gradient Boosting is performing well lets create a stacking model using GradientBoosting and Xgboost Model that was used earlier
xgbr = pipeline.steps[-1][1]
gbr = gscv1.best_estimator_.steps[-1][1]

from sklearn.svm import LinearSVR

estimators = [('xgbr', xgbr), ('gbr', gbr)]
stacking_model = StackingRegressor(estimators = estimators , final_estimator = LinearSVR())
stacking_model_pipeline = Pipeline([
                            ('Data_pipeline' , data_pipeline),
                            ('stacking_model' , stacking_model)
                        ])
stacking_model_pipeline.fit(x_train , y_train)



In [106]:
mean_absolute_error(stacking_model_pipeline.predict(x_test) , y_test)

18.38473245692642

In [107]:
param_grid = {
    'stacking_model__final_estimator__C': [0.01, 0.1, 1, 10, 100],      
    'stacking_model__final_estimator__epsilon': [0.01, 0.1, 0.2, 0.5, 1], 
    'stacking_model__final_estimator__tol': [1e-4, 1e-3, 1e-2, 1e-1],     
}

gscv3 = GridSearchCV(stacking_model_pipeline , param_grid , n_jobs = -1 , scoring = 'neg_mean_squared_error' , verbose = 3)
gscv3.fit(x_train , y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
