In [21]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
df = pd.read_csv('houses.csv')

In [47]:
df.head()

Unnamed: 0,rooms,baths,square,post,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226


In [51]:
true_price = df['price'][(df['rooms'] == 4) & (df['baths'] == 4) & (df['square'] == 4053) & (df['post'] == 85255)]

In [55]:
float(true_price)

869500.0

In [3]:
# split df to train test
X_train, X_valid, y_train, y_valid = train_test_split(df.drop('price', axis=1), df['price'], test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [4]:
model = CatBoostRegressor()

grid = {'learning_rate': [0.03, 0.1],
        'depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

In [None]:

randomized_search_result = model.randomized_search(grid,
                                                   X=X_train,
                                                   y=y_train,
                                                   plot=True)


In [18]:
randomized_search_result

{'params': {'depth': 10, 'l2_leaf_reg': 5, 'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    

In [17]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 5,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 10,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.10000000149011612,
 'score_function': 'Cosine',
 'task_type

In [None]:
model.fit(X_train, y_train)

In [7]:
def mean_absolute_percentage_error(true, pred):
    abs_error = (np.abs(true - pred)) / true
    sum_abs_error = np.sum(abs_error)
    mape_loss = (sum_abs_error / true.size) * 100
    return mape_loss

In [23]:
def root_mean_squared_error(true, pred):
    squared_error = np.square(true - pred) 
    sum_squared_error = np.sum(squared_error)
    rmse_loss = np.sqrt(sum_squared_error / true.size)
    return rmse_loss

In [20]:
y_pred = model.predict(X_test)

In [22]:
mean_absolute_percentage_error(y_test, y_pred)

26.12264068996636

In [36]:
model2 = model.copy()

In [37]:
X_train2 = X_train.append(X_test)
y_train2 = y_train.append(y_test)

  X_train2 = X_train.append(X_test)
  y_train2 = y_train.append(y_test)


In [None]:
model2.fit(X_train2, y_train2)

In [39]:
y_pred2 = model2.predict(X_valid)

In [24]:
root_mean_squared_error(y_test, y_pred)

194078.55938731233

In [40]:
root_mean_squared_error(y_valid, y_pred2)

267752.5921891755

In [41]:
mean_absolute_percentage_error(y_valid, y_pred2)

36.00161367096253

In [29]:
X_train

Unnamed: 0,rooms,baths,square,post
400,2,2.0,1056,92276
83,4,2.5,3060,91901
164,4,3.0,2462,95220
19,3,3.5,4143,85266
348,2,2.0,1344,92276
...,...,...,...,...
5,4,5.0,4581,85266
402,3,2.0,1806,92276
504,3,2.5,1935,92802
389,2,2.0,1440,92276


In [42]:
check = [2, 2, 1000, 92276]

In [33]:
y_train

400      68500
83      699999
164     915000
19      925000
348      67500
        ...   
5      1249000
402     179000
504     695000
389     154500
220    1495000
Name: price, Length: 342, dtype: int64

In [43]:
model2.predict(check)

84152.66906817962

In [None]:
save_model(fname,
           format="cbm",
           export_parameters=None,
           pool=None)

In [44]:
# After you train the model using fit(), save like this - 
model.save_model('catboost')    # extension not required.

model.load_model('catboost')

<catboost.core.CatBoostRegressor at 0x1cf3deb8850>

In [45]:
model.predict(check)

78439.49696686753