In [29]:
import pandas as pd
import catboost as ct
from sklearn.model_selection import KFold, GroupKFold, train_test_split, TimeSeriesSplit

VERSION = "CATBOOST_V1"
target="price"

In [2]:
main_train_data = pd.read_csv("HW_train_main_data.csv", parse_dates=["timestamp"])
main_test_data = pd.read_csv("HW_test_main_data.csv", parse_dates=["timestamp"])

main_train_data = main_train_data.sort_values(by=["timestamp"])
main_test_data = main_test_data.sort_values(by=["timestamp"])
add_train_data = pd.read_csv("HW_train_additional_data.csv")
add_test_data = pd.read_csv("HW_test_additional_data.csv")

In [3]:
main_train_data = main_train_data.merge(add_train_data, on="id")
main_test_data = main_test_data.merge(add_test_data, on="id")

In [6]:
main_train_data.dtypes

id                                          int64
timestamp                          datetime64[ns]
full_sq                                     int64
life_sq                                   float64
floor                                     float64
max_floor                                 float64
material                                  float64
build_year                                float64
num_room                                  float64
kitch_sq                                  float64
apartment condition                       float64
sub_area                                    int64
price                                     float64
population                                  int64
indust_part                               float64
preschool_facilities                        int64
school_facilities                           int64
hospital_beds_raion                       float64
healthcare_facilities                       int64
university_num                              int64


In [19]:
main_train_data[["kitch_sq"]].value_counts()

kitch_sq
1.0         4612
8.0         2565
5.0         2460
6.0         2179
10.0        2137
            ... 
48.0           1
44.0           1
36.0           1
35.0           1
2014.0         1
Length: 74, dtype: int64

In [None]:
# X_train = main_train_data.drop(['id', target], axis = 1)
# y_train = main_train_data[target]

# X_test = main_test_data.drop(['id', target], axis = 1)

In [27]:
kf = KFold(n_splits=5)

models = []

FEATURES = [c for c in main_train_data.columns if c not in ['id', 'timestamp', target]]

for i, (train_index, test_index) in enumerate(kf.split(X=main_train_data)):
    
    train_x = main_train_data.iloc[train_index][FEATURES]
    train_y = main_train_data[target].iloc[train_index]

    # VALID DATA
    valid_x = main_train_data.iloc[test_index][FEATURES]
    valid_y = main_train_data[target].iloc[test_index]


    # TRAIN MODEL        
    model = ct.CatBoostRegressor(loss_function='RMSE')
    cat_features = ['leisure_facilities', 'church_facilities']
    model.fit(train_x, train_y,
            eval_set=[(valid_x, valid_y)], early_stopping_rounds = 50,
            use_best_model=True,
            verbose=100, cat_features = cat_features)
    
    models.append(model)

Learning rate set to 0.083421
0:	learn: 4901522.4747542	test: 4657885.4717088	best: 4657885.4717088 (0)	total: 19.5ms	remaining: 19.5s
100:	learn: 2562731.4460439	test: 3407706.5771566	best: 3406915.9798533 (93)	total: 1.86s	remaining: 16.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 3389881.121
bestIteration = 136

Shrink model to first 137 iterations.
Learning rate set to 0.083421
0:	learn: 4956638.6475368	test: 4397296.8196639	best: 4397296.8196639 (0)	total: 17.3ms	remaining: 17.3s
100:	learn: 2756707.2357912	test: 2788983.3159248	best: 2788983.3159248 (100)	total: 1.83s	remaining: 16.3s
200:	learn: 2574487.8139100	test: 2730427.9049948	best: 2730427.9049948 (200)	total: 3.62s	remaining: 14.4s
300:	learn: 2446087.4832128	test: 2696188.3657125	best: 2696054.5971135 (299)	total: 5.43s	remaining: 12.6s
400:	learn: 2359231.5144656	test: 2683804.1944897	best: 2683804.1944897 (400)	total: 7.22s	remaining: 10.8s
Stopped by overfitting detector  (50 iterations wait)


In [30]:
ts = TimeSeriesSplit(n_splits=5)

models = []

FEATURES = [c for c in main_train_data.columns if c not in ['id', 'timestamp', target]]

for i, (train_index, test_index) in enumerate(kf.split(X=main_train_data)):
    
    train_x = main_train_data.iloc[train_index][FEATURES]
    train_y = main_train_data[target].iloc[train_index]

    # VALID DATA
    valid_x = main_train_data.iloc[test_index][FEATURES]
    valid_y = main_train_data[target].iloc[test_index]


    # TRAIN MODEL        
    model = ct.CatBoostRegressor(loss_function='RMSE')
    cat_features = ['leisure_facilities', 'church_facilities']
    model.fit(train_x, train_y,
            eval_set=[(valid_x, valid_y)], early_stopping_rounds = 50,
            use_best_model=True,
            verbose=100, cat_features = cat_features)
    
    models.append(model)

Learning rate set to 0.083421
0:	learn: 4901522.4747542	test: 4657885.4717088	best: 4657885.4717088 (0)	total: 19.6ms	remaining: 19.6s
100:	learn: 2562731.4460439	test: 3407706.5771566	best: 3406915.9798533 (93)	total: 1.82s	remaining: 16.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 3389881.121
bestIteration = 136

Shrink model to first 137 iterations.
Learning rate set to 0.083421
0:	learn: 4956638.6475368	test: 4397296.8196639	best: 4397296.8196639 (0)	total: 19.9ms	remaining: 19.9s
100:	learn: 2756707.2357912	test: 2788983.3159248	best: 2788983.3159248 (100)	total: 1.98s	remaining: 17.6s
200:	learn: 2574487.8139100	test: 2730427.9049948	best: 2730427.9049948 (200)	total: 3.82s	remaining: 15.2s
300:	learn: 2446087.4832128	test: 2696188.3657125	best: 2696054.5971135 (299)	total: 5.64s	remaining: 13.1s
400:	learn: 2359231.5144656	test: 2683804.1944897	best: 2683804.1944897 (400)	total: 7.42s	remaining: 11.1s
Stopped by overfitting detector  (50 iterations wait)


In [16]:
pred['id'] = main_test_data["id"]
pred = pred.sort_values(by=["id"])
pred = pred.rename(columns={"prediction_label":"predicted_price"}).reset_index()
pred[['id', "predicted_price"]].to_csv(f'{VERSION}.csv', index=False)
