# **1) Initiall instructions**

In [1]:
!pip install catboost
!pip install xgboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import optuna

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# **2) Training data preparation**

In [4]:
data_train = pd.read_csv('/content/drive/MyDrive/train_houses.csv')
df_train = pd.DataFrame(data_train)

In [5]:
y = df_train['SalePrice']
X = df_train.drop(columns=['Id', 'SalePrice'])

cat_features = X.select_dtypes(include=['object', 'category']).columns
num_features = X.select_dtypes(include=['int64', 'float64']).columns

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_features)
    ]
)

# **3) Model evaluation**

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=13)

In [7]:
list_of_models = [
    ('CatBoost', CatBoostRegressor(random_state=13, verbose=0)),
    ('XGBoost', XGBRegressor(random_state=13, eval_metric='rmse'))
]

results = {}
for name, model in list_of_models:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    scores = cross_val_score(pipeline, X, y, cv=kf, scoring='neg_mean_absolute_error')
    mean_score = scores.mean()
    results[name] = mean_score


for model_name, score in results.items():
    print(f'{model_name}: Mean Negative MAE = {score:.4f}')


best_model = max(results, key=results.get)
print(f'Best model: {best_model}')


CatBoost: Mean Negative MAE = -14502.4121
XGBoost: Mean Negative MAE = -17811.9416
Najlepszy model: CatBoost


# **4) Optimization of the best estimator**

In [29]:
def objective(trial):
  iterations = trial.suggest_int('iterations', 100, 1000)
  depth = trial.suggest_int('depth', 4, 12)
  learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
  l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-3, 10)
  border_count = trial.suggest_int('border_count', 32, 255)
  random_strength = trial.suggest_float('random_strength', 0.0, 1.0)
  bagging_temperature = trial.suggest_float('bagging_temperature', 0.0, 1.0)
  verbose = False
  random_state = 13


  model = CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=learning_rate, l2_leaf_reg=l2_leaf_reg, border_count=border_count, verbose=verbose, random_state=random_state)
  pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
  score = cross_val_score(pipeline, X, y, cv=kf, scoring='neg_mean_absolute_error').mean()
  return score

In [30]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[I 2024-12-09 17:18:57,453] A new study created in memory with name: no-name-81b150de-c7c0-49d4-aca9-813ac848ea85
[I 2024-12-09 17:19:00,601] Trial 0 finished with value: -17901.482423064776 and parameters: {'iterations': 517, 'depth': 4, 'learning_rate': 0.012283135354459636, 'l2_leaf_reg': 4.871971487345727, 'border_count': 161, 'random_strength': 0.6237273215527153, 'bagging_temperature': 0.7291026906762165}. Best is trial 0 with value: -17901.482423064776.
[I 2024-12-09 17:19:04,505] Trial 1 finished with value: -17355.40341658712 and parameters: {'iterations': 333, 'depth': 6, 'learning_rate': 0.016760217860777103, 'l2_leaf_reg': 3.7552585132112424, 'border_count': 223, 'random_strength': 0.35705172405411767, 'bagging_temperature': 0.8806193500347215}. Best is trial 1 with value: -17355.40341658712.
[I 2024-12-09 17:19:07,373] Trial 2 finished with value: -16488.069282176155 and parameters: {'iterations': 499, 'depth': 4, 'learning_rate': 0.029687809162304863, 'l2_leaf_reg': 8.303

In [31]:
best_params = study.best_params
iterations = best_params['iterations']
depth = best_params['depth']
learning_rate = best_params['learning_rate']
l2_leaf_reg = best_params['l2_leaf_reg']
border_count = best_params['border_count']
verbose = False
random_state = 13

best_cat_model = CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=learning_rate, l2_leaf_reg=l2_leaf_reg, border_count=border_count, verbose=verbose, random_state=random_state)

# **5) Prediction data preparation and prediction**

In [None]:
data_test = pd.read_csv('/content/drive/MyDrive/test_houses.csv')
df_test = pd.DataFrame(data_test)

In [None]:
X_test = df_test.drop(columns=['Id'])

cat_features_test = X_test.select_dtypes(include=['object', 'category']).columns
num_features_test = X_test.select_dtypes(include=['int64', 'float64']).columns

X_test_transformed = preprocessor.transform(X_test)

In [34]:
best_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', best_cat_model)
])
best_model_pipeline.fit(X, y)
predictions = best_model_pipeline.predict(X_test)

# **6) Save output file**

In [35]:
data_submission = pd.read_csv('/content/drive/MyDrive/sample_submission_houses.csv')
df_submission = pd.DataFrame(data_submission)

df_submission["SalePrice"] = predictions
df_submission.to_csv("cat_output_submission_houses.csv", index=False)
