In [None]:
!pip install catboost
!pip install optuna

In [None]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from math import sqrt

In [None]:
# Load the training data; keep the Id column aside and remove it from the features.
train = pd.read_csv('train.csv')
train_id = train['Id']
train = train.drop(columns='Id')

X = train.drop(columns='SalePrice')
y = train['SalePrice']

# Impute every non-object column with its own median. The filled frame is
# assigned back instead of calling fillna(inplace=True) on X[col]: that chained
# in-place form acts on a temporary object under pandas Copy-on-Write and
# leaves X unchanged (and emits a FutureWarning on recent pandas 2.x).
numeric_cols = [col for col in X.columns if X[col].dtype != 'object']
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

# Whatever is still missing gets an explicit 'missing' category.
X = X.fillna('missing')

# Object-typed columns are handed to CatBoost as categorical features.
cat_features = [col for col in X.columns if X[col].dtype == 'object']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline CatBoost regressor: fixed hyperparameters, with training halted once
# the validation metric has not improved for 50 consecutive iterations.
baseline_params = {
    'iterations': 1000,
    'depth': 6,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',
    'cat_features': cat_features,
    'verbose': 200,  # log every 200th iteration
}
model = CatBoostRegressor(**baseline_params)
model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=50,
)

0:	learn: 72294.6354378	test: 82692.0044123	best: 82692.0044123 (0)	total: 86.2ms	remaining: 1m 26s
200:	learn: 12609.0998199	test: 27187.5927625	best: 27182.2168744 (197)	total: 8.19s	remaining: 32.6s
400:	learn: 8152.8980172	test: 26573.3311617	best: 26573.3311617 (400)	total: 14.2s	remaining: 21.1s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 26567.92742
bestIteration = 402

Shrink model to first 403 iterations.


<catboost.core.CatBoostRegressor at 0x7d8adbe17a60>

In [None]:
# Score the baseline model on the validation split.
y_pred = model.predict(X_valid)

mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_valid, y_pred)

# Print one "label: value" line per metric, in a fixed order.
for label, value in (
    ('Mean Absolute Error (MAE)', mae),
    ('Mean Squared Error (MSE)', mse),
    ('Root Mean Squared Error (RMSE)', rmse),
    ('R-squared (R2 Score)', r2),
):
    print(f'{label}: {value}')

Mean Absolute Error (MAE): 16200.464509588272
Mean Squared Error (MSE): 705854767.6586078
Root Mean Squared Error (RMSE): 26567.927424972535
R-squared (R2 Score): 0.9079759128109546


In [None]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Return the mean 3-fold cross-validated R2 of a CatBoost model for one trial.

    The hyperparameters (iterations, depth, learning_rate, random_strength,
    bagging_temperature) are sampled from the trial; the model is scored on the
    full imputed feature matrix ``X`` against ``y``.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'random_strength': trial.suggest_float('random_strength', 0, 1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1)
    }

    candidate = CatBoostRegressor(**params, loss_function='RMSE', cat_features=cat_features, verbose=False)
    return cross_val_score(candidate, X, y, cv=3, scoring='r2').mean()

# Seed the sampler so that re-running the search proposes the same sequence of
# trials; without a seed the reported best parameters cannot be reproduced.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

print("\nBest R2 Score:", study.best_value)
print("\nBest hyperparameters:", study.best_params)


In [None]:
# Hyperparameters hard-coded here (presumably copied from an earlier Optuna
# run - verify against the study output).
best_params = dict(
    iterations=556,
    depth=5,
    learning_rate=0.20107935865240828,
    random_strength=0.2805400794249189,
    bagging_temperature=0.12228467179968683,
)

# The final model is trained on the full training set (X, y).
best_model = CatBoostRegressor(
    loss_function='RMSE',
    cat_features=cat_features,
    verbose=False,
    **best_params,
)
best_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7d8abfcf0700>

In [None]:
# best_model was fit on all of X, and X_valid is a subset of X, so scoring
# best_model on X_valid would measure training fit rather than generalization.
# Refit the same configuration on the training split only and score that model
# on the held-out rows instead. best_model itself is left untouched.
# NOTE(review): if best_params were tuned with cross-validation over all of X,
# this estimate is still mildly optimistic - confirm how they were obtained.
holdout_model = CatBoostRegressor(**best_params, loss_function='RMSE', cat_features=cat_features, verbose=False)
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_valid)

mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = sqrt(mse)
r2 = r2_score(y_valid, y_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2 Score): {r2}')

Mean Absolute Error (MAE): 6121.56131138173
Mean Squared Error (MSE): 68710862.00276913
Root Mean Squared Error (RMSE): 8289.201529868189
R-squared (R2 Score): 0.9910419895912136


In [None]:
# Load the test set; keep the ids for the submission file and drop them from the features.
test = pd.read_csv('test.csv')

test_ids = test['Id']

test = test.drop(columns='Id')

# Impute numeric features with the TRAINING medians so the test data goes
# through the same transformation the model was trained on. X was itself filled
# with its own column medians, which leaves those medians unchanged, so
# X[...].median() still yields the training medians. This also covers a test
# column that is entirely NaN, which has no median of its own.
# The result is assigned back rather than using fillna(inplace=True) on
# test[col], which acts on a temporary object under pandas Copy-on-Write.
numeric_cols = [col for col in X.columns if X[col].dtype != 'object']
test[numeric_cols] = test[numeric_cols].fillna(X[numeric_cols].median())

# Remaining gaps are in categorical columns; use the same 'missing' category as in training.
test = test.fillna('missing')

predictions = best_model.predict(test)

submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})
submission.to_csv('submission.csv', index=False)
