In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_log_error, make_scorer

In [None]:
train_file = pd.read_csv('filepath/train.csv')
df = pd.DataFrame(train_file)
target_train = df['SalePrice']
df.drop(['SalePrice', 'Id'], axis='columns', inplace=True)

list_for_indexes_of_objects_columns = []
for nr, element in enumerate(df.dtypes):
  if element == 'object':
    list_for_indexes_of_objects_columns.append(nr)

In [None]:
encoder = LabelEncoder()
imputer = SimpleImputer(strategy = 'mean')
for column_nr in list_for_indexes_of_objects_columns:
    df.iloc[:,column_nr] = encoder.fit_transform(df.iloc[:, column_nr])


fullfilled_df = imputer.fit_transform(df)
data_train = fullfilled_df

In [None]:
minimizer = MinMaxScaler()
data_train = minimizer.fit_transform(data_train)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_train, target_train, test_size=0.3, random_state=42)

In [None]:
def create_model_and_params():
    model_params = {
        'CatBoostRegressor': {
            'model': CatBoostRegressor(verbose=0),
            'params': {
                'n_estimators': [32, 64, 128, 256, 512],
                'learning_rate': [0.001, 0.01, 0.1 ,0.2],
                'max_depth': [3, 6, 9],
            }
        },
        'XGBRegressor': {
            'model': xgb.XGBRegressor(use_label_encoder=False, eval_metric='rmse'),
            'params': {
                'n_estimators': [32, 64, 128, 256, 512],
                'learning_rate': [0.001, 0.01, 0.1 ,0.2],
                'max_depth': [3, 6, 9],
            }
        },
        'GradientBoostingRegressor': {
            'model': GradientBoostingRegressor(),
            'params': {
                'n_estimators': [32, 64, 128, 256, 512],
                'learning_rate': [0.001, 0.01, 0.1 ,0.2],
                'max_depth': [3, 6, 9],
            }
        },
    }

    return model_params

In [1]:
def rmsle_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

rmsle_scorer_cv = make_scorer(rmsle_scorer, greater_is_better=False)

model_params = create_model_and_params()
best_model = None
best_score = float('inf')
best_params = None

NameError: name 'make_scorer' is not defined

In [None]:
for model_name, mp in model_params.items():
    grid_search = GridSearchCV(mp['model'], mp['params'], scoring=rmsle_scorer_cv, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    if grid_search.best_score_ < best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

print('Best model:', best_model)
print('Best parameters:', best_params)
print('Best cross-validated RMSLE:', -best_score)  # Converting back to positive RMSLE


Best model: GradientBoostingRegressor(n_estimators=256)
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 256}
Best cross-validated RMSLE: 0.14196775998304573


In [None]:
y_pred = best_model.predict(X_test)
test_rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(f'Test RMSLE: {test_rmsle}')

Test RMSLE: 0.13308241921421202


In [None]:
test_file = pd.read_csv('filepath/test.csv')
df_test = pd.DataFrame(test_file)


last_nr = df_test.iloc[-1, df_test.columns.get_loc('Id')]


df_test.drop(['Id'], axis='columns', inplace=True)
list_for_indexes_of_objects_columns_test = []
for nr, element in enumerate(df_test.dtypes):
  if element == 'object':
    list_for_indexes_of_objects_columns_test.append(nr)

In [None]:
encoder = LabelEncoder()
imputer = SimpleImputer(strategy = 'mean')
for column_nr in list_for_indexes_of_objects_columns_test:
    df_test.iloc[:,column_nr] = encoder.fit_transform(df_test.iloc[:, column_nr])


fullfilled_test_df = imputer.fit_transform(df_test)
data_test = fullfilled_test_df

In [None]:
minimizer = MinMaxScaler()
data_test = minimizer.fit_transform(data_test)

In [None]:
predictions = best_model.predict(data_test)

[149492.77614463 162792.73815445 223843.39940169 ... 191702.7896355
 146555.87400409 266044.53291184]


In [None]:
idx_arange = np.arange(1461, last_nr+1)
idx_array = np.array(idx_arange)

In [None]:
df_output = pd.DataFrame(data = zip(idx_array, predictions),
                          columns = ['Id', 'SalePrice'])

save_path = 'filepath/submision.csv'
df_output.to_csv(save_path, index=False)