# Group Details


**Group Members (student ID):** August Asheim Birkelan (506753), Ørjan Carlsen (507694), Alexey Gusev (477979)

**Kaggle Competition:** Moscow Housing

**Kaggle Team:** Group 1

# PIP INSTALLS

In [1]:
pip install xgboost

In [2]:
pip install lightgbm

In [None]:
pip install geopy

In [None]:
pip install catboost

In [None]:
pip install category_encoders -q

# IMPORTS

In [5]:
import json
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeCV, LinearRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from geopy.distance import geodesic

from copy import deepcopy

%matplotlib inline


plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# EDA

### Collect data

In [None]:
# TRAIN SET
buildings = pd.read_csv('../input/moscow/buildings_train.csv')
apartments = pd.read_csv('../input/moscow/apartments_train.csv')
print(f'All apartments have an associated building: {apartments.building_id.isin(buildings.id).all()}')
# Left-join the building attributes onto every apartment via building_id.
data_train = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)

# Dropping duplicates in train: two rows count as duplicates when they agree
# on every column except the apartment id.
# NOTE(review): the excluded column was garbled to 'd' in this export; 'id'
# is the only column name that makes the exclusion meaningful - confirm.
data_train = data_train.drop_duplicates(subset=data_train.columns.difference(['id']))

# TEST SET
apartments_test = pd.read_csv('../input/moscow/apartments_test.csv')
buildings_test = pd.read_csv('../input/moscow/buildings_test.csv')
print(f'All test apartments have an associated building: {apartments_test.building_id.isin(buildings_test.id).all()}')
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)

# ALL DATA
# Train and test are stacked so cleaning/feature engineering is applied to
# both; the 'Split' column records where each row came from.
data_all = pd.concat([data_train, data_test])
# Rows whose id does not exceed the largest train id are labelled 'Train'.
data_all['Split'] = np.where(data_all['id'] <= np.max(data_train['id']), 'Train', 'Test')
data_all = data_all.drop(['id'], axis=1)
pd.set_option("display.max_rows", 2000)

### Remove outliers

**Remove data with too big area total**

In [None]:
# Test rows are always kept; other rows survive only when their total area
# is at most 1175.
is_test_row = data_all['Split'] == 'Test'
area_within_limit = data_all['area_total'] <= 1175
data_all = data_all[area_within_limit | is_test_row]

**Remove data with too high price**

In [None]:
# Rows without a price are always kept (presumably the test rows); priced
# rows survive only when the price is below 1e9.
listed_price = data_all['price']
data_all = data_all[listed_price.isna() | (listed_price < 1000000000)]

### Bathrooms

Fill missing bathroom counts (NaN) with the median bathroom count of the apartments in the same building.

In [None]:
# Fill missing bathroom counts with the median of the same building.
# The column is selected *before* transform so that the lambda only ever sees
# that one numeric column. Transforming the whole frame first also applies
# median() to text columns, which recent pandas versions reject, and does the
# per-building work for every column just to keep one.
for bathroom_col in ("bathrooms_shared", "bathrooms_private"):
    data_all[bathroom_col] = data_all.groupby("building_id")[bathroom_col].transform(
        lambda counts: counts.fillna(counts.median())
    )

### District

In [None]:
pd.set_option("display.max_rows", 2000)

# Hand-assigned district codes for rows whose district is missing.
# Rules are applied in order and only ever touch rows that are still missing
# a district, so an earlier rule wins over a later one.

# (latitude, longitude, district) - matched on exact coordinates.
_district_by_coordinate = (
    (55.595160, 37.741109, 5),
    (17.141734, -61.790500, 11),
    (55.583537, 37.478025, 11),
    (55.583551, 37.711356, 5),
)
for lat, lon, district_code in _district_by_coordinate:
    at_location = (data_all['latitude'] == lat) & (data_all['longitude'] == lon)
    data_all.loc[at_location & data_all['district'].isnull(), 'district'] = district_code

# (column, exact text, district) - matched on street or address text.
_district_by_text = (
    ('street', 'В мкр', 2),
    ('street', 'улица 1-я Линия', 3),
    ('street', 'улица Центральная', 11),
    ('address', 'Москва А101 ЖК', 11),
    ('street', 'Бунинские Луга ЖК', 11),
)
for text_col, text_value, district_code in _district_by_text:
    has_text = data_all[text_col] == text_value
    data_all.loc[has_text & data_all['district'].isnull(), 'district'] = district_code

### Longitude and Latitude

In [None]:


# Hand-corrected coordinates for specific listings.
coordinate_cols = ['latitude', 'longitude']

# 'Бунинские Луга ЖК' listings recorded with a negative longitude.
misplaced_bunin = (data_all['street'] == 'Бунинские Луга ЖК') & (data_all['longitude'] < 0)
data_all.loc[misplaced_bunin & (data_all['address'] == 'к2/2/1'), coordinate_cols] = [55.544046, 37.478055]
data_all.loc[misplaced_bunin & (data_all['address'] == 'к2/2/2'), coordinate_cols] = [55.544886, 37.478459]

# Every listing with this address gets the same coordinates.
data_all.loc[data_all['address'] == 'Москва А101 ЖК', coordinate_cols] = [55.560891, 37.473761]

# House numbers 48 and 75 on 'улица Центральная' share one corrected position
# (the original kept [55.809245, 37.350090] as a commented-out alternative).
on_central_street = data_all['street'] == 'улица Центральная'
data_all.loc[on_central_street & data_all['address'].isin(['48', '75']), coordinate_cols] = [55.853511, 37.384711]

### Ceiling

In [None]:
# Ceiling heights above 20 or below 1 are treated as missing.
implausible_ceiling = (data_all['ceiling'] > 20) | (data_all['ceiling'] < 1)
data_all.loc[implausible_ceiling, 'ceiling'] = np.nan

### Distance to Universities/Colleges

In [None]:
# Keep the kitchen/living area columns as standalone Series and remove them
# from the working frame. NOTE(review): neither Series is referenced again in
# the visible notebook - confirm whether they are still needed.
area_kitchen = data_all.area_kitchen
area_living = data_all.area_living
data_all = data_all.drop(['area_kitchen', 'area_living'], axis=1)
# Reference points as (latitude, longitude) pairs; a distance and an angle to
# each of them is added as a feature further down. The trailing comments give
# the rough position relative to the 'CENTER' entry.
coordinate_dict = {
    'MSU'    : ( 55.704279331013915 , 37.527720613854    ), # center/sw
    'MSUCE'  : ( 55.859955674414444 , 37.707267495936996 ), # labelled 'nw' originally; longitude is east of CENTER - TODO confirm
    'BMSTU'  : ( 55.76615846588919  , 37.68505253621469  ), # center/east
    'IUFS'   : ( 55.628344081320236 , 37.593311336163644 ), # south
    'LIS'    : ( 55.892843325039514 , 37.57455519923625  ), # north
    'NW'     : ( 55.81084987524267  , 37.51065533037661  ),
    'MUG'    : ( 55.72997580854584  , 37.816477554416245 ), # east
    'KKAR'   : ( 55.65828905630157  , 37.770577681885065 ), # se
    'IESR'   : ( 55.79239687372863  , 37.82063842368287  ), # east
    'RCTU'   : ( 55.85867123962497  , 37.4158124386546   ), # nw
    'MEI'    : ( 55.77480692814678  , 37.52265213121898  ), # center/west
    'MPU'    : ( 55.723255925832    , 37.674135958782806 ), # center/se
    'RMANPO' : ( 55.867347229014214 , 37.4761356952223   ), # nw
    'IIEP'   : ( 55.594679800406006 , 37.6686510189342   ), # south
    'MPU2'   : ( 55.820408245566306 , 37.664296132279986 ), # center/north
    'MSIEA'  : ( 55.80611763965845  , 37.41016712643405  ), # nw
    'SW'     : ( 55.6514038020098   , 37.49940394537357  ),
    'IETVS'  : ( 55.7262213003841   , 37.399531618530936 ), # labelled 'east' originally; longitude is west of CENTER - TODO confirm
    'CENTER' : ( 55.75377154250644  , 37.6197263162158   )
}

import geopy.distance

# One new column per reference point: geodesic distance in kilometres from
# each listing to that point.
for university, coordinate in coordinate_dict.items():
    data_all[university] = [
        geopy.distance.geodesic((listing_lat, listing_lon), coordinate).km
        for listing_lat, listing_lon in zip(data_all['latitude'], data_all['longitude'])
    ]


def angleFromCoordinate(lat1, long1, lat2, long2):
    """Return the initial bearing from point 1 to point 2, in radians.

    All four arguments are in decimal degrees and may be scalars, NumPy
    arrays or pandas Series (normal broadcasting applies). The result comes
    from ``np.arctan2``: 0 points north and pi/2 points east.

    The inputs are converted to radians first; feeding degrees directly to
    ``np.sin``/``np.cos`` (which expect radians) produces angles unrelated
    to the real direction.
    """
    lat1, long1, lat2, long2 = (
        np.radians(value) for value in (lat1, long1, lat2, long2)
    )

    dLon = long2 - long1

    y = np.sin(dLon) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dLon)

    return np.arctan2(y, x)

# One 'theta_<name>' column per reference point: the angle returned by
# angleFromCoordinate from that point to each listing.
for university, coordinate in coordinate_dict.items():
    reference_lat, reference_lon = coordinate[0], coordinate[1]
    data_all[f'theta_{university}'] = angleFromCoordinate(
        reference_lat, reference_lon, data_all['latitude'], data_all['longitude']
    )

In [None]:
# Merge seller codes: 2 is folded into 0 and 3 into 1.
data_all['seller'] = data_all['seller'].replace({2: 0, 3: 1})

In [None]:
# Replace street and address text with integer codes: the distinct values are
# sorted and numbered from 1.
unique_street = data_all['street'].unique()
street_map = {name: code for code, name in enumerate(sorted(unique_street), start=1)}
data_all['street'] = data_all['street'].map(street_map)

unique_address = data_all['address'].unique()
address_map = {name: code for code, name in enumerate(sorted(unique_address), start=1)}
data_all['address'] = data_all['address'].map(address_map)

### Store data for CatBoost

In [None]:
# Independent copy of the frame taken before one-hot encoding and imputation;
# it is processed separately from data_all from here on.
data_all_cat = deepcopy(data_all)
data_all_cat

**Make features categorical for CatBoost**

### OneHotEncoding

In [None]:
# One-hot encode every column with fewer than 9 distinct values (except the
# 'Split' marker): the dummy columns are appended and the source column is
# removed.
stored_data_all = deepcopy(data_all)
low_cardinality_cols = [
    name for name in data_all.columns[data_all.nunique() < 9] if name != 'Split'
]
for col in low_cardinality_cols:
    dummies = pd.get_dummies(data_all[col], prefix=col)
    stored_data_all = pd.concat([stored_data_all, dummies], axis=1).drop(col, axis=1)

In [None]:
# Copy the data
# 'Split' is text, so it is set aside while the numeric matrix is imputed.
split = stored_data_all.Split
stored_data_all = stored_data_all.drop('Split', axis=1)
# Init
ii_imp = IterativeImputer(estimator=ExtraTreesRegressor(n_jobs=-1, random_state=42), max_iter=4, random_state=42, verbose=2)

# Transform
# Only feature columns are imputed. The target 'price' is left out of the
# imputation matrix: including it lets the target leak into the imputed
# feature values of training rows and fabricates prices for rows that have
# none. Column order is unchanged and 'price' keeps its original values.
feature_cols = [name for name in stored_data_all.columns if name != 'price']
stored_data_all.loc[:, feature_cols] = ii_imp.fit_transform(stored_data_all[feature_cols])

In [None]:
# Re-attach the 'Split' marker that was set aside before imputation.
stored_data_all['Split'] = split

In [None]:
# From here on data_all is the one-hot encoded, imputed frame.
data_all = deepcopy(stored_data_all)
data_all

In [None]:
# Snapshot of data_all at this point, independent of later changes.
stored_data = deepcopy(data_all)
stored_data

In [None]:
# Snapshot of data_all_cat at this point, independent of later changes.
stored_data_cat = deepcopy(data_all_cat)
stored_data_cat

# Feature engineering 

**Clustering**

In [None]:
def cluster_geo_data(df, df_test):
    """Add a KMeans 'cluster' label based on latitude/longitude.

    A 400-cluster KMeans model is fitted on the coordinates in ``df``, with
    log2 of ``df.price`` as the per-sample weight, and is then used to label
    both frames.

    Both frames are modified in place (a 'cluster' column is written) and
    returned as ``(df, df_test)``.
    """
    from sklearn.cluster import KMeans

    geo_cols = ['latitude', 'longitude']
    train_points = df[geo_cols]
    test_points = df_test[geo_cols]

    model = KMeans(n_clusters=400, max_iter=10000, init='k-means++', random_state=42)
    model.fit(train_points, sample_weight=np.log2(df.price))

    df['cluster'] = model.predict(train_points)
    df_test['cluster'] = model.predict(test_points)

    return df, df_test

# Create the 'cluster' column first so the frames returned by
# cluster_geo_data have the same columns as the rows they replace.
data_all['cluster'] = np.nan
# The function receives deep copies of the Train and Test rows; the returned
# frames are written back over the matching rows via the boolean masks.
data_all[data_all['Split']=='Train'], data_all[data_all['Split']=='Test'] = cluster_geo_data(deepcopy(data_all[data_all['Split']=='Train']), deepcopy(data_all[data_all['Split']=='Test']))

# Same procedure for the CatBoost copy; a separate KMeans model is fitted.
data_all_cat['cluster'] = np.nan
data_all_cat[data_all_cat['Split']=='Train'], data_all_cat[data_all_cat['Split']=='Test'] = cluster_geo_data(deepcopy(data_all_cat[data_all_cat['Split']=='Train']), deepcopy(data_all_cat[data_all_cat['Split']=='Test']))

In [None]:
# 'penthouse' = floor * (floor / stories - 0.5) ** 3, added to both frames.
for frame in (data_all, data_all_cat):
    relative_height = frame['floor'] / frame['stories'] - 0.5
    frame['penthouse'] = (frame['floor'] * relative_height ** 3).astype(float)

In [None]:
data_all[['penthouse', 'floor', 'stories']]

In [None]:
# Numeric columns of the training rows, used for the correlation table.
training_rows = data_all[data_all['Split'] == 'Train']
data_corr = training_rows.select_dtypes(include=[np.number])

In [None]:
# Correlation matrix with rows ordered by their correlation with price,
# strongest positive first; the cell shows the price column.
corr = data_corr.corr().sort_values(['price'], ascending=False)
corr['price']

# NaN values after EDA and feature engineering



In [None]:
data_all_cat

In [None]:
# Passed to LGBMRegressor as `categorical_feature`.
# NOTE(review): presumably positional column indices of the matrix LightGBM
# is trained on - verify they still match the column order whenever features
# are added or removed.
lgbm_categorical = (0, 6, 10, 11, 14, 16, 17, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 51)


# Stacking

In [None]:
# Base learners for the stacked ensemble. All share SEED.
SEED=42
# model1: random forest.
model1 = RandomForestRegressor(
    n_estimators=1500,
    n_jobs=-1,
    random_state=SEED,
    verbose=1,
)
# model2: scikit-learn gradient boosting.
model2 = GradientBoostingRegressor(
    n_estimators=600,
    learning_rate=0.06,
    min_samples_leaf=4, 
    max_depth=9, 
    random_state=SEED,
    verbose=1,
)
# model3: LightGBM; the categorical columns come from lgbm_categorical.
model3 = lgb.LGBMRegressor(
    n_estimators=6000,
    learning_rate=0.08,
    num_leaves=10,
    random_state=SEED, 
    n_jobs=-1,
    categorical_feature = lgbm_categorical,
)
# model4: XGBoost. Both random_state and seed are given the same value.
model4 = xgb.XGBRegressor(
    n_estimators=5000,
    learning_rate=0.14,
    n_jobs=-1, 
    random_state=SEED,
    max_depth = 4,
    seed=SEED,
    verbosity=1,
)
# model5: CatBoost; the explicit cat_features list is currently disabled.
model5 = CatBoostRegressor(
    n_estimators=2500,
    learning_rate=0.1,
    thread_count=-1,
    depth=9,
    random_seed=SEED,
    silent=True,
#     cat_features = ["layout", "condition", "new", "material", "seller", "parking", "heating", "district"],
)

In [None]:
# Snapshots of both feature frames, independent of later changes.
STORED_data_all = deepcopy(data_all)
STORED_data_all_cat = deepcopy(data_all_cat)

In [None]:
# NOTE(review): the original cell contained only the undefined name 'kræsj'
# (Norwegian for "crash"), presumably to halt a "Run all" before the
# hyper-parameter search below. The stop is kept but made explicit.
raise RuntimeError("Intentional stop: remove this cell to continue past this point.")

In [None]:
import optuna
import lightgbm as lgb
def objective(trial,data=X_train,target=y_train):
    """Optuna objective: mean 3-fold RMSLE of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    data : feature DataFrame (rows are selected with ``.iloc``).
    target : prices aligned with ``data``; must provide ``.to_numpy()``.
        log2 is applied here, so the values passed in must not already be
        log-transformed.

    Returns
    -------
    The RMSLE (computed on the back-transformed price scale) averaged over
    the folds; lower is better.

    Notes
    -----
    The defaults are bound when this ``def`` executes, so ``X_train`` and
    ``y_train`` must already exist at that point. The metric helper
    ``root_mean_squared_log_error`` must be defined before the study runs.
    """
    y = deepcopy(np.log2(target.to_numpy()))
    X = deepcopy(data)

    # Cross-validation; the fold count is used both for the splitter and for
    # averaging the scores.
    k = 3
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    param = {
        'tree_method': 'gpu_hist',
        'n_estimators': trial.suggest_categorical('n_estmators', [3000,5000,6500,7000,9000]),
        'max_depth': trial.suggest_categorical('max_depth', [3,4,5,6,8,10,12]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.001,0.06,0.1, 0.14, 0.3,0.5]),
        # Spelled 'min_child_wheight' originally, which is not an XGBoost
        # parameter name, so the sampled value never acted as
        # min_child_weight.
        'min_child_weight': trial.suggest_categorical('min_child_weight', [0,0.5,2,6,15,30,70,300,1000]),
        'n_jobs': -1,
        'seed': 42,
    }

    # Recorded result of an earlier search (its parameter names suggest a
    # different model family - kept for reference only):
    # Trial 25 finished with value: 0.12818131318606005 and parameters:
    # {'n_estmators': 7000, 'max_depth': None, 'learning_rate': 0.06, 'num_leaves': 24}.

    model = xgb.XGBRegressor(**param)

    acc_score = []

    for train_index, test_index in kf.split(X):
        X_train_k, X_test_k = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train_k, y_test_k = y[train_index], y[test_index]

        model.fit(X_train_k, y_train_k)

        # Undo the log2 transform so the metric is computed on prices.
        pred_values = np.power(2, model.predict(X_test_k))
        acc = root_mean_squared_log_error(y_true=np.power(2, y_test_k), y_pred=pred_values)
        acc_score.append(acc)

    return sum(acc_score) / k

# Run 50 trials, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# One row per trial.
study.trials_dataframe()

In [None]:
data_all_cat

In [None]:
def _rows_for_split(frame, split_name):
    """Rows of `frame` whose 'Split' equals `split_name`, minus that column."""
    return frame[frame['Split'] == split_name].drop(['Split'], axis=1)


# Encoded/imputed frame.
data_train_all = _rows_for_split(data_all, 'Train')
data_test_all = _rows_for_split(data_all, 'Test')

# CatBoost/LightGBM frame.
data_train_cat = _rows_for_split(data_all_cat, 'Train')
data_test_cat = _rows_for_split(data_all_cat, 'Test')

# Feature matrices: everything except the target.
X_train = data_train_all.drop('price', axis=1)
X_test = data_test_all.drop('price', axis=1)

X_train_cat = data_train_cat.drop('price', axis=1)
X_test_cat = data_test_cat.drop('price', axis=1)

# Target: log2 of the training prices, in the row order of X_train.
y_train = np.log2(data_train_all.loc[X_train.index].price)

In [None]:
# Row counts and the fold splitter; get_oof reads these as module-level
# globals.
ntrain = X_train.shape[0]
ntest = X_test.shape[0]
SEED = 42 # for reproducibility
NFOLDS = 5 # set number of folds for out-of-fold prediction
kf = KFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=SEED
) # K-Folds cross-validator

def get_oof(clf, x_train, y_train, x_test):
    """Return out-of-fold predictions for one base model.

    For every fold of the module-level splitter ``kf`` the model is fitted
    on the fold's training part and used to predict (a) the held-out part of
    ``x_train`` and (b) all of ``x_test``. Every training row therefore gets
    exactly one prediction from a model that never saw it, and the test
    predictions are averaged over the folds. The two columns are later used
    as features by the meta model.

    Array sizes are taken from the arguments themselves rather than from
    module-level row counts, so the function works for any consistently
    shaped inputs.

    Parameters
    ----------
    clf : estimator with ``fit(X, y)`` and ``predict(X)``; it is refitted
        once per fold.
    x_train : the full training feature matrix. Must support indexing with
        an integer index array (e.g. a NumPy array).
    y_train : targets aligned with ``x_train``, indexable the same way.
    x_test : test feature matrix.

    Returns
    -------
    (oof_train, oof_test) : column vectors of shape ``(n_train, 1)`` and
        ``(n_test, 1)``.
    """
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_folds = kf.get_n_splits()

    oof_train = np.zeros((n_train,))
    # One row of test predictions per fold; averaged after the loop.
    oof_test_by_fold = np.empty((n_folds, n_test))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.fit(x_train[train_index], y_train[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_by_fold[i, :] = clf.predict(x_test)

    oof_test = oof_test_by_fold.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# get_oof indexes with integer arrays, so everything is converted to plain
# NumPy arrays; the target becomes a flat 1-D array.
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
X_train_cat, X_test_cat = X_train_cat.to_numpy(), X_test_cat.to_numpy()
y_train = y_train.to_numpy().ravel()

In [None]:
# Out-of-fold predictions per base model. LightGBM and CatBoost are given
# the *_cat matrices; the other three use the encoded/imputed matrices.
rf_oof_train, rf_oof_test = get_oof(clf=model1, x_train=X_train, y_train=y_train, x_test=X_test)
gb_oof_train, gb_oof_test = get_oof(clf=model2, x_train=X_train, y_train=y_train, x_test=X_test)
lgb_oof_train, lgb_oof_test = get_oof(clf=model3, x_train=X_train_cat, y_train=y_train, x_test=X_test_cat)
xgb_oof_train, xgb_oof_test = get_oof(clf=model4, x_train=X_train, y_train=y_train, x_test=X_test) # set reg:squarederror?
cat_oof_train, cat_oof_test = get_oof(clf=model5, x_train=X_train_cat, y_train=y_train, x_test=X_test_cat)

In [None]:
# Meta-model inputs: one column per base model, in the order
# RF, GB, LightGBM, XGBoost, CatBoost.
x_train = np.hstack([rf_oof_train, gb_oof_train, lgb_oof_train, xgb_oof_train, cat_oof_train])

x_test = np.hstack([rf_oof_test, gb_oof_test, lgb_oof_test, xgb_oof_test, cat_oof_test])

In [None]:
# META_MODEL = lgb.LGBMRegressor(
#     num_leaves=5,
#     max_depth=7, 
#     random_state=SEED, 
#     silent=True, 
#     metric='mse',
#     n_jobs=4, 
#     n_estimators=200,
#     colsample_bytree=1,
#     subsample=0.9,
#     learning_rate=0.05
# )
# META_MODEL = LinearRegression(
#     n_jobs=-1,
# )
# Ridge regression (5-fold CV) on the stacked base-model predictions.
META_MODEL = RidgeCV(cv=5).fit(x_train, y_train)
# The models were trained on log2(price); raise 2 to the prediction to get
# prices back.
log2_predictions = META_MODEL.predict(x_test)
final_predictions = np.power(2, log2_predictions)

# final_predictions = np.average(
#     [
#      rf_oof_test,
#      gb_oof_test,
#      lgb_oof_test,
#      xgb_oof_test,
#      cat_oof_test
#     ],
#     weights = 1 / acc['RMSLE']**9,
#     axis=0
# )
# final_predictions = np.power(2, final_predictions)
final_predictions

In [None]:
# Print position and value of every prediction below y_min.
# NOTE(review): y_min is not assigned anywhere in the visible notebook -
# confirm where it is meant to come from before running this cell.
for i, predicted_price in enumerate(final_predictions):
    if predicted_price < y_min:
        print(i, predicted_price)

In [None]:
# Smallest predicted price, as a quick sanity check.
min_pred = min(final_predictions)
print(min_pred)

# To CSV

In [None]:
# Submission table: test apartment ids next to their predicted prices.
submission = pd.DataFrame({
    'id': data_test.id,
    'price_prediction': final_predictions,
})
submission

# Construct submission dataframe
# submission = pd.DataFrame()
# submission['id'] = data_test.id
# submission.loc[~X_test_nan, 'price_prediction'] = prediction # Predict on non-nan entries
# submission['price_prediction'].fillna(y_train.mean(), inplace=True) # Fill missing entries with mean predictor
# print(f'Generated {len(submission)} predictions')

# submission.loc[~X_test_nan, 'price_prediction'] = prediction # Predict on non-nan entries
# submission['price_prediction'].fillna(y_train.mean(), inplace=True) # Fill missing entries with mean predictor
# print(f'Generated {len(submission)} predictions')

In [None]:
# Write the submission without the index column.
submission.to_csv('mandagsSubmissionReproduceLinearRegressor.csv', index=False)

In [13]:
# Replaces data_all with a frame read from disk.
# NOTE(review): presumably a previously exported version of the processed
# frame - the export step is not visible in this notebook.
data_all = pd.read_csv("../input/moscow/data_allNew.csv")

In [46]:
data_all

In [None]:
# Base learners, all sharing SEED. These definitions repeat the ones given
# earlier in the notebook with the same settings.
SEED=42
# model1: random forest.
model1 = RandomForestRegressor(
    n_estimators=1500,
    n_jobs=-1,
    random_state=SEED,
    verbose=1,
)
# model2: scikit-learn gradient boosting.
model2 = GradientBoostingRegressor(
    n_estimators=600,
    learning_rate=0.06,
    min_samples_leaf=4, 
    max_depth=9, 
    random_state=SEED,
    verbose=1,
)
# model3: LightGBM; the categorical columns come from lgbm_categorical.
model3 = lgb.LGBMRegressor(
    n_estimators=6000,
    learning_rate=0.08,
    num_leaves=10,
    random_state=SEED, 
    n_jobs=-1,
    categorical_feature = lgbm_categorical,
)
# model4: XGBoost. Both random_state and seed are given the same value.
model4 = xgb.XGBRegressor(
    n_estimators=5000,
    learning_rate=0.14,
    n_jobs=-1, 
    random_state=SEED,
    max_depth = 4,
    seed=SEED,
    verbosity=1,
)
# model5: CatBoost; the explicit cat_features list is currently disabled.
model5 = CatBoostRegressor(
    n_estimators=2500,
    learning_rate=0.1,
    thread_count=-1,
    depth=9,
    random_seed=SEED,
    silent=True,
#     cat_features = ["layout", "condition", "new", "material", "seller", "parking", "heating", "district"],
)

In [14]:
def root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true, y_pred : array-likes of non-negative numbers with matching
        shapes. Plain lists and tuples are accepted as well as NumPy arrays
        and pandas Series.

    Raises
    ------
    AssertionError
        If either input contains a negative value.
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    # Coerce first so that the element-wise comparisons below also work for
    # plain Python sequences.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert (y_true >= 0).all(), "y_true must be non-negative"
    assert (y_pred >= 0).all(), "y_pred must be non-negative"
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

In [47]:
# Training rows only. NOTE(review): X_train is reassigned in the following
# cell, so this value is short-lived.
X_train = data_all[data_all['Split']=='Train']

In [48]:
# Target and features are both taken from the training rows only. Building
# them from all of data_all would also feed the 'Test' rows (which have no
# real price) into the hyper-parameter search.
_train_only = data_all[data_all['Split'] == 'Train']
y_train = _train_only.price
X_train = _train_only.drop('price', axis=1)


In [50]:
#X_train = X_train.drop('Unnamed: 0', axis=1)
# 'Split' is a text marker, not a feature.
X_train = X_train.drop('Split', axis=1)

In [51]:
X_train.columns

In [37]:
def root_mean_squared_log_error(y_true, y_pred):
    """Root mean squared logarithmic error between two non-negative arrays.

    Equivalent to ``sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5``.
    An AssertionError is raised when either input holds a negative value.
    """
    for values in (y_true, y_pred):
        assert (values >= 0).all()
    # log1p(x) = log(1 + x), so zero values are handled.
    squared_log_diff = (np.log1p(y_pred) - np.log1p(y_true)) ** 2
    return np.mean(squared_log_diff) ** 0.5

In [None]:


import optuna
import lightgbm as lgb
def objective(trial,data=X_train,target=y_train):
    """Optuna objective: mean 5-fold RMSLE of a GPU XGBoost regressor.

    ``target`` is log2-transformed here and predictions are transformed back
    before scoring, so the returned value is an RMSLE on the price scale
    (lower is better). ``data`` is indexed with ``.iloc`` and ``target``
    must provide ``.to_numpy()``.

    Tuned: n_estimators, max_depth, learning_rate, subsample, gamma.
    Options that were explored earlier and are currently switched off:
    min_samples_leaf, num_leaves, boosting/booster, objective,
    min_child_wheight, min_samples_split.
    """
    log_target = deepcopy(np.log2(target.to_numpy()))
    features = deepcopy(data)

    k = 5
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    # Hyper-parameters sampled for this trial.
    search = {
        'n_estimators': trial.suggest_categorical('n_estmators', [600,1500,3000,5000,6500,7000,9000]),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,6,8,10,12,15]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.001,0.01,0.06,0.1, 0.14, 0.3,0.5]),
        'subsample': trial.suggest_float("subsample", 0.3, 1.0, step=0.1),
        'gamma': trial.suggest_float("gamma", 0, 5, step=1),
        'n_jobs': -1,
    }

    # Recorded result of an earlier search, kept for reference:
    # Trial 25 finished with value: 0.12818131318606005 and parameters:
    # {'n_estmators': 7000, 'max_depth': None, 'learning_rate': 0.06, 'num_leaves': 24}.

    regressor = xgb.XGBRegressor(
        seed=42,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        **search
    )

    def _fold_rmsle(fit_rows, eval_rows):
        """Fit on one fold's training rows and score its held-out rows."""
        regressor.fit(features.iloc[fit_rows, :], log_target[fit_rows])
        predicted_price = np.power(2, regressor.predict(features.iloc[eval_rows, :]))
        actual_price = np.power(2, log_target[eval_rows])
        return root_mean_squared_log_error(y_true=actual_price, y_pred=predicted_price)

    fold_scores = [
        _fold_rmsle(fit_rows, eval_rows)
        for fit_rows, eval_rows in folds.split(features)
    ]
    return sum(fold_scores) / k

# Run 50 trials, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# One row per trial.
study.trials_dataframe()

In [None]:


import optuna
import lightgbm as lgb
def objective(trial,data=X_train,target=y_train):
    """Optuna objective: mean 5-fold RMSLE of a GPU CatBoost regressor.

    ``target`` is log2-transformed here and predictions are transformed back
    before scoring, so the returned value is an RMSLE on the price scale
    (lower is better). ``data`` is indexed with ``.iloc`` and ``target``
    must provide ``.to_numpy()``.

    Tuned: n_estimators, learning_rate, num_leaves, min_child_samples and
    depth. bootstrap_type is fixed to 'Bernoulli' and grow_policy to
    'Lossguide'. Alternatives explored earlier (max_depth, subsample, gamma,
    l2_leaf_reg, colsample_bylevel, boosting_type, min_data_in_leaf,
    one_hot_max_size, other bootstrap types) are currently switched off.
    """
    log_target = deepcopy(np.log2(target.to_numpy()))
    features = deepcopy(data)

    k = 5
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)

    # Hyper-parameters for this trial.
    catboost_params = {
        'n_estimators': trial.suggest_categorical('n_estmators', [600,1500,2500,5000,6500,7000,9000]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.001,0.01,0.06,0.1, 0.14, 0.3,0.5]),
        'num_leaves': trial.suggest_categorical('num_leaves', [None, 4,17,21,24,27,50]),
        # The value is registered with Optuna under the name 'min_samples_split'.
        'min_child_samples': trial.suggest_categorical('min_samples_split', [None, 2, 4, 8, 16]),
        "depth": trial.suggest_categorical("depth", [4,5,6,8,10,12,15]),
        "bootstrap_type": 'Bernoulli',
        'grow_policy': 'Lossguide',
    }

    # Recorded result of an earlier search, kept for reference:
    # Trial 25 finished with value: 0.12818131318606005 and parameters:
    # {'n_estmators': 7000, 'max_depth': None, 'learning_rate': 0.06, 'num_leaves': 24}.

    regressor = CatBoostRegressor(
        random_seed=42,
        task_type='GPU',
        silent=True,
        **catboost_params
    )

    score_total = 0
    for fit_rows, eval_rows in splitter.split(features):
        regressor.fit(features.iloc[fit_rows, :], log_target[fit_rows])

        # Back to the price scale before scoring.
        predicted_price = np.power(2, regressor.predict(features.iloc[eval_rows, :]))
        actual_price = np.power(2, log_target[eval_rows])
        score_total = score_total + root_mean_squared_log_error(y_true=actual_price, y_pred=predicted_price)

    return score_total / k

# Run 50 trials, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# One row per trial.
study.trials_dataframe()

In [None]:
# XGBoost and CatBoost base learners, with the same settings as the earlier
# definitions of model4 and model5. Both depend on SEED being defined.
model4 = xgb.XGBRegressor(
    n_estimators=5000,
    learning_rate=0.14,
    n_jobs=-1, 
    random_state=SEED,
    max_depth = 4,
    seed=SEED,
    verbosity=1,
)
# The explicit cat_features list is currently disabled.
model5 = CatBoostRegressor(
    n_estimators=2500,
    learning_rate=0.1,
    thread_count=-1,
    depth=9,
    random_seed=SEED,
    silent=True,
#     cat_features = ["layout", "condition", "new", "material", "seller", "parking", "heating", "district"],
)