In [109]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [110]:
# Load the raw competition files from the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [111]:
# Keep the first occurrence of every fully identical row and renumber the index.
# NOTE(review): the comparison covers every column; if train.csv carries a
# unique Id column no row can ever match another one — confirm this is intended.
df_train = df_train.drop_duplicates().reset_index(drop=True)

In [112]:
# Train on log1p(price); the prediction cells invert this with np.expm1.
# Caution: this overwrites the column, so re-running the cell applies the log twice.
df_train['price'] = np.log1p(df_train['price'])

In [113]:
# Stack train on top of test so feature engineering is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
frames = [df_train, df_test]
data = pd.concat(frames, ignore_index=True, sort=False)

In [114]:
KM_PER_MILE = 1.60934


def _running_to_km(raw):
    """Turn a mileage string into a number of kilometres.

    The first whitespace-separated token is parsed as an integer. Strings that
    contain 'km' are returned as-is; anything else is multiplied by
    KM_PER_MILE (presumably those values are given in miles — verify against
    the raw data).
    """
    if 'km' in raw:
        return int(raw.split()[0])
    return int(raw.split()[0]) * KM_PER_MILE


data['running'] = data['running'].apply(_running_to_km)

In [115]:
# Compress the mileage with log1p.
# Caution: this overwrites the column, so re-running the cell applies the log twice.
data['running'] = np.log1p(data['running'])

In [116]:
# Remove the columns that are not used as model features.
data = data.drop(columns=['wheel', 'Id'])

In [117]:
data["running_per_motor"] = data["running"] / data["motor_volume"]
data['running_per_year'] = data['running'] / (2024-data['year'])

In [118]:
# One-hot encode the remaining categorical columns as 0/1 integers,
# then renumber the index.
encoded = pd.get_dummies(data, dtype=int)
data = encoded.reset_index(drop=True)

In [133]:
# Display the encoded frame (train and test rows together) as a sanity check.
data

Unnamed: 0,year,running,motor_volume,price,running_per_motor,running_per_year,model_hyundai,model_kia,model_mercedes-benz,model_nissan,...,type_hatchback,type_minivan / minibus,type_pickup,type_sedan,type_suv,status_crashed,status_excellent,status_good,status_new,status_normal
0,2022,8.006701,2.0,10.106469,4.003350,4.003350,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1,2014,11.790565,2.0,10.146473,5.895282,1.179056,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,2018,11.937463,2.0,9.367430,5.968731,1.989577,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
3,2002,12.303565,3.2,9.392745,3.844864,0.559253,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
4,2017,11.775297,2.0,10.165890,5.887649,1.682185,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,2021,10.416341,2.0,,5.208171,3.472114,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
2040,2017,11.477934,2.0,,5.738967,1.639705,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2041,2012,12.292255,2.0,,6.146127,1.024355,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
2042,2020,11.072474,2.0,,5.536237,2.768119,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


In [119]:
# Separate the combined frame again: rows with a price go to train, rows with
# a missing price go to test (presumably test.csv has no price column, so the
# concat filled it with NaN — verify).
has_price = data["price"].notna()
train = data[has_price]
test = data[~has_price]

In [120]:
# Feature matrices and the (log-scaled) target.
target_col = 'price'
X = train.drop(columns=target_col)
y = train[target_col]
X_test = test.drop(columns=target_col)

In [121]:
# Hyperparameters for the XGBoost regressor. The objective is absolute error,
# evaluated on the log1p-transformed price that `y` holds.
params = dict(
    booster='gbtree',
    max_depth=3,
    max_leaves=769,
    learning_rate=0.04538451353216046,
    n_estimators=1171,
    min_child_weight=13,
    subsample=0.6578720167306904,
    reg_alpha=0.4622943878867952,
    reg_lambda=0.6211309481623339,
    colsample_bylevel=0.7985625445322192,
    colsample_bytree=0.9634723040072963,
    colsample_bynode=0.49814271378837316,
    random_state=42,
    objective='reg:absoluteerror',
    n_jobs=-1,
)

# Build the estimator and fit it on the full training set.
model_xgb = XGBRegressor(**params)
model_xgb.fit(X, y)

In [122]:
# Predict with the fitted XGBoost estimator, which is bound to `model_xgb`
# (no `xgb` name is defined in the visible notebook, so the previous call
# raised NameError on a fresh kernel). expm1 undoes the log1p applied to the
# training target, so the stored prices are on the original scale.
df_test['price'] = np.expm1(model_xgb.predict(X_test))

In [123]:
# Keep only the identifier and the predicted price for the submission file.
submission = df_test.loc[:, ['Id', 'price']]

In [124]:
# Write the XGBoost submission without the DataFrame index column.
submission.to_csv(path_or_buf="xgb.csv", index=False)

# CatBoost


In [157]:
from sklearn.model_selection import GridSearchCV

In [158]:
# Base CatBoost regressor for the grid search: MAE loss, training log suppressed.
model = CatBoostRegressor(
    silent=True,
    loss_function='MAE',
)

In [163]:
# Search space for the CatBoost grid search: 4 * 3 * 3 * 3 = 108 combinations.
param_cat = dict(
    iterations=[10, 20, 30, 50],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[4, 6, 8],
    l2_leaf_reg=[3, 5, 7],
)

In [165]:
# Exhaustive 5-fold cross-validated search over `param_cat`, scored by
# negative mean absolute error; verbose=2 prints one line per fit.
full_model = GridSearchCV(
    model,
    param_cat,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=2,
)
 


In [166]:
# Run the grid search on the training features and the log-scaled target.
full_model.fit(X=X, y=y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.01; total time=   0.3s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.01; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.01; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.01; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.01; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.05; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.05; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.05; total time=   0.0s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.05; total time=   0.1s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.05; total time=   0.0s
[CV] END depth=4, iterations=10, l2_leaf_reg=3, learning_rate=0.1; 

In [167]:
# Show the parameter combination the grid search selected.
full_model.best_params_

{'depth': 6, 'iterations': 50, 'l2_leaf_reg': 7, 'learning_rate': 0.1}

In [168]:
# Predict with the estimator refit on the best parameters.
# The values are on the log1p(price) scale because `y` was log-transformed;
# apply np.expm1 before treating them as prices.
y_pred = full_model.best_estimator_.predict(X_test)

# Best model: XGBoost

In [122]:
# Final predictions come from the fitted XGBoost estimator, which is bound to
# `model_xgb` (no `xgb` name is defined in the visible notebook, so the
# previous call raised NameError on a fresh kernel). expm1 undoes the log1p
# applied to the training target, so the stored prices are on the original scale.
df_test['price'] = np.expm1(model_xgb.predict(X_test))

In [123]:
# Keep only the identifier and the predicted price for the submission file.
submission = df_test.loc[:, ['Id', 'price']]

In [124]:
# Write the final submission without the DataFrame index column.
submission.to_csv(path_or_buf="final.csv", index=False)