In [6]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c playground-series-s4e9
!unzip playground-series-s4e9.zip

Downloading playground-series-s4e9.zip to /content
  0% 0.00/7.84M [00:00<?, ?B/s] 64% 5.00M/7.84M [00:00<00:00, 48.6MB/s]
100% 7.84M/7.84M [00:00<00:00, 57.0MB/s]
Archive:  playground-series-s4e9.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [17]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [21]:
df = pd.read_csv('/content/train.csv')
df

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,Beige,None reported,Yes,27500
188529,188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes,30000
188530,188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,7-Speed A/T,White,Black,None reported,Yes,86900
188531,188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,1-Speed Automatic,Daytona Gray Pearl Effect,Black,None reported,,84900


In [22]:
for column in df.columns:
  print(column, len(df[column].unique()))

id 188533
brand 57
model 1897
model_year 34
milage 6651
fuel_type 8
engine 1117
transmission 52
ext_col 319
int_col 156
accident 3
clean_title 2
price 1569


In [23]:
def encode(df):
    df = df.drop(columns=['model', 'engine', 'brand', 'ext_col'])
    df['clean_title'] = df['clean_title'].fillna('No')
    df['model_year'] = df['model_year'] - 1973
    df = pd.get_dummies(df, columns=['fuel_type', 'transmission', 'int_col', 'accident'], prefix=['fuel_type', 'transmission', 'int_col', 'accident'])
    mapping = {False: 0, True: 1, 'Yes': 1, 'No': 0}
    df = df.replace(mapping)
    return df

In [24]:
df = df.dropna()
df = encode(df)
df.head()

Unnamed: 0,id,model_year,milage,clean_title,price,fuel_type_Diesel,fuel_type_E85 Flex Fuel,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Plug-In Hybrid,...,int_col_Very Light Cashmere,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported
0,0,34,213000,1,4200,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,29,143250,1,4999,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,29,136731,1,13900,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,44,19500,1,45000,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,48,7388,1,97500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
df.describe()

Unnamed: 0,id,model_year,milage,clean_title,price,fuel_type_Diesel,fuel_type_E85 Flex Fuel,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Plug-In Hybrid,...,int_col_Very Light Cashmere,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_At least 1 accident or damage reported,accident_None reported
count,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,...,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0,162610.0
mean,94197.039137,42.158822,71105.487086,1.0,40816.99,0.022459,0.032194,0.906445,0.031124,0.003044,...,4.3e-05,1.8e-05,0.000141,2.5e-05,0.021911,0.000344,0.000314,0.021807,0.247211,0.752789
std,54467.770815,5.699531,50157.874111,0.0,76538.57,0.14817,0.176514,0.29121,0.173652,0.055089,...,0.006561,0.004295,0.011892,0.00496,0.146394,0.018554,0.017707,0.146053,0.431392,0.431392
min,0.0,1.0,100.0,1.0,2000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47026.25,39.0,29000.0,1.0,15700.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,94139.0,43.0,64300.0,1.0,28500.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,141421.75,46.0,101000.0,1.0,46999.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,188532.0,51.0,405000.0,1.0,2954083.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162610 entries, 0 to 188532
Columns: 222 entries, id to accident_None reported
dtypes: int64(222)
memory usage: 276.7 MB


In [27]:
X = df.drop(columns=['price'])
y = df['price']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
params_xgb = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 8,
}

# Initialize and train the model
XGB = XGBRegressor()
# XGB.fit(X, y)

In [30]:
params_cb = {
    'iterations': 1200,
    'learning_rate': 0.1,
    'depth': 8,
    'l2_leaf_reg': 3,
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 1,
    'rsm': 1,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'logging_level': 'Silent',
}

# Initialize and train the model
CB = CatBoostRegressor()
# CB.fit(X, y)

In [31]:
RF = RandomForestRegressor()
# RF.fit(X, y)

In [32]:
lgb_reg = LGBMRegressor(
    objective='regression',
    num_leaves=120,
    learning_rate=0.1,
    n_estimators=100,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5
)
# lgb_reg.fit(X, y)

In [33]:
model = VotingRegressor(
    estimators=[('XGB', XGB), ('catboost', CB), ('LGB', lgb_reg), ('RF', RF)]
)
model.fit(X, y)

Learning rate set to 0.091525
0:	learn: 75848.8855657	total: 70.3ms	remaining: 1m 10s
1:	learn: 75253.1442573	total: 95.8ms	remaining: 47.8s
2:	learn: 74742.0358628	total: 116ms	remaining: 38.5s
3:	learn: 74333.2313159	total: 136ms	remaining: 33.9s
4:	learn: 73967.6766375	total: 155ms	remaining: 30.9s
5:	learn: 73669.7162593	total: 176ms	remaining: 29.2s
6:	learn: 73410.2103798	total: 196ms	remaining: 27.8s
7:	learn: 73183.9496232	total: 216ms	remaining: 26.8s
8:	learn: 72995.2564487	total: 235ms	remaining: 25.9s
9:	learn: 72817.6983304	total: 255ms	remaining: 25.3s
10:	learn: 72666.9767827	total: 274ms	remaining: 24.7s
11:	learn: 72537.0538523	total: 296ms	remaining: 24.4s
12:	learn: 72413.8988030	total: 316ms	remaining: 24s
13:	learn: 72317.6031553	total: 336ms	remaining: 23.7s
14:	learn: 72225.3006236	total: 354ms	remaining: 23.2s
15:	learn: 72142.3675467	total: 372ms	remaining: 22.9s
16:	learn: 72075.8000696	total: 392ms	remaining: 22.7s
17:	learn: 72014.8601067	total: 410ms	remain

In [34]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {np.sqrt(mse):.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.2f}")

Root Mean Squared Error: 57390.50
Mean Absolute Error: 15515.49
R² Score: 0.52


In [35]:
test = pd.read_csv('/content/test.csv')

for column in test.columns:
  if test[column].dtype == 'object':
    test[column].fillna(test[column].mode()[0], inplace=True)
  else:
    test[column].fillna(test[column].mean(), inplace=True)
test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,Yes
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,Yes
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [36]:
test = encode(test)

In [37]:
y_pred = model.predict(test)

# Create a DataFrame for the submission
submission_df = pd.DataFrame({'id': test['id'], 'price': y_pred})
submission_df['price'] = submission_df['price'].astype(int)

# Save the submission to a CSV file
submission_df.to_csv('submission.csv', index=False)



## This is the best model so far and scored on RMSE: 77223.43918

### Competition link:
https://www.kaggle.com/competitions/playground-series-s4e9