In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e9/sample_submission.csv
/kaggle/input/playground-series-s4e9/train.csv
/kaggle/input/playground-series-s4e9/test.csv


In [2]:
import warnings
# Ignore all warnings
warnings.filterwarnings('ignore')

## 1. Load the Dataset

In [3]:
import pandas as pd

# Load the datasets
train_data = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')  # Assuming train.csv contains training data
test_data = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')    # Assuming test.csv contains test data


In [4]:
train_data.sample(5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
141508,141508,Lexus,RX 350 F Sport Performance,2016,123256,Gasoline,295.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes,18500
181856,181856,Land,Rover Range Rover Sport HST MHEV,2023,4126,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,117500
94865,94865,GMC,Acadia Denali,2016,89000,Gasoline,288.0HP 3.6L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes,19500
100449,100449,BMW,340 i,2015,76465,Gasoline,320.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,29000
68395,68395,Porsche,Panamera Turbo,2018,7458,Gasoline,550.0HP 4.8L 8 Cylinder Engine Gasoline Fuel,8-Speed A/T,Red,Red,None reported,,162950


In [5]:
test_data.sample(5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
107763,296296,Mercedes-Benz,E-Class E 350,2014,59400,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,At least 1 accident or damage reported,Yes
106713,295246,Mercedes-Benz,AMG GLE 43 Coupe 4MATIC,2018,12000,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,9-Speed A/T,White,Brown,None reported,Yes
63747,252280,Mercedes-Benz,GLC 300 GLC 300,2022,41545,Gasoline,2.0 Liter Turbo,Automatic,Graphite Grey,–,None reported,
64878,253411,Porsche,911 Carrera S,2013,40800,Gasoline,385.0HP 3.8L Flat 6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Silver,Black,None reported,Yes
68319,256852,Toyota,Camry Solara SLE V6,2006,199400,Gasoline,225.0HP 3.3L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,At least 1 accident or damage reported,Yes


## 2. Data Preprocessing

### Handling missing values

In [6]:
# Check for missing values
print(train_data.isnull().sum())

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64


In [7]:
# Check for percentage of  missing values
print(train_data.isnull().sum()*100/len(train_data))


id               0.000000
brand            0.000000
model            0.000000
model_year       0.000000
milage           0.000000
fuel_type        2.696080
engine           0.000000
transmission     0.000000
ext_col          0.000000
int_col          0.000000
accident         1.300568
clean_title     11.360876
price            0.000000
dtype: float64


In [8]:
# Impute missing values for numerical features
#train_data.fillna(train_data.median(), inplace=True)

# Impute missing values for categorical features
#train_data.fillna(train_data.mode().iloc[0], inplace=True)


The error occurs because some columns in dataset are categorical (e.g., car make, model, fuel type, color, etc.), and the machine learning algorithm using expects numeric input. To fix this,we'll need to convert the categorical data into a numeric format that the model can handle.

We can use either **Label Encoding** or **One-Hot Encoding** for this conversion depending on whether the categorical variable is *ordinal* or *nominal*.

Here’s how we can correct the code by applying the appropriate encoding to the categorical features:

**1. Label Encoding for Ordinal Categorical Variables**

*For features that have a meaningful order (like 'low', 'medium', 'high'), use LabelEncoder.*

**2. One-Hot Encoding for Nominal Categorical Variables**

*For features that don’t have any specific order (like car brand, fuel type, etc.), use OneHotEncoder.*

### Feature Encoding

`Convert categorical features into numeric using one-hot encoding or label encoding`

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Identify categorical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns
categorical_cols


Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident', 'clean_title'],
      dtype='object')

In [10]:
# Apply OneHotEncoding for nominal categorical variables (e.g., car make, fuel type, etc.)
train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

**pd.get_dummies():** This function converts categorical columns into multiple binary (0/1) columns (one-hot encoding), which allows the model to interpret them.

**drop_first=True:** Drops the first category to avoid multicollinearity.

**Reindexing Test Data:** After applying one-hot encoding, the columns generated in the training and test sets may differ. Using reindex ensures that the test set has the same columns as the training set, with missing columns filled with zeros.

**Handling Target Column in Test Data:** Since the test set does not have the price column, we remove it from the test set to avoid errors during prediction.

In [11]:
train_data.sample(5)

Unnamed: 0,id,model_year,milage,price,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,...,int_col_Tupelo,int_col_Very Light Cashmere,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_None reported
181817,181817,2016,94000,19500,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
19548,19548,2022,9000,54999,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
64723,64723,2010,149460,12500,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
14266,14266,2015,102200,31999,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
103109,103109,2022,10071,27999,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [12]:
test_data.sample(5)

Unnamed: 0,id,model_year,milage,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_Tupelo,int_col_Very Light Cashmere,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_None reported
6386,194919,2021,15000,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
100976,289509,2021,22000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
92568,281101,2023,535,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
83579,272112,2018,23823,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
12388,200921,2003,142215,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [13]:
# Align columns of test data with training data (since one-hot encoding can produce different sets of columns)
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)
test_data.sample(5)

Unnamed: 0,id,model_year,milage,price,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,...,int_col_Tupelo,int_col_Very Light Cashmere,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_None reported
77040,265573,2023,6090,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
121580,310113,2016,86000,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14702,203235,2014,38400,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
97227,285760,2008,95000,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
38544,227077,2008,61000,0,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True


In [14]:
# Drop 'price' column from test data as it's not available in test
test_data.drop(columns=['price'], errors='ignore', inplace=True)

In [15]:
# Check to ensure the data is now numeric
print(train_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Columns: 3603 entries, id to accident_None reported
dtypes: bool(3599), int64(4)
memory usage: 652.9 MB
None


In [16]:
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Columns: 3602 entries, id to accident_None reported
dtypes: bool(3587), int64(15)
memory usage: 444.3 MB
None


## 3. Feature Engineering

`Create additional features (like car age, mileage categories, etc.) to improve model performance`

In [17]:
# Example: Create a new feature for car age
train_data['car_age'] = 2024 - train_data['model_year'] 
test_data['car_age'] = 2024 - test_data['model_year']


In [18]:
print(train_data['car_age'].head())

0    17
1    22
2    22
3     7
4     3
Name: car_age, dtype: int64


In [19]:
# Drop unnecessary columns like 'year'
train_data.drop('model_year', axis=1, inplace=True)
test_data.drop('model_year', axis=1, inplace=True)

## 4. Splitting the Data

*Split the data into features (**X**) and target (**y**)*

In [20]:
X = train_data.drop(columns=['price', 'id'])  # Drop target and id from features
y = train_data['price']

In [21]:
# Prepare the test features (without price column)
X_test = test_data.drop(columns=['id'])

## 5. Train-Test Split

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## 6. Model Training

- We will use Gradient Boosting or Random Forest, which are typically effective for tabular data. 
- Let’s use XGBoost for this purpose as it is fast and efficient.



In [23]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Initialize XGBoost regressor
xgb_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)

In [24]:
# Fit the model with early stopping
xgb_model.fit(
    X_train, 
    y_train, 
    eval_set=[(X_val, y_val)],  # Validation set for early stopping
    early_stopping_rounds=50,   # Stop if performance doesn't improve after 50 rounds
    verbose=False               # Suppress training output
)

In [25]:
# Make predictions on the validation set
y_val_pred = xgb_model.predict(X_val)

In [26]:
# Calculate RMSE
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f'Validation RMSE: {rmse}')

Validation RMSE: 69294.54372656536


## 7. Make Predictions on Test Set

In [27]:
# Make predictions on the test set
test_predictions = xgb_model.predict(X_test)

In [28]:
test_predictions

array([21629.504, 71963.125, 57056.19 , ..., 23772.69 , 14266.046,
       36072.9  ], dtype=float32)

In [29]:
# Prepare the submission file
submission = pd.DataFrame({'id': test_data['id'], 'price': test_predictions})

In [30]:
submission

Unnamed: 0,id,price
0,188533,21629.503906
1,188534,71963.125000
2,188535,57056.191406
3,188536,27783.203125
4,188537,36993.617188
...,...,...
125685,314218,24584.654297
125686,314219,50949.507812
125687,314220,23772.689453
125688,314221,14266.045898


In [31]:
#submission.to_csv('submission2.csv', index=False)