In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import tensorflow as tf

# Check if GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

/kaggle/input/playground-series-s4e9/sample_submission.csv
/kaggle/input/playground-series-s4e9/train.csv
/kaggle/input/playground-series-s4e9/test.csv
Num GPUs Available:  1


# Data Manipulation

In [2]:
df = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')


## Cleaning Data

In [3]:
# Extract Horsepower (HP)
df['HP'] = df['engine'].str.extract(r'(\d+\.?\d*)HP', expand=False)
df['HP'] = pd.to_numeric(df['HP'], errors='coerce')

# Extract Engine Size in Liters (L)
df['EngineSize_L'] = df['engine'].str.extract(r'(\d+\.?\d*)L', expand=False)
df['EngineSize_L'] = pd.to_numeric(df['EngineSize_L'], errors='coerce')

# Extract Cylinder Number
df['CylinderNumber'] = df['engine'].str.extract(r'(\d+)\s*Cylinder', expand=False)
df['CylinderNumber'] = pd.to_numeric(df['CylinderNumber'], errors='coerce').astype('str')

# Extract Fuel Type
df['FuelType'] = df['engine'].str.extract(r'Engine\s+(.+?)\s+Fuel', expand=False)

df['EngineSize_L'] = df['EngineSize_L'].replace(np.nan, 0)
df.drop(['engine','FuelType'], axis = 1, inplace=True)

df['CylinderNumber']=df['CylinderNumber'].replace('nan',"Not Reported")

df['accident']=df['accident'].replace('nan', 'yes')
df['accident']=df['accident'].replace('None reported', 'no')
df['accident']=df['accident'].replace('At least 1 accident or damage reported', 'yes')

df['clean_title']=df['clean_title'].replace('nan', 'no')

df['fuel_type']=df['fuel_type'].replace(np.nan,'–')
df['fuel_type']=df['fuel_type'].replace('not supported','–')
df['fuel_type']=df['fuel_type'].replace('E85 Flex Fuel','Gasoline')
df['fuel_type']=df['fuel_type'].replace('Plug-In Hybrid','Hybrid')
df['clean_title']=df['clean_title'].replace(np.nan, 'no')
df['accident']=df['accident'].replace(np.nan, 'no')
df['HP']=df['HP'].replace(np.nan, 0)

# Function to classify as 'Manual' or 'Automatic'
def classify_transmission(trans):
    trans_lower = trans.lower()
    
    if 'manual' in trans_lower or 'mt' in trans_lower or 'm/t' in trans_lower:
        return 'Manual'
    else:
        return 'Automatic'

# Apply classification function
df['transmission_type'] = df['transmission'].apply(classify_transmission)

df.drop('transmission', axis=1 ,inplace = True)

## Encoding Data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Select features and target variable
X = df.drop(['price', 'id'], axis=1)
y = df['price']

# List of categorical columns that need one-hot encoding
categorical_cols = ['brand', 'model', 'fuel_type', 'ext_col', 'int_col', 
                    'accident', 'clean_title', 'CylinderNumber', 'transmission_type']

# List of numerical columns that need scaling
numerical_cols = ['model_year', 'milage', 'HP', 'EngineSize_L']

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Standardize numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_cols)  # One-hot encode categorical features
    ]
)

# Fit the preprocessor on the training data
X_train_transformed = preprocessor.fit_transform(X)

# Extract column names
# Get numerical column names
num_feature_names = numerical_cols

# Get categorical column names after one-hot encoding
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)

# Combine both sets of feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names)

# Convert the transformed array back to a DataFrame with the correct column names
X = pd.DataFrame(X_train_transformed, columns=all_feature_names, index=X.index)



# Neural Net

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Custom RMSE metric
def rmse(y_true, y_pred):
    y_true = K.cast(y_true, dtype='float32')
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Define the neural network architecture

model = Sequential()
model.add(Dense(256, input_dim=X.shape[1], activation='relu'))  
model.add(Dropout(0.3)) 
model.add(Dense(128, activation='relu'))                              
model.add(Dropout(0.3)) 
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2)) 
model.add(Dense(32, activation='relu'))                               
model.add(Dense(1))                                                   # Output layer for regression (no activation function)


# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[rmse])


# Train the model with early stopping
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

history = model.fit(X_train, y_train, 
                    epochs=100, 
                    batch_size=32, 
                    validation_split=0.2, 
                    verbose=1, 
                    callbacks=[early_stopping])
# Evaluate the model on the test data
test_loss, test_rmse = model.evaluate(X_test, y_test)
y_pred_nn = model.predict(X_test)

print(f"Test RMSE: {test_rmse}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100


I0000 00:00:1727543966.133512      68 service.cc:145] XLA service 0x795aa800cea0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727543966.133609      68 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m  86/3771[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 2ms/step - loss: 11119830016.0000 - rmse: 73857.0000

I0000 00:00:1727543971.056949      68 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3771/3771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - loss: 6177734656.0000 - rmse: 47355.0781 - val_loss: 6371491840.0000 - val_rmse: 44352.5586
Epoch 2/100
[1m3771/3771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 4448916992.0000 - rmse: 40367.0352 - val_loss: 6336737792.0000 - val_rmse: 44635.0000
Epoch 3/100
[1m3771/3771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 5207675904.0000 - rmse: 42538.1562 - val_loss: 6323976704.0000 - val_rmse: 44826.9414
Epoch 4/100
[1m3771/3771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 4702616576.0000 - rmse: 41102.5820 - val_loss: 6332716544.0000 - val_rmse: 44776.9062
Epoch 5/100
[1m3771/3771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 4947664896.0000 - rmse: 40996.3438 - val_loss: 6349118464.0000 - val_rmse: 45652.0000
Epoch 6/100
[1m3771/3771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss:

1179/1179 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 4432596480.0000 - rmse: 41750.2148 Test RMSE: 42117.45703125 - 72395.08036 King

2ms/step - loss: 4445777920.0000 - rmse: 40666.0234 Test RMSE: 40972.703125 with extra 16 neuron layer .2 dropuout in. last layer  - best score 72494.71653 

1179/1179 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 4412199936.0000 - rmse: 41141.9102
Test RMSE: 41484.37109375 - best score 72480.06759

1179/1179 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 4421828096.0000 - rmse: 41316.0977
Test RMSE: 41651.546875 - Best 72434.98778




# Gradient Boosting

In [6]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [7]:

'''# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define the fixed parameters
fixed_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'force_row_wise': True
}

# Initialize the model with fixed parameters
lgb_model = lgb.LGBMRegressor(**fixed_params)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'feature_fraction': [0.8, 0.9, 1.0]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid,
                           cv=5, scoring='neg_mean_squared_error', 
                           n_jobs=-1, verbose=1)'''

"# Create a LightGBM dataset\ntrain_data = lgb.Dataset(X_train, label=y_train)\ntest_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n\n# Define the fixed parameters\nfixed_params = {\n    'objective': 'regression',\n    'metric': 'rmse',\n    'boosting_type': 'gbdt',\n    'force_row_wise': True\n}\n\n# Initialize the model with fixed parameters\nlgb_model = lgb.LGBMRegressor(**fixed_params)\n\n# Define the parameter grid for hyperparameter tuning\nparam_grid = {\n    'num_leaves': [31, 50, 70],\n    'learning_rate': [0.01, 0.05, 0.1],\n    'n_estimators': [50, 100, 200],\n    'feature_fraction': [0.8, 0.9, 1.0]\n}\n\n# Initialize GridSearchCV\ngrid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid,\n                           cv=5, scoring='neg_mean_squared_error', \n                           n_jobs=-1, verbose=1)"

In [8]:
'''
fit_params = {
    'eval_set': [(X_test, y_test)],
    'eval_metric': 'rmse',
    'callbacks': [lgb.early_stopping(stopping_rounds=10)]}

# Fit the grid search with the fit parameters
grid_search.fit(X_train, y_train, **fit_params)

# Get the best model
best_model = grid_search.best_estimator_

# Access best_iteration_
best_iteration = best_model.best_iteration_
print(f"Best iteration: {best_iteration}")

# Predictions using the best iteration
y_pred = best_model.predict(X_test, num_iteration=best_iteration)
'''

'\nfit_params = {\n    \'eval_set\': [(X_test, y_test)],\n    \'eval_metric\': \'rmse\',\n    \'callbacks\': [lgb.early_stopping(stopping_rounds=10)]}\n\n# Fit the grid search with the fit parameters\ngrid_search.fit(X_train, y_train, **fit_params)\n\n# Get the best model\nbest_model = grid_search.best_estimator_\n\n# Access best_iteration_\nbest_iteration = best_model.best_iteration_\nprint(f"Best iteration: {best_iteration}")\n\n# Predictions using the best iteration\ny_pred = best_model.predict(X_test, num_iteration=best_iteration)\n'

In [9]:
'''best_model'''

'best_model'

In [10]:
'''#### Make predictions with LightGBM
y_pred_lgb = best_model.predict(X_test, num_iteration=best_iteration)

# Evaluate the model
rmse_lgb = mean_squared_error(y_test, y_pred_lgb, squared=False)
rmse_nn = mean_squared_error(y_test, y_pred_nn, squared=False)
print(f"LightGBM RMSE: {rmse_lgb}")
print(f"LightGBM NN: {rmse_nn}")'''

'#### Make predictions with LightGBM\ny_pred_lgb = best_model.predict(X_test, num_iteration=best_iteration)\n\n# Evaluate the model\nrmse_lgb = mean_squared_error(y_test, y_pred_lgb, squared=False)\nrmse_nn = mean_squared_error(y_test, y_pred_nn, squared=False)\nprint(f"LightGBM RMSE: {rmse_lgb}")\nprint(f"LightGBM NN: {rmse_nn}")'

## Ensemble

In [11]:
'''# Ensemble by averaging predictions
y_pred_ensemble = (y_pred_nn.flatten() + y_pred_lgb.flatten()) / 2

# Evaluate the ensemble
rmse_ensemble = mean_squared_error(y_test, y_pred_ensemble, squared=False)
print(f"Ensemble RMSE: {rmse_ensemble}")

print(f"LightGBM RMSE: {rmse_lgb}")
print(f"LightGBM NN: {rmse_nn}")'''


'# Ensemble by averaging predictions\ny_pred_ensemble = (y_pred_nn.flatten() + y_pred_lgb.flatten()) / 2\n\n# Evaluate the ensemble\nrmse_ensemble = mean_squared_error(y_test, y_pred_ensemble, squared=False)\nprint(f"Ensemble RMSE: {rmse_ensemble}")\n\nprint(f"LightGBM RMSE: {rmse_lgb}")\nprint(f"LightGBM NN: {rmse_nn}")'

# Model Training

## Neural Network

In [12]:
# Train the model with early stopping
history = model.fit(X, y, 
                    epochs=100, 
                    batch_size=32, 
                    validation_split=0.2, 
                    verbose=1, 
                    callbacks=[early_stopping])

Epoch 1/100
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - loss: 5108706816.0000 - rmse: 41812.9805 - val_loss: 5473142784.0000 - val_rmse: 42566.4922
Epoch 2/100
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 4993657856.0000 - rmse: 42025.0078 - val_loss: 5472196096.0000 - val_rmse: 42840.1680
Epoch 3/100
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 5258655744.0000 - rmse: 42319.1719 - val_loss: 5492331008.0000 - val_rmse: 43119.5781
Epoch 4/100
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 5021905408.0000 - rmse: 42324.3672 - val_loss: 5483990528.0000 - val_rmse: 43211.2852
Epoch 5/100
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 4557867520.0000 - rmse: 41067.5977 - val_loss: 5510266880.0000 - val_rmse: 44273.7070
Epoch 6/100
[1m4714/4714[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m

## GBM

In [13]:
# Combine X_train and X_test, y_train and y_test
X_full = pd.concat([X_train, X_test], axis=0)
y_full = pd.concat([y_train, y_test], axis=0)

# Create a LightGBM dataset with all data
full_data = lgb.Dataset(X_full, label=y_full)

# Set parameters for the model (tune based on your data)
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'force_row_wise': True,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model on the entire dataset
lgb_model_full = lgb.train(params, full_data, num_boost_round=100)

[LightGBM] [Info] Total Bins 4404
[LightGBM] [Info] Number of data points in the train set: 188533, number of used features: 1916
[LightGBM] [Info] Start training from score 43878.016178


# Test Data

In [14]:
df = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')

In [15]:
df

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
125685,314218,Mercedes-Benz,GL-Class GL 450 4MATIC,2014,83315,Gasoline,362.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,None reported,Yes
125686,314219,Audi,Q7 55 Prestige,2019,29336,Gasoline,3.0 Liter Turbo,Automatic,White,Black,None reported,
125687,314220,Audi,A6 3.0T Premium Plus,2012,77634,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes
125688,314221,Audi,Q7 3.0T Premium,2012,112000,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes


In [16]:
# Extract Horsepower (HP)
df['HP'] = df['engine'].str.extract(r'(\d+\.?\d*)HP', expand=False)
df['HP'] = pd.to_numeric(df['HP'], errors='coerce')

# Extract Engine Size in Liters (L)
df['EngineSize_L'] = df['engine'].str.extract(r'(\d+\.?\d*)L', expand=False)
df['EngineSize_L'] = pd.to_numeric(df['EngineSize_L'], errors='coerce')

# Extract Cylinder Number
df['CylinderNumber'] = df['engine'].str.extract(r'(\d+)\s*Cylinder', expand=False)
df['CylinderNumber'] = pd.to_numeric(df['CylinderNumber'], errors='coerce').astype('str')

# Extract Fuel Type
df['FuelType'] = df['engine'].str.extract(r'Engine\s+(.+?)\s+Fuel', expand=False)

df['EngineSize_L'] = df['EngineSize_L'].replace(np.nan, 0)
df.drop(['engine','FuelType'], axis = 1, inplace=True)

df['CylinderNumber']=df['CylinderNumber'].replace('nan',"Not Reported")

df['accident']=df['accident'].replace('nan', 'yes')
df['accident']=df['accident'].replace('None reported', 'no')
df['accident']=df['accident'].replace('At least 1 accident or damage reported', 'yes')

df['clean_title']=df['clean_title'].replace('nan', 'no')

df['fuel_type']=df['fuel_type'].replace(np.nan,'–')
df['fuel_type']=df['fuel_type'].replace('not supported','–')
df['fuel_type']=df['fuel_type'].replace('E85 Flex Fuel','Gasoline')
df['fuel_type']=df['fuel_type'].replace('Plug-In Hybrid','Hybrid')
df['clean_title']=df['clean_title'].replace(np.nan, 'no')
df['accident']=df['accident'].replace(np.nan, 'no')
df['HP']=df['HP'].replace(np.nan, 0)

# Function to classify as 'Manual' or 'Automatic'
def classify_transmission(trans):
    trans_lower = trans.lower()
    
    if 'manual' in trans_lower or 'mt' in trans_lower or 'm/t' in trans_lower:
        return 'Manual'
    else:
        return 'Automatic'

# Apply classification function
df['transmission_type'] = df['transmission'].apply(classify_transmission)

df.drop('transmission', axis=1 ,inplace = True)

# Select features and target variable
X = df.drop(['id'], axis=1)


# Extract column names
# Get numerical column names
num_feature_names = numerical_cols

# Get categorical column names after one-hot encoding
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)

# Combine both sets of feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names)

# Fit the preprocessor on the training data
X_test_transformed = preprocessor.transform(X)

# Convert the transformed array back to a DataFrame with the correct column names
X = pd.DataFrame(X_test_transformed, columns=all_feature_names, index=X.index)

In [17]:
# Make predictions on the test data
y_pred_nn = model.predict(X)
y_pred_lgb = lgb_model_full.predict(X)

[1m3928/3928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step


In [18]:
# Ensemble by averaging predictions
y_pred_ensemble = (0.7*y_pred_nn.flatten() + 0.3*y_pred_lgb.flatten())


In [19]:
# Assuming 'id' column was preserved from the original DataFrame's test split
test_results = pd.DataFrame({'id': df.loc[df.index, 'id'], 'predicted_price': y_pred_ensemble.flatten()})



In [20]:
test_results

Unnamed: 0,id,predicted_price
0,188533,19002.200667
1,188534,68787.864958
2,188535,54758.207685
3,188536,23771.907100
4,188537,29207.575536
...,...,...
125685,314218,24162.062198
125686,314219,47447.060782
125687,314220,18340.245061
125688,314221,15633.915284


In [21]:
# Save the predictions to a CSV file
test_results.to_csv('submission.csv', index=False)