In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# Load the dataset
df = pd.read_csv('cleaned_data.csv')

In [None]:
# Selecting the desired features for the model and the target variable
X = df[['body_type', 'city_fuel_economy', 'engine_type', 'exterior_color', 'fuel_tank_volume', 'fuel_type', 'highway_fuel_economy', 'horsepower', 'isCab', 'make_name', 'maximum_seating', 'mileage', 'model_name', 'seller_rating', 'torque', 'transmission', 'wheel_system', 'year', 'damage_history', 'major_options_count'
]]  # Features
y = df['price']  # Target variable

# One-Hot Encoding categorial features
categorical_features = ['body_type', 'engine_type', 'damage_history', 'fuel_type', 'isCab', 'make_name', 'transmission', 'wheel_system']
numerical_features = ['city_fuel_economy', 'highway_fuel_economy', 'exterior_color', 'fuel_tank_volume', 'horsepower', 'mileage', 'model_name', 'major_options_count', 'seller_rating', 'torque', 'year' ]

# Creating preprocessing pipelines for categorical features
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Applies One-Hot Encoding
])

# No transformation for numerical features in this pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('num', 'passthrough', numerical_features)  # No changes to numerical features
    ]
)

# Splitting the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Apply preprocessing to the training and validation data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

# Define the CatBoost model
model = CatBoostRegressor(
    iterations=12000, 
    learning_rate=0.01, 
    depth=10, 
    silent=True, 
    random_state=42
)

# Fit the model with early stopping
model.fit(
    X_train_preprocessed, y_train, 
    eval_set=(X_val_preprocessed, y_val), 
    early_stopping_rounds=200,  # Number of rounds to stop if no improvement
    verbose=200  # Set output verbosity
)

# Predicting the 'price' for the test data
y_pred = model.predict(X_test_preprocessed)

# Calculating the performance of the predictions
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Printing performance metrics
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

0:	learn: 15972.4144398	test: 15954.1956994	best: 15954.1956994 (0)	total: 155ms	remaining: 30m 55s
200:	learn: 6001.1142081	test: 5975.4341861	best: 5975.4341861 (200)	total: 27.8s	remaining: 27m 14s
400:	learn: 4838.6617844	test: 4819.3779729	best: 4819.3779729 (400)	total: 54.6s	remaining: 26m 18s
600:	learn: 4468.5220324	test: 4458.0632971	best: 4458.0632971 (600)	total: 1m 21s	remaining: 25m 38s
800:	learn: 4240.1086061	test: 4237.5918626	best: 4237.5918626 (800)	total: 1m 47s	remaining: 25m 8s
1000:	learn: 4077.3199066	test: 4082.0408278	best: 4082.0408278 (1000)	total: 2m 14s	remaining: 24m 35s
1200:	learn: 3958.1413599	test: 3968.6565492	best: 3968.6565492 (1200)	total: 2m 40s	remaining: 24m 6s
1400:	learn: 3864.0767836	test: 3878.9975767	best: 3878.9975767 (1400)	total: 3m 7s	remaining: 23m 40s
1600:	learn: 3786.1494883	test: 3804.9714221	best: 3804.9714221 (1600)	total: 3m 34s	remaining: 23m 12s
1800:	learn: 3723.7400614	test: 3746.5998516	best: 3746.5998516 (1800)	total: 4m 

In [3]:
# Predicting the 'price' for the test data
y_pred = model.predict(X_test_preprocessed)

# Calculating the performance of the predictions
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)

# Printing performance metrics
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

Root Mean Squared Error: 2975.021629307054
R^2 Score: 0.9659971736531674


In [16]:
# Comparing predicted prices with actual prices
comparison_df = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': y_pred})
comparison_df['Difference'] = comparison_df['Predicted Price'] - comparison_df['Actual Price']
comparison_df['Difference%'] = np.abs(comparison_df['Difference'] / comparison_df['Actual Price'] * 100)

# Sorting the DataFrame by the difference percentage to see the predictions with the biggest discrepancies
comparison_df.sort_values(by='Difference%', ascending=False, inplace=True)

# Reset index for better readability
comparison_df.reset_index(drop=True, inplace=True)

# Generate a random sample from the comparison dataframe
random_comparison_sample = comparison_df.sample(n=15, random_state=None)  # 'n' is the number of samples

# Display the random sample
print(random_comparison_sample)

       Actual Price  Predicted Price   Difference  Difference%
6835        33000.0     36984.528353  3984.528353    12.074328
16433       27995.0     29876.542345  1881.542345     6.720994
39861       26990.0     26980.505458    -9.494542     0.035178
36693       20459.0     20625.486816   166.486816     0.813758
22108       48905.0     46575.279630 -2329.720370     4.763767
20495       68870.0     65232.162687 -3637.837313     5.282180
28673       27935.0     27134.758810  -800.241190     2.864654
6769        59245.0     52060.845046 -7184.154954    12.126179
27193       19895.0     20543.127656   648.127656     3.257741
12708       11900.0     10906.206975  -993.793025     8.351202
39382       72981.0     73092.907375   111.907375     0.153338
809         16995.0     12595.813428 -4399.186572    25.885181
4830        34695.0     39588.409043  4893.409043    14.104076
6159        26988.0     30405.825796  3417.825796    12.664243
5782        63194.0     54969.452420 -8224.547580    13