In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('cleaned_data_200k.csv')

### Creating and training the machine learning model to predict price of the car

The initial phase involves selecting a set of features and a target variable from a DataFrame. The chosen features include various car attributes such as body type, fuel economy, engine type, and exterior color, with the car's price being the target variable.

Then we preprosess the data. Categorical features undergo One-Hot Encoding, a process that converts these variables into a binary representation suitable for algorithmic processing. Numerical features are left unchanged.

The data is then divided into training and testing subsets.

The core of this process is the implementation of a LightGBM regressor. We decided to use this model because it delivered superior results compared to other algorithms we tested (random forest, linear regression, catboost and xgboost). It operates by constructing gradient boosting decision trees and employing a gradient-based one-side sampling and exclusive feature bundling, which enhances prediction accuracy and reduces the risks of overfitting. LightGBM is also known for its efficiency with large datasets and speed, making it a robust alternative to traditional random forest models.

Upon training the model with the training dataset, it is employed to predict the prices of cars in the test dataset. The evaluation of these predictions is conducted using key performance metrics: Root Mean Squared Error (RMSE), and the R^2 Score. These metrics are integral with assessing the model's predictive accuracy.

RMSE tells us how much the prediction differs from actual price in average
R^2 Score tells us how accurate the models predictions are

In [None]:
# Selecting the desired features for the model and the target variable
X = df[['body_type', 'city_fuel_economy', 'engine_type', 'exterior_color', 'fuel_tank_volume', 'fuel_type', 'highway_fuel_economy', 'horsepower', 'isCab', 'make_name', 'maximum_seating', 'mileage', 'model_name', 'seller_rating', 'torque', 'transmission', 'wheel_system', 'year', 'damage_history', 'major_options_count'
]]  # Features
y = df['price']  # Target variable

# One-Hot Encoding categorial features
categorical_features = ['body_type', 'engine_type', 'damage_history', 'fuel_type', 'isCab', 'make_name', 'transmission', 'wheel_system']
numerical_features = ['city_fuel_economy', 'highway_fuel_economy', 'exterior_color', 'fuel_tank_volume', 'horsepower', 'mileage', 'model_name', 'major_options_count', 'seller_rating', 'torque', 'year' ]

# Creating preprocessing pipelines for categorical features
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Applies One-Hot Encoding
])

# No transformation for numerical features in this pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('num', 'passthrough', numerical_features)  # No changes to numerical features
    ]
)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a Random Forest model
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=1, min_samples_split=2, random_state=42, n_jobs=6)
# Consider setting max_depth if necessary!

# Creating the final pipeline including preprocessing and the model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Training the model with the training data
pipeline.fit(X_train, y_train)

# Predicting the 'price' for the test data
y_pred = pipeline.predict(X_test)

# Calculating the performance of the predictions
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)

# Printing performance metrics
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

In [36]:
# Calculating the performance of the predictions
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)

# Printing performance metrics
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

Root Mean Squared Error: 3737.1972354273917
R^2 Score: 0.9540277916212505


### Tulostetaan satunnaista 15 arvausta sekä autojen oikeat arvot. Tästä voimme katsoa kuinka tarkasti malli ennusti auton arvon 15 satunnaisen auton kohdalla. 

In [21]:
# Comparing predicted prices with actual prices
comparison_df = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': y_pred})
comparison_df['Difference'] = comparison_df['Predicted Price'] - comparison_df['Actual Price']
comparison_df['Difference%'] = np.abs(comparison_df['Difference'] / comparison_df['Actual Price'] * 100)

# Sorting the DataFrame by the difference percentage to see the predictions with the biggest discrepancies
comparison_df.sort_values(by='Difference%', ascending=False, inplace=True)

# Reset index for better readability
comparison_df.reset_index(drop=True, inplace=True)

# Generate a random sample from the comparison dataframe
random_comparison_sample = comparison_df.sample(n=15, random_state=None)  # 'n' is the number of samples

# Display the random sample
print(random_comparison_sample)

       Actual Price  Predicted Price    Difference  Difference%
78          19800.0     34151.994750  14351.994750    72.484822
13910       39380.0     36595.431408  -2784.568592     7.071022
22315       18995.0     19721.952024    726.952024     3.827070
901         14968.0     19141.045654   4173.045654    27.879781
29743       33980.0     34625.344868    645.344868     1.899190
27958       33497.0     32722.442972   -774.557028     2.312318
27728       24433.0     23855.664140   -577.335860     2.362935
30762       36340.0     35730.650458   -609.349542     1.676801
25303       26585.0     27378.418168    793.418168     2.984458
11902       71700.0     65914.065170  -5785.934830     8.069644
3062        24962.0     29271.539171   4309.539171    17.264399
4312        33495.0     28533.820319  -4961.179681    14.811702
1595        25120.0     19466.946752  -5653.053248    22.504193
799          7795.0     10082.814996   2287.814996    29.349775
2183        12995.0     15578.313643   2