In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from lightgbm.callback import early_stopping

# Read the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='final_data.csv')

In [3]:
# Feature columns used by the model, grouped by how preprocessing treats them:
# categorical columns are one-hot encoded, numerical columns are passed through.
# These two lists are the single source of truth for the model inputs.
categorical_features = ['body_type', 'fuel_type', 'make_name', 'transmission']
numerical_features = [
    'city_fuel_economy',
    'highway_fuel_economy',
    'fuel_tank_volume',
    'horsepower',
    'mileage',
    'major_options_count',
    'seller_rating',
    'torque',
    'year',
]

# Build the feature frame from the lists above rather than repeating the column
# names by hand: a column listed in one place but not the other would otherwise
# be dropped silently by the ColumnTransformer (its default is remainder='drop').
# The transformer selects columns by name, so the column order of X is irrelevant.
X = df[categorical_features + numerical_features]  # Features
y = df['price']  # Target variable

# Categorical branch: a one-step pipeline that one-hot encodes its columns.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Column-wise preprocessing: encode the categorical columns and forward the
# numerical columns untouched.
preprocessor = ColumnTransformer([
    ('cat', categorical_pipeline, categorical_features),
    ('num', 'passthrough', numerical_features),
])

# 60/20/20 split: hold out 40% of the rows, then cut that hold-out in half to
# obtain the validation and test sets (fixed seed on both splits).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit the preprocessor on the training rows only, then apply the fitted
# transformer to the validation and test rows.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed, X_test_preprocessed = (
    preprocessor.transform(part) for part in (X_val, X_test)
)

# LightGBM regressor settings, gathered in one mapping.
lgbm_params = dict(
    objective='regression',
    n_estimators=21485,   # upper bound on boosting rounds
    learning_rate=0.037,
    num_leaves=180,
    max_depth=-1,         # -1 leaves tree depth unlimited
    n_jobs=7,
    random_state=42,
    min_child_samples=6,
)
# Options previously left disabled here: subsample=0.8, colsample_bytree=0.8,
# force_row_wise=True, force_col_wise=True.

model = lgb.LGBMRegressor(**lgbm_params)

# Train while monitoring the validation set: the callback stops training once
# the validation score has gone 150 rounds without improving.
stop_on_plateau = early_stopping(stopping_rounds=150, verbose=True)
model.fit(
    X_train_preprocessed,
    y_train,
    eval_set=[(X_val_preprocessed, y_val)],
    callbacks=[stop_on_plateau],
)

# Predict 'price' for the held-out test rows
y_pred = model.predict(X_test_preprocessed)

# Evaluation metrics: RMSE (square root of the mean squared error) and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report the metrics
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1417
[LightGBM] [Info] Number of data points in the train set: 1409224, number of used features: 84
[LightGBM] [Info] Start training from score 30691.202544
Training until validation scores don't improve for 150 rounds
Did not meet early stopping. Best iteration is:
[21461]	valid_0's l2: 8.91646e+06
Root Mean Squared Error: 3001.835109409577
R^2 Score: 0.9653814855057747


In [4]:
# Re-report the test-set metrics. `y_pred` is reused from the cell that trained
# the model, so no new prediction is made here.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Same two lines of output as before, emitted with a single print call
print(f'Root Mean Squared Error: {rmse}', f'R^2 Score: {r2}', sep='\n')

Root Mean Squared Error: 3001.835109409577
R^2 Score: 0.9653814855057747


In [11]:
# How many rows to display, and the seed that makes the displayed sample
# repeatable across runs.
SAMPLE_SIZE = 15
SAMPLE_SEED = 42  # set to None to draw a different sample on every run

# Compare predicted prices with actual prices.
# 'Difference' is predicted minus actual; 'Difference%' is the absolute
# difference expressed as a percentage of the actual price.
comparison_df = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': y_pred})
comparison_df['Difference'] = comparison_df['Predicted Price'] - comparison_df['Actual Price']
comparison_df['Difference%'] = np.abs(comparison_df['Difference'] / comparison_df['Actual Price'] * 100)

# Order rows from the largest to the smallest percentage error and renumber
# them, so the index is each row's error rank (0 = largest discrepancy).
# Reassigning instead of using inplace=True keeps this cell safe to re-run.
comparison_df = (
    comparison_df
    .sort_values(by='Difference%', ascending=False)
    .reset_index(drop=True)
)

# Draw a reproducible random sample; cap n so a small test set does not make
# sample() raise for asking for more rows than exist.
random_comparison_sample = comparison_df.sample(
    n=min(SAMPLE_SIZE, len(comparison_df)),
    random_state=SAMPLE_SEED,
)

# Display the random sample
print(random_comparison_sample)

        Actual Price  Predicted Price   Difference  Difference%
145619       12999.0     11994.136388 -1004.863612     7.730315
208243       67853.0     64105.911237 -3747.088763     5.522363
345230       32778.0     32054.204903  -723.795097     2.208173
348720       50685.0     51768.918438  1083.918438     2.138539
422610       37880.0     37581.272344  -298.727656     0.788616
168818       47495.0     44257.611070 -3237.388930     6.816273
379784       31515.0     31028.770748  -486.229252     1.542850
434764       44461.0     44201.839880  -259.160120     0.582893
206407       31995.0     30210.013324 -1784.986676     5.578955
414063        9990.0     10083.114023    93.114023     0.932072
250162       14500.0     15131.788387   631.788387     4.357161
314061       14675.0     14255.687729  -419.312271     2.857324
396986       19995.0     19748.248237  -246.751763     1.234067
249898       39999.0     41744.690592  1745.690592     4.364336
15470        23182.0     28099.979653  4