In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the prepared CSV extract into the working DataFrame used by the cells below
df = pd.read_csv(filepath_or_buffer='model_data_100k.csv')

### Malli
Random Forest

Kokeiltu myös Linear Regressionia, jolla sain huonoja tuloksia.

Mallia voi testata vaihtelemalla hyperparametreja, datatiedostoa ja käytettäviä sarakkeita.

Testissä on käytetty datatiedostoa, joka sisältää alkuperäisen datan 100 000 ensimmäistä riviä, jotta koodin suoritusaika lyhenisi.

Dataa on myös muokattu: joissain tapauksissa tyhjät arvot on korvattu arvolla False, ja tietyissä sarakkeissa tyhjät arvot on korvattu mediaanilla tai keskiarvolla jne.

##### HUOM: mallin tarkkuus on hyvä, mutta kun arvioinnissa tulee virheitä, hinnan heitto voi olla todella suuri.

In [10]:
# Train a Random Forest price model on the loaded frame `df`, report MSE / RMSE / R^2
# on a held-out split, and print a small sample of per-row prediction errors.

# Tunable settings for this cell, kept together so an experiment changes one place only.
RANDOM_STATE = 42        # shared seed: train/test split, forest, and the displayed sample
TEST_SIZE = 0.2          # fraction of rows held out for evaluation
N_ESTIMATORS = 100       # number of trees in the forest
COMPARISON_SAMPLE_SIZE = 15  # rows shown from the prediction-vs-actual table

# All the columns that can be used in training:
# "city_fuel_economy", "body_type",
# "horsepower", "exterior_color", "mileage",
# "make_name", "model_name", "year", "wheel_system", "seller_rating", "is_new",
# "is_cpo", "is_oemcpo", "isCab", "transmission", "fuel_type", "price"

# Feature lists are declared once; X is derived from them so the selected columns
# cannot drift out of sync with what the ColumnTransformer is told to expect.
numerical_features = ['mileage', 'horsepower', 'year', 'seller_rating', 'city_fuel_economy']
# One-Hot Encoded categorical features (the boolean flags are treated as categories too)
categorical_features = ['make_name', 'body_type', 'wheel_system', 'isCab', 'transmission', 'fuel_type', "is_cpo", "is_oemcpo"]

X = df[numerical_features + categorical_features]  # Features
y = df['price']  # Target variable

# Preprocessing pipeline for categorical features.
# handle_unknown='ignore' lets the encoder transform categories that were not
# present when it was fitted instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Applies One-Hot Encoding
])

# Numerical features are passed through untouched
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('num', 'passthrough', numerical_features)  # No changes to numerical features
    ]
)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Creating a Random Forest model
model = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=6)
# Consider setting max_depth if necessary!

# Final pipeline: preprocessing followed by the model, so the encoder is fitted
# on the training split only.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Training the model with the training data
pipeline.fit(X_train, y_train)

# Predicting the 'price' for the test data
y_pred = pipeline.predict(X_test)

# Calculating the performance of the predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = r2_score(y_test, y_pred)

# Printing performance metrics
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')


# Comparing predicted prices with actual prices
comparison_df = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': y_pred})
comparison_df['Difference'] = comparison_df['Predicted Price'] - comparison_df['Actual Price']
comparison_df['Difference%'] = np.abs(comparison_df['Difference'] / comparison_df['Actual Price'] * 100)

# Sort by the percentage error (largest first) and renumber the rows, so the index
# of each row is its rank by discrepancy. Chained instead of inplace=True so that
# re-running the cell always rebuilds the frame from scratch.
comparison_df = (
    comparison_df
    .sort_values(by='Difference%', ascending=False)
    .reset_index(drop=True)
)

# Draw a seeded sample so the displayed table is reproducible across runs, and cap
# the sample size so a small test set does not make sample() raise.
random_comparison_sample = comparison_df.sample(
    n=min(COMPARISON_SAMPLE_SIZE, len(comparison_df)),
    random_state=RANDOM_STATE,
)

# Display the random sample
print(random_comparison_sample)


Mean Squared Error: 27809348.09484101
Root Mean Squared Error: 5273.456939697243
R^2 Score: 0.9271776014121227
       Actual Price  Predicted Price   Difference  Difference%
9328        29740.0     31651.819385  1911.819385     6.428444
372          9995.0     14356.160000  4361.160000    43.633417
14160       23737.0     24382.094843   645.094843     2.717676
18288       40988.0     41240.150000   252.150000     0.615180
12440       47368.0     49188.519683  1820.519683     3.843353
15324       38368.0     39159.261503   791.261503     2.062295
4406        42795.0     37226.990000 -5568.010000    13.010889
14702       29770.0     29052.993964  -717.006036     2.408485
16969       26545.0     26225.610308  -319.389692     1.203201
1651         2995.0      2325.340000  -669.660000    22.359265
3047        20395.0     17058.070000 -3336.930000    16.361510
13784       40998.0     42210.960000  1212.960000     2.958583
6809         5900.0      5354.340000  -545.660000     9.248475
19931  