In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Read the modelling data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='model_data.csv')

### Ominaisuuksien tärkeyden arviointi

Tässä on arvioitu ominaisuuksien tärkeyttä auton hinnan kannalta.

Arviointiin on käytetty random forest -mallia.

Käytössä on koko puhdistettu data, ja koodin ajaminen kesti kauan.

Parametrin `n_estimators` arvo on pieni, ja sitä kasvattamalla tulosten tarkkuutta voisi parantaa.

Tiedostossa `ominaisuudet_100k.ipynb` on ajettu sama koodi, mutta pienemmällä datatiedostolla.

Nämä tulokset antavat kyllä suuntaa siitä, mitkä ominaisuudet ovat tärkeitä auton hinnan kannalta.

In [3]:
# Estimate which model input columns matter most for predicting 'price'.
#
# Reads:    `df` (loaded in an earlier cell); it must contain a 'price' column.
# Produces: `pipeline` (fitted preprocessing + random forest) and
#           `importances_sorted` (one row per model input column, ordered by
#           the forest's `feature_importances_`), which is also printed.

# Set features and target variable
X = df.drop('price', axis=1)
y = df['price']

# Define categorical and numerical columns.
# NOTE: a column whose dtype is in neither list is not passed to the model,
# because ColumnTransformer drops unlisted columns by default.
categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create preprocessing pipeline.
# verbose_feature_names_out=False keeps the output names unprefixed
# ('horsepower', 'model_name_Thing', ...) so the fitted transformer itself can
# report the final column names; it raises if two output names would collide.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    verbose_feature_names_out=False
)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create random forest model
rf_model = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1)

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Get the learned feature importances from the fitted model step
feature_importances = pipeline.named_steps['model'].feature_importances_

# Ask the fitted ColumnTransformer for its output column names instead of
# rebuilding them by hand from a positional index into `transformers_`.
# The names come back in the same order as the columns fed to the model, and
# this also works when one of the column lists is empty.
transformed_feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Names produced by the one-hot encoder: everything after the passthrough
# numerical columns, which come first because 'num' is listed first.
encoded_feature_names = transformed_feature_names[len(numerical_features):]

# Create a list of all feature names
all_feature_names = transformed_feature_names.tolist()

# Create a DataFrame for feature importances
importances = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort features by their importance
importances_sorted = importances.sort_values(by='Importance', ascending=False)

# Print the most important features
print(importances_sorted)


                       Feature  Importance
1                   horsepower    0.431989
2                      mileage    0.220409
4                         year    0.058751
3                seller_rating    0.019506
0            city_fuel_economy    0.018987
...                        ...         ...
663   model_name_Falcon Futura    0.000000
1377     model_name_Terraplane    0.000000
1380          model_name_Thing    0.000000
649          model_name_F12tdf    0.000000
918         model_name_Mark IX    0.000000

[1547 rows x 2 columns]
