In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Read the prepared 100k-row dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='model_data_100k.csv')

### Ominaisuuksien tärkeyden arviointi

Tässä on arvioitu ominaisuuksien tärkeyttä auton hinnan kannalta.

Datana on käytetty puhdistettua ja muokattua dataa, jossa on alkuperäisen datatiedoston 100 000 ensimmäistä riviä. Koodin suoritus kestää hyvin kauan, jos käyttää suuria datatiedostoja.

Ominaisuuksien tärkeyden arviointiin on käytetty random forest -mallia.

Tulokset kuvastavat suhteellisen tarkasti, mitkä ominaisuudet ovat tärkeitä.

En tiedä, kuinka paljon tuloksiin vaikuttaa se, että dataa on muokattu ja sen tyhjät arvot on korvattu muun muassa keskiarvoilla.

In [7]:
# Estimate which features drive `price`: fit a random forest on the
# preprocessed data and rank the learned feature importances.
# Uses `df` from the loading cell; every variable the cell defined before
# (X, y, preprocessor, rf_model, pipeline, importances, ...) is still defined.

# Single seed shared by the train/test split and the forest, so a re-run
# reproduces the same ranking.
SEED = 42

# Set features and target variable
X = df.drop(columns='price')
y = df['price']

# Define categorical and numerical columns.
# 'number' matches every numeric dtype (e.g. int32/float32), not only
# int64/float64, so numeric columns are not lost because of their width.
categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_features = X.select_dtypes(include='number').columns.tolist()

# ColumnTransformer drops any column that is not listed in a transformer
# (default remainder='drop'). Fail loudly instead of silently ranking
# importances on an incomplete feature set.
assigned_columns = set(numerical_features) | set(categorical_features)
unassigned_columns = [col for col in X.columns if col not in assigned_columns]
if unassigned_columns:
    raise ValueError(
        "Columns with an unhandled dtype would be dropped from the model: "
        f"{unassigned_columns}"
    )

# Create preprocessing pipeline: numeric columns pass through unchanged,
# categorical columns are one-hot encoded (categories unseen during fit
# are encoded as all zeros instead of raising).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Create a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=SEED, n_jobs=6)
# Maybe try using different max_depth values

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

# Train the model with the training data
pipeline.fit(X_train, y_train)

# Get the learned feature importances from the fitted pipeline step
feature_importances = pipeline.named_steps['model'].feature_importances_

# Get preprocessed feature names. The encoder is looked up by its name
# ('cat') rather than by position, so reordering the transformers above
# cannot silently pick the wrong one.
fitted_preprocessor = pipeline.named_steps['preprocessor']
if categorical_features:
    encoded_feature_names = (fitted_preprocessor
                             .named_transformers_['cat']
                             .get_feature_names_out(categorical_features))
else:
    # NOTE(review): with no categorical columns the encoder is presumably
    # left unfitted, so there are no encoded names to collect.
    encoded_feature_names = []

# Create a list of all feature names, in the transformer output order
# (numeric passthrough first, then the one-hot columns).
all_feature_names = numerical_features + list(encoded_feature_names)

# Create a DataFrame for feature importances
importances = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort features by their importance
importances_sorted = importances.sort_values(by='Importance', ascending=False)

# Print the most important features
print(importances_sorted.head(15))

                        Feature  Importance
1                    horsepower    0.394254
2                       mileage    0.244415
4                          year    0.046512
102           model_name_250 GT    0.034530
131  model_name_599 GTB Fiorano    0.026344
52            make_name_Ferrari    0.025732
10       body_type_Pickup Truck    0.018990
3                 seller_rating    0.015386
0             city_fuel_economy    0.013234
729      model_name_SLR McLaren    0.011387
84        make_name_Rolls-Royce    0.006892
44                make_name_BMW    0.006577
144              model_name_911    0.005883
920            wheel_system_FWD    0.005876
919            wheel_system_AWD    0.004972
