In [None]:
#import sys
#!{sys.executable} -m pip install category_encoders

import pandas as pd
import numpy as np
import re
import joblib
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import randint, uniform

In [None]:
# Load the raw training data.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
car_data = pd.read_csv('F://Nedlastinger/cardata/train.csv')

# Separate the prediction target from the feature columns
target = 'price'
X = car_data.drop(columns=[target])
y = car_data[target]

# Remove outliers from the target variable (price): keep only rows whose
# price lies within the 1st-99th percentile range (bounds inclusive).
lower_limit = y.quantile(0.01)
upper_limit = y.quantile(0.99)
mask = y.between(lower_limit, upper_limit)

# Take explicit copies: later cells add feature columns to X, and assigning
# into a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
X = X[mask].copy()
y = y[mask].copy()

In [None]:
# Preprocess engine column: Extract horsepower as a numerical feature
def extract_horsepower(engine_str):
    """Extract the horsepower figure from a free-text engine description.

    Parameters
    ----------
    engine_str : str or missing value
        Engine description such as "172.0HP 1.6L 4 Cylinder Engine".

    Returns
    -------
    float
        The number immediately preceding "HP", or NaN when the input is not
        a string (e.g. a missing value) or contains no horsepower figure.
    """
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(engine_str, str):
        return np.nan
    # The fractional part is optional so "300HP" parses as well as "300.0HP".
    match = re.search(r"(\d+(?:\.\d+)?)HP", engine_str)
    return float(match.group(1)) if match else np.nan

# Derive numeric horsepower and a coarse "<displacement>L <n> Cylinder" label
# from the free-text engine description.
X['horsepower'] = X['engine'].apply(extract_horsepower)
X['engine_type'] = X['engine'].str.extract(r"(\d+\.\d+L \d+ Cylinder)")[0]

# Simplify accident and clean_title to 0/1 flags.
# NOTE(review): a missing `accident` value compares unequal to
# 'None reported' and is therefore flagged as 1 - confirm this is intended.
X['accident'] = (X['accident'] != 'None reported').astype(int)
X['clean_title'] = (X['clean_title'] == 'Yes').astype(int)

# Add Feature: Car Age
# NOTE(review): the reference year is the wall clock, so these features
# shift when the notebook (or inference code) runs in a later year.
current_year = datetime.now().year
X['car_age'] = current_year - X['model_year']

# Add Feature: Mileage Per Year (a car_age of 0 yields +/-inf, mapped to NaN
# so the numerical imputer can fill it later)
X['mileage_per_year'] = X['milage'] / X['car_age']
X['mileage_per_year'] = X['mileage_per_year'].replace([np.inf, -np.inf], np.nan)

# Add Interaction Feature: Milage * Car Age
X['milage_car_age_interaction'] = X['milage'] * X['car_age']

# Add Log-Transformed Features for Skewed Data
X['log_milage'] = np.log1p(X['milage'])
X['log_mileage_per_year'] = np.log1p(X['mileage_per_year'])

# Add Feature: Number of Cylinders (parsed back out of engine_type)
X['cylinders'] = X['engine_type'].str.extract(r'(\d+) Cylinder').astype(float)

# Simplify Categorical Features: flag a small set of luxury brands
luxury_brands = ['Porsche', 'Lamborghini', 'Bentley']
X['brand_luxury'] = X['brand'].isin(luxury_brands).astype(int)

# Encode Brand Using Target Encoding
from category_encoders import TargetEncoder

# Fit the encoder on the training rows only. Fitting on every row would leak
# validation prices into `brand_encoded` and make the validation metrics
# optimistic. train_test_split picks rows purely from the sample count and
# the seed, so splitting positions here with the SAME test_size/random_state
# as the later train/validation split selects exactly the same training rows.
# Keep these two calls in sync.
train_pos, holdout_pos = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
brand_encoder = TargetEncoder()
brand_encoder.fit(X['brand'].iloc[train_pos], y.iloc[train_pos])
# transform() returns a one-column frame; take that column as a Series.
X['brand_encoded'] = brand_encoder.transform(X['brand']).iloc[:, 0]

# Drop redundant columns: the raw `engine` and `brand` text is now represented
# by the derived features above
X = X.drop(columns=['engine', 'brand'])

# Define categorical and numerical columns
categorical_features = ['model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'engine_type', 'brand_luxury']
numerical_features = [
    'model_year', 'milage', 'horsepower', 'accident', 'clean_title',
    'car_age', 'mileage_per_year', 'cylinders',
    'milage_car_age_interaction', 'log_milage', 'log_mileage_per_year', 'brand_encoded'
]

In [None]:
# Missing-value handling: mode for categorical columns, median for numeric ones
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='median')

# Numeric branch: impute, then min-max scale
numerical_transformer = Pipeline([
    ('imputer', numerical_imputer),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: impute, then one-hot encode; categories not seen during
# fit are ignored at transform time instead of raising
categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Random-forest hyperparameters found by a randomized search
best_params = {
    'n_estimators': 298,
    'max_depth': 18,
    'max_features': 0.39388669192525183,
    'min_samples_split': 5,
    'min_samples_leaf': 7,
    'random_state': 42,
}
optimized_model = RandomForestRegressor(**best_params)

# Chain preprocessing and the regressor so both are fit and applied together
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', optimized_model),
])

In [None]:
# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit preprocessing and the forest together, then predict the held-out rows
final_pipeline.fit(X_train, y_train)
y_pred_val = final_pipeline.predict(X_val)

# Validation error metrics
mae = mean_absolute_error(y_val, y_pred_val)
mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse)

print(f"Optimized Model with Enhanced Features - MAE: {mae}, MSE: {mse}, RMSE: {rmse}")

# Pair each post-preprocessing feature name with the forest's importance score
fitted_steps = final_pipeline.named_steps
feature_names = fitted_steps['preprocessor'].get_feature_names_out()
feature_importances = fitted_steps['regressor'].feature_importances_

# Tabulate and rank, most important first
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)


In [None]:
# Show the first ten rows of the importance table
print(importance_df.iloc[:10])

In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + regressor) to disk
pipeline_path = 'F://Nedlastinger/cardata/final_modelv2.pkl'
joblib.dump(final_pipeline, pipeline_path)

In [None]:
# Persist the fitted brand TargetEncoder to disk
encoder_path = 'F://Nedlastinger/cardata/brand_encoder.pkl'
joblib.dump(brand_encoder, encoder_path)