In [1]:
"hello"

'hello'

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
import xgboost as xgb
from scipy.stats import randint, uniform

# Load the dataset
# Read the raw CSV, drop the id / int_col / clean_title columns, and discard
# every row that still contains a missing value. The steps are chained instead
# of using `inplace=True`, so re-running the cell always starts from the file.
dataset = (
    pd.read_csv("train.csv")
    .drop(columns=["id", "int_col", "clean_title"])
    .dropna()
)

# Drop rows with invalid fuel types ("-" or the en dash "–").
# `.copy()` makes the filtered frame independent of the one it was sliced
# from, so the column assignments below write into `dataset` itself.
dataset = dataset[~dataset["fuel_type"].isin(["-", "–"])].copy()

# Replace accident labels with binary values.
# The result is assigned back to the column: calling `.replace(..., inplace=True)`
# on `dataset['accident']` is a chained in-place update, which pandas deprecates
# and which leaves `dataset` unchanged once Copy-on-Write is active.
accident_codes = {"None reported": 0, "At least 1 accident or damage reported": 1}
dataset["accident"] = dataset["accident"].replace(accident_codes)

# Encode categorical variables as integer codes.
# NOTE: `fit_transform` refits the single encoder on every column, so after the
# loop `Lb` only remembers the classes of the last column in `label_cols`.
label_cols = ["brand", "model", "fuel_type", "transmission", "ext_col", "engine"]
Lb = LabelEncoder()
for col in label_cols:
    dataset[col] = Lb.fit_transform(dataset[col])

# Log transformation of skewed features
dataset["price"] = np.log1p(dataset["price"])  # log1p to handle zero values
dataset["milage"] = np.log1p(dataset["milage"])  # Apply to any other numeric column as needed

# Define features and target variable.
# Y is log1p(price), so every downstream metric is measured on the log scale.
X = dataset.drop(columns="price")
Y = dataset["price"]

# Add polynomial features
# Degree-2 expansion without the constant column: the original features, their
# squares and all pairwise products.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Train-Test Split
# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean / standard deviation (fitting on the full matrix leaks test-set
# statistics into training). The same random_state and sample count give the
# same row partition as splitting the scaled matrix would.
X_train_poly, X_test_poly, Y_train, Y_test = train_test_split(
    X_poly, Y, test_size=0.2, random_state=42
)

# Scale features: statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_poly)
X_test = scaler.transform(X_test_poly)

# Full matrix scaled with the training statistics; kept so that any code still
# referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X_poly)

# Choose model type (XGBoost is typically faster)
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning with a reduced search space for faster results.
# scipy.stats conventions used below:
#   randint(low, high)  -> integers in [low, high)   (upper bound excluded)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
param_dist = {
    'n_estimators': randint(100, 300),    # 100 .. 299 boosting rounds
    'learning_rate': uniform(0.05, 0.2),  # 0.05 .. 0.25
    'max_depth': randint(3, 8),           # tree depth 3 .. 7
}

# RandomizedSearchCV with fewer iterations and fewer folds (cv=3):
# 30 sampled candidates x 3 folds = 90 fits, scored by R².
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=30,        # Reduced to 30 iterations
    cv=3,             # Reduced cross-validation folds
    scoring='r2',
    refit=True,       # explicit: refit the winning candidate on all of X_train
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, Y_train)

# Best model from RandomizedSearch.
# With refit=True the search has already retrained `best_estimator_` on the
# complete training set using the best parameters, so fitting it again here
# would only repeat that work.
best_model = random_search.best_estimator_

# Predict on both partitions with the tuned model.
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)

# Evaluate the model.
# Y is log1p(price), so R² and MSE are measured on the log scale.
# A train/test R² gap indicates how much the model overfits.
headline_metrics = [
    ("Training R² score:", r2_score(Y_train, Y_train_pred)),
    ("Testing R² score:", r2_score(Y_test, Y_test_pred)),
    ("Testing MSE:", mean_squared_error(Y_test, Y_test_pred)),
]
for metric_label, metric_value in headline_metrics:
    print(metric_label, metric_value)

# Optional: 3-fold cross-validated R² of the tuned model on the training rows.
scores = cross_val_score(best_model, X_train, Y_train, cv=3, scoring='r2')
print("Cross-validation R² scores:", scores)
print("Average R² score:", scores.mean())

# Feature Importance
# Pair each expanded (polynomial) feature name with the importance the fitted
# model assigned to it, most important first.
importances = best_model.feature_importances_
expanded_feature_names = poly.get_feature_names_out(X.columns)
feature_importance_df = pd.DataFrame(
    {'Feature': expanded_feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Training R² score: 0.6918666337495116
Testing R² score: 0.6624203619991109
Testing MSE: 0.24195708857877843
Cross-validation R² scores: [0.66241011 0.66196928 0.65965049]
Average R² score: 0.6613432950710405
                    Feature  Importance
3                    milage    0.406492
2                model_year    0.210697
29        model_year engine    0.093166
5                    engine    0.033560
34         milage fuel_type    0.019548
43       fuel_type accident    0.019331
40         fuel_type engine    0.012492
27        model_year milage    0.012060
21          model fuel_type    0.010640
14             brand engine    0.009994
13          brand fuel_type    0.009987
50    transmission accident    0.009794
6              transmission    0.007376
10              brand model    0.007344
0                     brand    0.007239
41   fuel_type transmission    0.007033
38          milage accident    0.006865
22         