In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import optuna
import joblib

# Load the dataset
data = pd.read_csv('filtered_data.csv')

# Preprocessing
X = data.drop('Price', axis=1)
y = data['Price']
categorical_features = X.select_dtypes(include=['object']).columns

# Define a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough'
)

# Splitting the data into training and test sets for final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 2000),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
    }

    model = XGBRegressor(**param)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='r2', n_jobs=-1)
    r2_mean = np.mean(scores)
    return -r2_mean  # Return the negative R^2 to maximize it

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1)  # You can adjust the number of trials

# Best hyperparameters
print(f"Best hyperparameters: {study.best_trial.params}")

# Train the model with the best parameters
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', best_model)])

best_pipeline.fit(X_train, y_train)

# Predict and evaluate on the test set
y_pred = best_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
n = len(y_test)
p = X_test.shape[1]
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# Performance Metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")
print(f"Adjusted R^2: {adjusted_r2}")


[I 2024-04-12 20:55:06,052] A new study created in memory with name: no-name-3302fc05-b09c-43d9-addb-bdfbcbfc3104
[I 2024-04-12 20:55:07,000] Trial 0 finished with value: -0.2523047175646897 and parameters: {'n_estimators': 544, 'max_depth': 15, 'learning_rate': 0.18097690090772048, 'min_child_weight': 20, 'subsample': 0.8405345444512003, 'colsample_bytree': 0.33181646191820147}. Best is trial 0 with value: -0.2523047175646897.


Best hyperparameters: {'n_estimators': 544, 'max_depth': 15, 'learning_rate': 0.18097690090772048, 'min_child_weight': 20, 'subsample': 0.8405345444512003, 'colsample_bytree': 0.33181646191820147}
RMSE: 324.9420249959768
MAE: 211.52565375434028
R^2: 0.5528294816380489
Adjusted R^2: 0.3441499064024717


### Save model

In [2]:
# import joblib
# best_pipeline.fit(X_train, y_train)
# # Save the trained model to disk
# joblib.dump(best_pipeline, 'best_model.joblib')

### Load model

In [27]:
import joblib
loaded_model = joblib.load('best_model.joblib')
# Re-create the pipeline with the loaded model
loaded_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', loaded_model.named_steps['model'])])

In [29]:
def get_feature_names(column_transformer, input_features):
    # This will hold the final names of the features
    output_features = []

    # Go through all the transformers in the ColumnTransformer
    for transformer_name, transformer, feature_indices in column_transformer.transformers_:
        if transformer_name == 'remainder':
            # Handle the remainder case (those features that were passed through)
            remainder_features = [input_features[i] for i in feature_indices]
            output_features.extend(remainder_features)
        else:
            # Process transformed features
            if hasattr(transformer, 'get_feature_names_out'):
                feature_names = transformer.get_feature_names_out()
                output_features.extend(feature_names)
            else:
                # If no method to get feature names, use the input feature names directly
                output_features.extend(feature_indices)

    return output_features

# Assuming X contains all the original features
input_features = X.columns.tolist()

# Fetch feature names from the preprocessor using the revised function
feature_names = get_feature_names(preprocessor, input_features)

# Assuming best_pipeline is already fitted
if hasattr(best_pipeline.named_steps['model'], 'feature_importances_'):
    importances = best_pipeline.named_steps['model'].feature_importances_
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})
    print(feature_importances.sort_values(by='importance', ascending=False))
else:
    print("The model is not fitted or does not support feature importance extraction.")


                           feature  importance
4              Number of Bathrooms    0.298964
2                            Rooms    0.219700
7                      Size (sqft)    0.078517
1              Property Type_House    0.071104
6                        Longitude    0.054869
0          Property Type_Apartment    0.044569
12  Time to Nearest Police Station    0.039564
13           Time to Nearest Store    0.035461
14        Time to Nearest Pharmacy    0.032957
11        Time to Nearest Hospital    0.030542
8                       Walk Score    0.026786
5                         Latitude    0.025818
10                      Bike Score    0.021546
9                    Transit Score    0.019603
3                     Den Included    0.000000


In [None]:
from sklearn.model_selection import train_test_split

# Assuming X, y, and the best_pipeline? have already been defined as shown in previous steps.

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-train the best model found by Optuna on the training set
best_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_pipeline.predict(X_test)

# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Calculate Adjusted R-squared
n = len(y_test)  # Number of observations
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Print performance metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")