# In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Listing the data available
import os
folder = 'bluebook-for-bulldozers'
for item in os.listdir(folder):
    print(item)

# Loading TRAINING data separately (NOT combined TrainAndValid.csv).
# Every CSV path is built from `folder` so the dataset location is set once.
df_train = pd.read_csv(os.path.join(folder, "Train.csv"), low_memory=False, parse_dates=['saledate'])
print(f"Training data shape: {df_train.shape}")
df_train.head(10)  # bare expression: only rendered when run as a notebook cell

# Loading VALIDATION data separately and merging with validation solutions
df_valid_features = pd.read_csv(os.path.join(folder, "Valid.csv"), low_memory=False, parse_dates=['saledate'])
df_valid_targets = pd.read_csv(os.path.join(folder, "ValidSolution.csv"))

# Merge validation features with targets (left join keeps every feature row)
df_valid = df_valid_features.merge(df_valid_targets[['SalesID', 'SalePrice']], on='SalesID', how='left')

# A left join leaves SalePrice as NaN for any SalesID that is absent from the
# solution file. Report that here rather than letting it pass unnoticed.
unmatched_valid_targets = int(df_valid['SalePrice'].isna().sum())
if unmatched_valid_targets:
    print(f"[WARN] {unmatched_valid_targets} validation rows have no SalePrice after the merge")

print(f"Validation data shape: {df_valid.shape}")
print(f"Validation features shape: {df_valid_features.shape}")
print(f"Validation targets shape: {df_valid_targets.shape}")
df_valid.head(10)

# Loading TEST data
df_test = pd.read_csv(os.path.join(folder, "Test.csv"), low_memory=False, parse_dates=['saledate'])
print(f"Test data shape: {df_test.shape}")
df_test.head(10)

# Checking the date ranges to confirm proper separation
print("Training data date range:")
print(f"From: {df_train['saledate'].min()} To: {df_train['saledate'].max()}")
print("\nValidation data date range:")
print(f"From: {df_valid['saledate'].min()} To: {df_valid['saledate'].max()}")
print("\nTest data date range:")
print(f"From: {df_test['saledate'].min()} To: {df_test['saledate'].max()}")

# Visualizing the training data sales over time
# (taken before sorting, so these are the first 1000 rows in file order)
fig, ax = plt.subplots()
ax.scatter(x=df_train["saledate"][:1000], # visualize the first 1000 values
           y=df_train["SalePrice"][:1000])
ax.set_xlabel("Sale Date")
ax.set_ylabel("Sale Price ($)")
ax.set_title("Training Data: Sale Price vs Date");

# Sorting training data by date (in place, so later steps see chronological rows)
df_train.sort_values(by='saledate', ascending=True, inplace=True)
print("Training data sorted by date")
df_train.head(10)

# Sorting validation data by date
df_valid.sort_values(by='saledate', ascending=True, inplace=True)
print("Validation data sorted by date")
df_valid.head(10)

# Sorting test data by date
df_test.sort_values(by='saledate', ascending=True, inplace=True)
print("Test data sorted by date")
df_test.head(10)

# Function to add date features to any dataframe
def add_datepart_features(df):
    """Return a new frame with ``saledate`` expanded into calendar columns.

    The ``saledate`` column is read through its ``.dt`` accessor to produce
    ``saleYear``, ``saleMonth``, ``saleDay``, ``saleDayofweek`` and
    ``saleDayofyear``. ``saledate`` itself is then removed from the result.
    The frame passed in is not modified.
    """
    sale_dt = df["saledate"].dt
    enriched = df.assign(
        saleYear=sale_dt.year,
        saleMonth=sale_dt.month,
        saleDay=sale_dt.day,
        saleDayofweek=sale_dt.dayofweek,
        saleDayofyear=sale_dt.dayofyear,
    )
    # The raw timestamp is dropped once its parts have been extracted.
    return enriched.drop(columns="saledate")

# Expand `saledate` into calendar features for each split and preview them.
date_feature_columns = ["saleYear", "saleMonth", "saleDay", "saleDayofweek", "saleDayofyear"]

# Training split
df_train_processed = add_datepart_features(df_train)
print("Training data with date features:")
df_train_processed[["SalePrice"] + date_feature_columns].head()

# Validation split
df_valid_processed = add_datepart_features(df_valid)
print("Validation data with date features:")
df_valid_processed[["SalePrice"] + date_feature_columns].head()

# Test split (no SalePrice column is previewed for this split)
df_test_processed = add_datepart_features(df_test)
print("Test data with date features:")
df_test_processed[date_feature_columns].head()

# Median training sale price per calendar month
median_price_by_month = df_train_processed.groupby(["saleMonth"])["SalePrice"].median()
median_price_by_month.plot()
plt.xlabel("Month")
plt.ylabel("Median Sale Price ($)")
plt.title("Training Data: Median Sale Price by Month");

# Column dtypes and non-null counts of the training set
print("Training data info:")
df_train_processed.info()

# Missing-value count per training column, largest first
print("Missing values in training data:")
missing_per_column = df_train_processed.isna().sum()
print(missing_per_column.sort_values(ascending=False))

# Function to analyze data types
def analyze_data_types(df, dataset_name):
    """Print a dtype report for ``df``: object columns first, then numeric ones.

    Each reported column is shown with its dtype name, one randomly sampled
    value, and the dtype pandas infers for that sample. The number of object
    columns is printed between the two sections. Nothing is returned.
    """
    type_checks = pd.api.types

    def report_column(label, content):
        # One sampled value per column; the draw is random on every call.
        sampled = content.sample(1).values
        dtype_name = df[label].dtype.name
        inferred = type_checks.infer_dtype(sampled)
        print(f"Column name: {label} | Column dtype: {dtype_name} | Example value: {sampled} | Example value dtype: {inferred}")

    print(f"\n= {dataset_name} Data Type Analysis =")

    # Section 1: object-typed columns, counted as they are reported
    print("\nObject type columns:")
    object_column_total = 0
    for label, content in df.items():
        if not type_checks.is_object_dtype(content):
            continue
        report_column(label, content)
        object_column_total += 1

    print(f"\n[INFO] Total number of object type columns: {object_column_total}")

    # Section 2: numeric columns
    print("\nNumeric type columns:")
    for label, content in df.items():
        if not type_checks.is_numeric_dtype(content):
            continue
        report_column(label, content)

# Print the dtype report for the processed training frame
analyze_data_types(df=df_train_processed, dataset_name="Training")

# Function to convert object columns to categories
def convert_strings_to_categories(df):
    """Return a copy of ``df`` whose object-dtype columns are ``category`` dtype.

    Columns of any other dtype are carried over unchanged, and the input
    frame is not modified.
    """
    converted = df.copy()
    object_labels = [
        label
        for label, content in converted.items()
        if pd.api.types.is_object_dtype(content)
    ]
    for label in object_labels:
        converted[label] = converted[label].astype('category')
    return converted

# Turn the object columns of every split into pandas categories
df_train_processed = convert_strings_to_categories(df_train_processed)
df_valid_processed = convert_strings_to_categories(df_valid_processed)
df_test_processed = convert_strings_to_categories(df_test_processed)

# Show the resulting schema of each split, in train / validation / test order
for split_name, split_frame in (
    ("Training", df_train_processed),
    ("Validation", df_valid_processed),
    ("Test", df_test_processed),
):
    print(f"{split_name} data after converting strings to categories:")
    split_frame.info()

# Function to handle missing values for numeric columns
def fill_missing_numeric(df):
    """Impute numeric gaps with each column's own median and flag them.

    A report line is printed for every numeric column stating whether it has
    missing values. Each numeric column that does gets a companion
    ``<column>_is_missing`` 0/1 indicator, and its gaps are replaced by that
    column's median. Works on a copy and returns it.
    """
    filled = df.copy()
    is_numeric = pd.api.types.is_numeric_dtype

    # Pass 1: report the missing-value status of every numeric column
    print("Numeric columns with missing values:")
    for label, content in filled.items():
        if not is_numeric(content):
            continue
        n_missing = content.isna().sum()
        if n_missing:
            print(f"Column name: {label} | Has missing values: {True} | Count: {n_missing}")
        else:
            print(f"Column name: {label} | Has missing values: {False}")

    # Pass 2: add the indicator column, then fill with the median
    for label, content in filled.items():
        if not is_numeric(content):
            continue
        null_mask = content.isna()
        if null_mask.sum():
            filled[label + "_is_missing"] = null_mask.astype(int)
            filled[label] = content.fillna(content.median())

    return filled

# Impute the numeric gaps of the training split with its own medians
df_train_processed = fill_missing_numeric(df=df_train_processed)
print("\nTraining data after filling missing numeric values")

# Fill missing values in validation data using training data medians
def fill_missing_numeric_with_train_stats(df_valid, df_train):
    """Impute numeric gaps in ``df_valid`` using medians taken from ``df_train``.

    For every numeric column of ``df_valid`` that has missing values, a 0/1
    ``<column>_is_missing`` indicator is added and the gaps are filled with
    the median of the same column in ``df_train``.

    If ``df_train`` has no numeric column of that name, or its median is NaN,
    the median of the ``df_valid`` column is used instead. Previously this
    case raised ``KeyError`` or ``TypeError``.

    Parameters:
        df_valid: DataFrame to impute (validation or test); not modified.
        df_train: DataFrame supplying the reference medians.
    Returns:
        A filled copy of ``df_valid``.
    """
    df_copy = df_valid.copy()
    is_numeric = pd.api.types.is_numeric_dtype

    # Iterate over a snapshot of the labels, because indicator columns are
    # appended to the frame inside the loop.
    for label in list(df_copy.columns):
        content = df_copy[label]
        if not is_numeric(content):
            continue
        null_mask = content.isna()
        if not null_mask.any():
            continue

        df_copy[label + "_is_missing"] = null_mask.astype(int)

        # Prefer the training statistic so held-out data is imputed the same
        # way as the training data.
        fill_value = float("nan")
        if label in df_train.columns and is_numeric(df_train[label]):
            fill_value = df_train[label].median()
        if pd.isna(fill_value):
            # No usable training median: fall back to this frame's own median.
            fill_value = content.median()

        df_copy[label] = content.fillna(fill_value)

    return df_copy

# Impute the validation split with medians taken from the training frame
df_valid_processed = fill_missing_numeric_with_train_stats(
    df_valid=df_valid_processed,
    df_train=df_train_processed,
)
print("Validation data after filling missing numeric values with training stats")

# Impute the test split the same way (the function's first parameter is named
# df_valid but it accepts any frame)
df_test_processed = fill_missing_numeric_with_train_stats(
    df_valid=df_test_processed,
    df_train=df_train_processed,
)
print("Test data after filling missing numeric values with training stats")

# Function to handle categorical columns and convert to numeric
def handle_categorical_columns(df_train, df_valid, df_test):
    """
    Encode every non-numeric training column as integer codes and apply the
    same category-to-code mapping to the validation and test frames.

    Codes are the pandas categorical codes shifted by one, so known
    categories start at 1. pandas uses code -1 for values that are missing
    or not among the training categories, which becomes 0 after the shift.
    Columns are selected from ``df_train``; a validation/test column is only
    encoded when it carries the same label. None of the inputs are modified.

    Parameters:
        df_train: DataFrame for training
        df_valid: DataFrame for validation
        df_test: DataFrame for testing
    Returns:
        Tuple: ((train, valid, test) encoded copies,
                {column: {code: category}} built from the training data)
    """
    encoded_train = df_train.copy()
    encoded_valid = df_valid.copy()
    encoded_test = df_test.copy()
    code_lookup = {}

    # (frame to read from, copy that receives the codes) for the held-out splits
    held_out_pairs = ((df_valid, encoded_valid), (df_test, encoded_test))

    for label, content in df_train.items():
        if pd.api.types.is_numeric_dtype(content):
            continue

        # The training data defines the category set and the code of each category.
        fitted = pd.Categorical(content)
        encoded_train[label] = fitted.codes + 1
        code_lookup[label] = dict(enumerate(fitted.categories, start=1))

        for source, target in held_out_pairs:
            if label not in source.columns:
                continue
            aligned = pd.Categorical(source[label], categories=fitted.categories)
            # Shifted codes are already integers with no NaN, so only the
            # widening to the platform int remains to be done.
            target[label] = (aligned.codes + 1).astype(int)

    return (encoded_train, encoded_valid, encoded_test), code_lookup

# Process all datasets with consistent categorical encoding.
# The encoder is run once: `processed_data` keeps the (train, valid, test)
# tuple and is unpacked into the three final frames, instead of calling
# handle_categorical_columns a second time with the same inputs.
processed_data, category_mappings = handle_categorical_columns(
    df_train_processed, df_valid_processed, df_test_processed
)
df_train_final, df_valid_final, df_test_final = processed_data
print("All datasets processed with consistent categorical encoding")

# Verify no missing values remain (total NaN count over every column)
print("Final check for missing values:")
print(f"Training data missing values: {df_train_final.isna().sum().sum()}")
print(f"Validation data missing values: {df_valid_final.isna().sum().sum()}")
print(f"Test data missing values: {df_test_final.isna().sum().sum()}")

# Importing machine learning libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, cross_val_score
import numpy as np

# Create features for the labelled datasets (everything except the target)
X_train = df_train_final.drop("SalePrice", axis=1)
X_valid = df_valid_final.drop("SalePrice", axis=1)

# Create log-transformed targets for RMSLE optimization
y_train = np.log1p(df_train_final["SalePrice"])  # log1p = log(1 + x) handles zeros safely
y_valid = np.log1p(df_valid_final["SalePrice"])

print(f"Training features shape: {X_train.shape}")
print(f"Training targets shape (log-transformed): {y_train.shape}")
print(f"Validation features shape: {X_valid.shape}")
print(f"Validation targets shape (log-transformed): {y_valid.shape}")

# Align validation and test features with the training columns in one step:
# reindex adds any training column that is absent (filled with 0), drops
# columns the model was not trained on, and applies the training column order.
# It returns new frames, so df_test_final is no longer modified through an
# X_test alias.
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)
X_test = df_test_final.reindex(columns=X_train.columns, fill_value=0)
print(f"Test features shape: {X_test.shape}")

# Function to calculate RMSLE optimized for log-transformed targets
def rmsle_optimized(y_true_log, y_pred_log):
    """Return the root mean squared error between two log-scale arrays.

    Both arguments are expected to be log-transformed already (this script
    uses ``np.log1p``), so the result corresponds to RMSLE on the original
    scale.
    """
    mean_squared = mean_squared_error(y_true_log, y_pred_log)
    return np.sqrt(mean_squared)

# Baseline: a default random forest fitted on the log-transformed targets
print("Training initial model on log-transformed targets...")
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
print("Initial model training completed!")

# Training-set predictions, in log space and back on the price scale
y_train_pred_log = model.predict(X_train)
y_train_pred_original = np.expm1(y_train_pred_log)
y_train_original = np.expm1(y_train)

# RMSLE is computed in log space, MAE on the original scale
train_rmsle = rmsle_optimized(y_train, y_train_pred_log)
train_mae = mean_absolute_error(y_train_original, y_train_pred_original)

print(f"Training RMSLE: {train_rmsle:.4f}")
print(f"Training MAE: ${train_mae:,.2f}")

# Same evaluation on the validation split
y_valid_pred_log = model.predict(X_valid)
y_valid_pred_original = np.expm1(y_valid_pred_log)
y_valid_original = np.expm1(y_valid)

valid_rmsle = rmsle_optimized(y_valid, y_valid_pred_log)
valid_mae = mean_absolute_error(y_valid_original, y_valid_pred_original)

print(f"Validation RMSLE: {valid_rmsle:.4f}")
print(f"Validation MAE: ${valid_mae:,.2f}")

# Candidate hyperparameter values sampled by the randomized search
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Time-ordered folds
tscv = TimeSeriesSplit(n_splits=3)
print("Created TimeSeriesSplit with 3 splits for proper temporal validation")

# Randomized search: 25 sampled configurations, scored by negative MSE on the
# log-transformed target, evaluated on the TimeSeriesSplit folds
search_estimator = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_grid,
    n_iter=25,
    scoring='neg_mean_squared_error',
    cv=tscv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

print("Starting hyperparameter tuning with TimeSeriesSplit (no data leakage)...")
rf_random.fit(X_train, y_train)
print("Hyperparameter tuning completed!")

# Best configuration and its cross-validated score; best_score_ is a negative
# MSE, so it is negated before taking the square root
best_model = rf_random.best_estimator_
best_cv_rmse = np.sqrt(-rf_random.best_score_)
print(f"Best parameters: {rf_random.best_params_}")
print(f"Best cross-validation RMSE (log scale): {best_cv_rmse:.4f}")

# Score the best model on the validation split: RMSLE in log space, MAE on
# the price scale
y_valid_pred_best_log = best_model.predict(X_valid)
y_valid_pred_best_original = np.expm1(y_valid_pred_best_log)

valid_rmsle_best = rmsle_optimized(y_valid, y_valid_pred_best_log)
valid_mae_best = mean_absolute_error(y_valid_original, y_valid_pred_best_original)

print(f"Best Model Validation RMSLE: {valid_rmsle_best:.4f}")
print(f"Best Model Validation MAE: ${valid_mae_best:,.2f}")

# Combine training and validation data for final model training
# (training rows first, validation rows appended after them)
print("Combining training and validation data for final model...")
X_combined = pd.concat([X_train, X_valid], axis=0, ignore_index=True)
y_combined = pd.concat([
    pd.Series(y_train, name='SalePrice_log'),
    pd.Series(y_valid, name='SalePrice_log')
], axis=0, ignore_index=True)

print(f"Combined dataset shape: {X_combined.shape}")
print(f"Combined targets shape: {y_combined.shape}")

# Create final model with best hyperparameters
final_model = RandomForestRegressor(
    **rf_random.best_params_,
    random_state=42,
    n_jobs=-1
)

# Train final model on all available data
print("Training final model on combined training + validation data...")
final_model.fit(X_combined, y_combined)
print("Final model training completed!")

# Estimate final model performance using cross-validation on combined data
print("Estimating final model performance with cross-validation...")
cv_scores = cross_val_score(
    final_model,
    X_combined,
    y_combined,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

# cv_scores holds negative MSE per fold. Convert each fold to an RMSE first,
# so the reported spread is the standard deviation of the per-fold RMSE
# values rather than the square root of the spread of the MSE values.
cv_fold_rmse = np.sqrt(-cv_scores)
final_cv_rmse = np.sqrt(-cv_scores.mean())
final_cv_std = cv_fold_rmse.std()
print(f"Final Model CV RMSLE: {final_cv_rmse:.4f} (+/- {final_cv_std:.4f})")

# Make final predictions on test set
print("Making final predictions on test set...")
y_test_pred_log = final_model.predict(X_test)

# Convert predictions back to original scale for submission
y_test_pred_final = np.expm1(y_test_pred_log)

print("Test predictions completed!")
print(f"Test predictions shape: {y_test_pred_final.shape}")
print(f"Sample test predictions: {y_test_pred_final[:10]}")

# Create optimized submission file.
# SalesID is read from X_test, the frame the predictions were computed from,
# so each ID is paired with its own prediction and nothing depends on df_test
# still being in the same row order.
submission_final = pd.DataFrame({
    'SalesID': X_test['SalesID'].to_numpy(),
    'SalePrice': y_test_pred_final
})

submission_final.to_csv('bulldozer_price_predictions_optimized.csv', index=False)
print("Optimized submission file created: bulldozer_price_predictions_optimized.csv")

# Rank the features of the final model by importance, highest first
importance_table = pd.DataFrame({
    'feature': X_combined.columns,
    'importance': final_model.feature_importances_,
})
feature_importance_final = importance_table.sort_values('importance', ascending=False)
top_features = feature_importance_final.head(20)

print("Top 20 most important features from final optimized model:")
print(top_features)

# Horizontal bar chart of the top 20, most important at the top
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(top_features['feature'], top_features['importance'])
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Top 20 Feature Importances - Final Optimized Model')
importance_ax.invert_yaxis()
importance_fig.tight_layout()
plt.show()

# Closing summary of the steps performed by this script
summary_lines = (
    "\n=== OPTIMIZATION SUMMARY ===",
    "✅ Log transformation applied for RMSLE optimization",
    "✅ TimeSeriesSplit used to prevent data leakage",
    "✅ Final model trained on combined training + validation data",
    "✅ All predictions properly converted from log space to original scale",
    "✅ Model optimized for competition metric (RMSLE)",
)
for summary_line in summary_lines:
    print(summary_line)