In [1]:
# Cell 1: Importing Necessary Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
import joblib
import optuna

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Cell 2: Loading the Dataset
# Both CSV files are read relative to the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Cell 3: Previewing the Training Dataset
# Heading and the first three rows, one per line.
print("Training Dataset Preview:", train.head(3), sep="\n")

# Cell 4: Dataset Information
# DataFrame.info() writes its report itself, so only the heading is printed here.
print("Training Dataset Info:")
train.info()

# Cell 5: Statistical Summary
# Heading followed by the describe() table.
print("Training Dataset Statistical Summary:", train.describe(), sep="\n")

# Cell 6: Data Preprocessing and Feature Engineering
def preprocess_data(df, is_train=True, reference_year=2025):
    """Select the model's input columns, impute gaps and add derived features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing records. Must contain the eleven base feature columns
        listed below; ``SalePrice`` is carried through whenever it is present.
    is_train : bool, default True
        Accepted for backward compatibility. It currently has no effect:
        whether the target column is kept depends only on ``df``'s columns.
    reference_year : int, default 2025
        Year that ``HouseAge`` is measured from.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the selected columns
        plus ``TotalBath``, ``HouseAge`` and ``QualityArea``.

    Raises
    ------
    KeyError
        If any base feature column is absent from ``df``.

    Notes
    -----
    Imputation statistics (medians, mode) are computed from ``df`` itself.
    For a frame where a column is entirely missing there is nothing to
    impute from, so the gaps are left as NaN instead of raising.
    """
    base_features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'Neighborhood',
                     'OverallQual', 'YearBuilt', 'LotArea', 'GarageCars', 'Fireplaces',
                     'KitchenQual']

    # Keep the target (as the last selected column) only when the frame has it.
    features = list(base_features)
    if 'SalePrice' in df.columns:
        features.append('SalePrice')

    df = df[features].copy()

    # Handle missing values
    median_filled = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'OverallQual',
                     'YearBuilt', 'LotArea', 'GarageCars']
    for column in median_filled:
        df[column] = df[column].fillna(df[column].median())

    # A missing count for these two is treated as "none present".
    for column in ('HalfBath', 'Fireplaces'):
        df[column] = df[column].fillna(0)

    # mode() is empty when every value is missing; indexing it would raise.
    kitchen_modes = df['KitchenQual'].mode()
    if not kitchen_modes.empty:
        df['KitchenQual'] = df['KitchenQual'].fillna(kitchen_modes.iloc[0])

    # Feature engineering
    df['TotalBath'] = df['FullBath'] + 0.5 * df['HalfBath']
    df['HouseAge'] = reference_year - df['YearBuilt']
    df['QualityArea'] = df['GrLivArea'] * df['OverallQual']  # Interaction term

    return df

# Preprocess training data
train_processed = preprocess_data(train)
X = train_processed.drop('SalePrice', axis=1)
# Model the log of the price; predictions are mapped back with np.expm1.
y = np.log1p(train_processed['SalePrice'])

# Split data for validation (80/20, fixed seed so the split is reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing pipeline
numeric_features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'OverallQual',
                    'YearBuilt', 'LotArea', 'GarageCars', 'Fireplaces', 'TotalBath',
                    'HouseAge', 'QualityArea']
categorical_features = ['Neighborhood', 'KitchenQual']

# No category is dropped from the one-hot encoding: with handle_unknown='ignore'
# an unseen category is encoded as all zeros, and dropping the first category
# would give that category the same all-zero code, making the two
# indistinguishable. Older scikit-learn releases also reject `drop` combined
# with handle_unknown='ignore'. The downstream model is tree-based, so the
# redundant column is harmless.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Cell 7: Training the XGBoost Model with Optuna
def objective(trial):
    """Optuna objective: mean 5-fold CV absolute error of one XGBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.

    Returns
    -------
    float
        Mean absolute error across the folds, measured on the log1p-transformed
        target (lower is better).

    Notes
    -----
    Cross-validation runs on ``X_train``/``y_train`` only. The rows in
    ``X_val`` are later used to report hold-out metrics for the tuned model,
    so letting the search see them would make those metrics optimistic.
    """
    params = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10)
    }

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(**params))
    ])

    # Same shuffled folds for every trial so configurations are comparable.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=kf,
                             scoring='neg_mean_absolute_error', n_jobs=-1)
    # The scorer returns negated MAE; flip the sign so the study can minimize it.
    return -scores.mean()

# Run Optuna optimization: 50 trials, minimizing the value returned by `objective`
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best model
best_params = study.best_params
print(f"Best Parameters: {best_params}")

# Train final model with best parameters
final_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', final_regressor),
])
final_model.fit(X_train, y_train)

# Evaluate on validation set.
# The model predicts log1p(price), so predictions and targets are both mapped
# back with expm1 before the metrics are computed.
y_val_pred_log = final_model.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)
y_val_raw = np.expm1(y_val)
mae = mean_absolute_error(y_val_raw, y_val_pred)
r2 = r2_score(y_val_raw, y_val_pred)
print(f"Validation MAE: ${mae:.2f}")
print(f"Validation R²: {r2:.4f}")

# Cell 8: Visualization (Actual vs Predicted)
# End points of the dashed reference line where predicted == actual.
price_span = [y_val_raw.min(), y_val_raw.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_val_raw, y_val_pred, alpha=0.5)
plt.plot(price_span, price_span, 'r--', lw=2)
plt.xlabel('Actual SalePrice')
plt.ylabel('Predicted SalePrice')
plt.title('Actual vs Predicted SalePrice (Validation Set)')
plt.tight_layout()
plt.show()

# Cell 9: Saving the Trained Model
# The whole pipeline (preprocessing + regressor) goes into one compressed file.
joblib.dump(final_model, 'xgboost_pipeline.pkl', compress=3)
print("XGBoost pipeline saved successfully.")

# Cell 10: Loading the Model and Making Predictions
def predict_sale_price(area, bedrooms, full_bath, half_bath, neighborhood, overall_qual,
                      year_built, lot_area, garage_cars, fireplaces, kitchen_qual,
                      model=None, model_path='xgboost_pipeline.pkl'):
    """Predict the sale price of a single house.

    Parameters
    ----------
    area, bedrooms, full_bath, half_bath, neighborhood, overall_qual, \
    year_built, lot_area, garage_cars, fireplaces, kitchen_qual
        Values for the ``GrLivArea``, ``BedroomAbvGr``, ``FullBath``,
        ``HalfBath``, ``Neighborhood``, ``OverallQual``, ``YearBuilt``,
        ``LotArea``, ``GarageCars``, ``Fireplaces`` and ``KitchenQual``
        columns, in that order.
    model : optional
        An already-loaded fitted pipeline exposing ``predict``. Pass it to
        avoid re-reading the file on every call. When ``None`` (default) the
        pipeline is loaded from ``model_path``.
    model_path : str, default 'xgboost_pipeline.pkl'
        File to load the pipeline from when ``model`` is not supplied.

    Returns
    -------
    float
        The prediction after ``np.expm1``, i.e. mapped back from the log1p
        scale the pipeline is trained on.
    """
    # Prepare input data as a one-row frame with the training column names.
    input_data = pd.DataFrame({
        'GrLivArea': [area],
        'BedroomAbvGr': [bedrooms],
        'FullBath': [full_bath],
        'HalfBath': [half_bath],
        'Neighborhood': [neighborhood],
        'OverallQual': [overall_qual],
        'YearBuilt': [year_built],
        'LotArea': [lot_area],
        'GarageCars': [garage_cars],
        'Fireplaces': [fireplaces],
        'KitchenQual': [kitchen_qual]
    })

    # Preprocess input data (adds the engineered columns the pipeline expects).
    input_data = preprocess_data(input_data, is_train=False)

    # Load model
    if model is None:
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only point model_path at files you created yourself.
        model = joblib.load(model_path)

    # Predict
    pred_log = model.predict(input_data)[0]
    return np.expm1(pred_log)

# Example user input
# Parsing and prediction are handled separately so that a ValueError raised
# inside the model pipeline is not misreported as a bad numeric entry.
try:
    area = float(input("Enter the area (e.g., square footage): "))
    bedrooms = int(input("Enter the number of bedrooms: "))
    full_bath = int(input("Enter the number of full bathrooms: "))
    half_bath = int(input("Enter the number of half bathrooms: "))
    neighborhood = input("Enter the neighborhood (e.g., 'NAmes'): ")
    overall_qual = int(input("Enter overall quality (1-10): "))
    year_built = int(input("Enter the year built: "))
    lot_area = float(input("Enter the lot area (sq ft): "))
    garage_cars = int(input("Enter the number of garage cars: "))
    fireplaces = int(input("Enter the number of fireplaces: "))
    kitchen_qual = input("Enter the kitchen quality (e.g., 'TA'): ")
except ValueError:
    # float()/int() rejected one of the numeric answers.
    print("Error: Please enter valid numeric values for numeric fields.")
except Exception as e:
    # e.g. the input stream was closed before every question was answered.
    print(f"Error: {e}")
else:
    # Only reached when every field was read and converted successfully.
    try:
        predicted_price = predict_sale_price(area, bedrooms, full_bath, half_bath, neighborhood,
                                            overall_qual, year_built, lot_area, garage_cars,
                                            fireplaces, kitchen_qual)
        print(f"The predicted sale price is: ${predicted_price:.2f}")
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of a traceback.
        print(f"Error: {e}")

ModuleNotFoundError: No module named 'optuna'