In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
# Load the dataset
df = pd.read_csv('AmesHousing.csv')

# Display shape and first few rows
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (2930, 82)


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [62]:
# Create Total_Bathrooms feature (not in raw dataset, derived from bathroom columns)
# Formula: Full Bath + 0.5 * Half Bath + Bsmt Full Bath + 0.5 * Bsmt Half Bath
df['Total_Bathrooms'] = (df['Full Bath'].fillna(0) +
                        0.5 * df['Half Bath'].fillna(0) +
                        df['Bsmt Full Bath'].fillna(0) +
                        0.5 * df['Bsmt Half Bath'].fillna(0))

# Select features and target
features = ['Gr Liv Area', 'Bedroom AbvGr', 'Total_Bathrooms', 'Overall Qual', 'Year Built']
target = 'SalePrice'
data = df[features + [target]]

# Check SalePrice units (min, max, mean)
print("SalePrice statistics (in dollars):")
print(data['SalePrice'].describe())

# Check missing values
print("\nMissing values in selected columns:")
print(data.isnull().sum())

# Outlier handling: Remove extreme SalePrice and Gr Liv Area (top/bottom 0.5%)
data = data[(data['SalePrice'] >= data['SalePrice'].quantile(0.005)) &
            (data['SalePrice'] <= data['SalePrice'].quantile(0.995)) &
            (data['Gr Liv Area'] >= data['Gr Liv Area'].quantile(0.005)) &
            (data['Gr Liv Area'] <= data['Gr Liv Area'].quantile(0.995))]

# Verify shape after outlier removal
print("Shape after outlier removal:", data.shape)

SalePrice statistics (in dollars):
count      2930.000000
mean     180796.060068
std       79886.692357
min       12789.000000
25%      129500.000000
50%      160000.000000
75%      213500.000000
max      755000.000000
Name: SalePrice, dtype: float64

Missing values in selected columns:
Gr Liv Area        0
Bedroom AbvGr      0
Total_Bathrooms    0
Overall Qual       0
Year Built         0
SalePrice          0
dtype: int64
Shape after outlier removal: (2883, 6)


In [63]:
# Impute missing values for numeric features with median
for col in ['Gr Liv Area', 'Bedroom AbvGr', 'Total_Bathrooms', 'Overall Qual', 'Year Built']:
    if data[col].isnull().sum() > 0:
        data[col] = data[col].fillna(data[col].median())

# Drop rows with missing SalePrice
data = data.dropna(subset=[target])

# Verify no missing values
print("Missing values after cleaning:")
print(data.isnull().sum())

Missing values after cleaning:
Gr Liv Area        0
Bedroom AbvGr      0
Total_Bathrooms    0
Overall Qual       0
Year Built         0
SalePrice          0
dtype: int64


In [64]:
# Define X and y
X = data[features]
y = data[target]

# Split data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print shapes
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (2306, 5) (2306,)
Testing set shape: (577, 5) (577,)


In [65]:
# Initialize models with improved hyperparameters
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=300, max_depth=25, min_samples_split=5, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=7, subsample=0.8, random_state=42),
    'SVR': SVR(kernel='rbf', C=10000, epsilon=0.05)
}

# Train models and store results
results = {}
for name, model in models.items():
    # Use scaled data for SVR and Linear Regression, original for tree-based
    X_tr = X_train_scaled if name in ['SVR', 'Linear Regression'] else X_train
    model.fit(X_tr, y_train)
    print(f"{name} trained.")
    results[name] = model

Linear Regression trained.
Random Forest trained.
XGBoost trained.
SVR trained.


In [66]:
# Evaluate models
eval_results = {'Model': [], 'RMSE': [], 'R²': [], 'CV RMSE': []}

for name, model in models.items():
    # Predict on test set
    X_te = X_test_scaled if name == 'SVR' else X_test
    y_pred = model.predict(X_te)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Cross-validation RMSE
    X_scaled = X_train_scaled if name == 'SVR' else X_train
    cv_scores = -cross_val_score(model, X_scaled, y_train, scoring='neg_root_mean_squared_error', cv=5)
    cv_rmse = cv_scores.mean()

    # Store results
    eval_results['Model'].append(name)
    eval_results['RMSE'].append(rmse)
    eval_results['R²'].append(r2)
    eval_results['CV RMSE'].append(cv_rmse)

    print(f"\n{name}:")
    print(f"RMSE: ${rmse:.2f}")
    print(f"R²: {r2:.4f}")
    print(f"5-Fold CV RMSE: ${cv_rmse:.2f}")

    # Check prediction errors
    errors = np.abs(y_test - y_pred)
    print(f"Mean Absolute Error: ${errors.mean():.2f}")
    print(f"Max Absolute Error: ${errors.max():.2f}")

# Create DataFrame for results
eval_df = pd.DataFrame(eval_results)
print("\nEvaluation Summary:")
print(eval_df)


Linear Regression:
RMSE: $80605113.53
R²: -1244965.6820
5-Fold CV RMSE: $32806.25
Mean Absolute Error: $78999080.41
Max Absolute Error: $139101124.68





Random Forest:
RMSE: $28599.72
R²: 0.8433
5-Fold CV RMSE: $27221.80
Mean Absolute Error: $19177.31
Max Absolute Error: $163904.70

XGBoost:
RMSE: $27546.14
R²: 0.8546
5-Fold CV RMSE: $27178.20
Mean Absolute Error: $18503.38
Max Absolute Error: $152747.70

SVR:
RMSE: $30283.32
R²: 0.8243
5-Fold CV RMSE: $30159.47
Mean Absolute Error: $20475.42
Max Absolute Error: $195861.79

Evaluation Summary:
               Model          RMSE            R²       CV RMSE
0  Linear Regression  8.060511e+07 -1.244966e+06  32806.246757
1      Random Forest  2.859972e+04  8.432686e-01  27221.803813
2            XGBoost  2.754614e+04  8.546034e-01  27178.201562
3                SVR  3.028332e+04  8.242725e-01  30159.468594


In [67]:
# Select 5 representative rows (typical houses)
# Criteria: 1500-2500 sq ft, 2-4 bedrooms, 1-3 bathrooms, non-null values
pred_rows = data[
    (data['Gr Liv Area'].between(1500, 2500)) &
    (data['Bedroom AbvGr'].between(2, 4)) &
    (data['Total_Bathrooms'].between(1, 3))
].sample(5, random_state=42)

# Display selected rows
print("Selected rows for prediction:")
print(pred_rows[['Gr Liv Area', 'Bedroom AbvGr', 'Total_Bathrooms', 'SalePrice']])

Selected rows for prediction:
      Gr Liv Area  Bedroom AbvGr  Total_Bathrooms  SalePrice
136          2207              3              3.0     180000
1240         1570              3              2.0     166800
2337         2320              2              2.0     169000
2144         2000              3              3.0     305900
2902         1838              3              3.0     359900


In [68]:
# Select best model (lowest RMSE)
best_model_name = eval_df.loc[eval_df['RMSE'].idxmin(), 'Model']
best_model = results[best_model_name]
print(f"Best model: {best_model_name}")

# Function to get and validate user input
def get_user_input():
    print("\nNote: Total_Bathrooms is the sum of full bathrooms plus 0.5 times half bathrooms (above-grade and basement).")
    print("Example: 2 full baths + 1 half bath = 2 + 0.5*1 = 2.5 bathrooms.")
    print("Overall Qual is the house quality rating (1-10, higher is better).")
    print("Year Built is the construction year (e.g., 1900-2010).")
    try:
        gr_liv_area = float(input("Enter square footage (Gr Liv Area, e.g., 1500-2500): "))
        bedrooms = int(input("Enter number of bedrooms (Bedroom AbvGr, e.g., 2-4): "))
        bathrooms = float(input("Enter total bathrooms (e.g., 1-3, see note above): "))
        overall_qual = int(input("Enter overall quality (Overall Qual, 1-10): "))
        year_built = int(input("Enter year built (Year Built, e.g., 1900-2010): "))

        # Validate inputs based on dataset ranges
        if gr_liv_area < 500 or gr_liv_area > 5000:
            raise ValueError("Square footage should be between 500 and 5000.")
        if bedrooms < 0 or bedrooms > 10:
            raise ValueError("Bedrooms should be between 0 and 10.")
        if bathrooms < 0 or bathrooms > 6:
            raise ValueError("Bathrooms should be between 0 and 6.")
        if overall_qual < 1 or overall_qual > 10:
            raise ValueError("Overall quality should be between 1 and 10.")
        if year_built < 1800 or year_built > 2025:
            raise ValueError("Year built should be between 1800 and 2025.")

        return pd.DataFrame({
            'Gr Liv Area': [gr_liv_area],
            'Bedroom AbvGr': [bedrooms],
            'Total_Bathrooms': [bathrooms],
            'Overall Qual': [overall_qual],
            'Year Built': [year_built]
        })
    except ValueError as e:
        print(f"Error: {e}. Please enter valid numbers.")
        return None

# Get user input
X_pred = get_user_input()

if X_pred is not None:
    # Prepare features for prediction
    X_pred_scaled = scaler.transform(X_pred) if best_model_name in ['SVR', 'Linear Regression'] else X_pred

    # Make prediction
    predicted_price = best_model.predict(X_pred_scaled)

    # Display result
    print("\nUser Input House:")
    print(X_pred)
    print(f"Predicted Price: ${predicted_price[0]:.2f}")

Best model: XGBoost

Note: Total_Bathrooms is the sum of full bathrooms plus 0.5 times half bathrooms (above-grade and basement).
Example: 2 full baths + 1 half bath = 2 + 0.5*1 = 2.5 bathrooms.
Overall Qual is the house quality rating (1-10, higher is better).
Year Built is the construction year (e.g., 1900-2010).
Enter square footage (Gr Liv Area, e.g., 1500-2500): 1604
Enter number of bedrooms (Bedroom AbvGr, e.g., 2-4): 3
Enter total bathrooms (e.g., 1-3, see note above): 2.5
Enter overall quality (Overall Qual, 1-10): 6
Enter year built (Year Built, e.g., 1900-2010): 1998

User Input House:
   Gr Liv Area  Bedroom AbvGr  Total_Bathrooms  Overall Qual  Year Built
0       1604.0              3              2.5             6        1998
Predicted Price: $188250.61
