## Housing Project: Regression
 

## 1. Preprocessing Pipeline

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn import set_config
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor

import os

# Location of the housing CSV.
# The default is the original author's local copy; set the HOUSING_DATA_PATH
# environment variable to run the notebook on another machine without editing code.
path = os.environ.get(
    "HOUSING_DATA_PATH",
    r"C:\Users\Aida\OneDrive\Documents\Bootcamp_WBS\Primer\Python\WBS_DATA\8_SUP_ML\Regression\Data\housing_iteration_6_regression.csv",
)
data = pd.read_csv(path)


# X: All columns except 'SalePrice' (features)
X = data.drop(columns=['SalePrice'])

# y: The 'SalePrice' column (target)
y = data['SalePrice']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)


In [15]:
# Build preprocessor
# Partition the training features by dtype: numeric columns on one side,
# everything else on the other. Copies are taken so that later edits to these
# frames cannot touch X_train.
numeric_part = X_train.select_dtypes(include="number")
non_numeric_part = X_train.select_dtypes(exclude="number")
X_num = numeric_part.copy()
X_cat = non_numeric_part.copy()

# Feature names are needed later to label each feature's contribution to the model.
# Author's note: encoders and ColumnTransformers drop feature names unless
# .set_output is specified, so extra steps are required to keep them.

# Step 1: Defining ordinal & onehot columns
# Reusable level scales, written from best to worst.
quality_scale = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
overall_scale = list(range(10, 0, -1))  # 10, 9, ..., 1

# Every ordinal column is declared next to its ordered levels, so the column list
# and the category list below are always the same length and in the same order.
ordinal_spec = [
    ('ExterQual', quality_scale),
    ('ExterCond', quality_scale),
    ('BsmtQual', quality_scale),
    ('BsmtCond', quality_scale),
    ('BsmtExposure', ['Gd', 'Av', 'Mn', 'No']),
    ('KitchenQual', quality_scale),
    ('FireplaceQu', quality_scale),
    ('PoolQC', quality_scale),
    ('HeatingQC', quality_scale),
    ('GarageFinish', ['Fin', 'RFn', 'Unf']),
    ('GarageQual', quality_scale),
    ('GarageCond', quality_scale),
    ('PavedDrive', ['Y', 'P', 'N']),
    ('LandSlope', ['Gtl', 'Mod', 'Sev']),
    ('LotShape', ['Reg', 'IR1', 'IR2', 'IR3']),
    ('OverallCond', overall_scale),
    ('OverallQual', overall_scale),
]

ordinal_cols = [column for column, levels in ordinal_spec]

onehot_cols = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour',
    'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
    'Electrical', 'GarageType', 'Fence', 'SaleType', 'SaleCondition',
]

# Step 2: Ordered levels for the ordinal encoder, one list per entry of ordinal_cols.
# These lists hold only the named levels; none of them contains a placeholder
# level for missing values.
# Each column gets its own list object (no sharing between columns).
ordinal_categories = [list(levels) for column, levels in ordinal_spec]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# === Preprocessing Pipelines ===

# Ordinal branch: missing entries become the literal "N_A", then each column is
# mapped to integer codes following the level order in ordinal_categories.
# "N_A" is not listed among those levels, so it falls under the encoder's
# unknown-value handling (unknown_value=-1).
ordinal_steps = [
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),
    ('ordinal', OrdinalEncoder(
        categories=ordinal_categories,
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
]
ordinal_pipeline = Pipeline(ordinal_steps)

# One-hot branch: same "N_A" fill, then one dense indicator column per category.
# Categories not seen during fit are ignored at transform time.
onehot_steps = [
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
onehot_pipeline = Pipeline(onehot_steps)

# Numeric branch: mean imputation followed by standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(numeric_steps)

# Route each column group to its branch. Columns not named in any group are not
# passed to any branch.
# NOTE(review): X_num.columns holds every numeric-dtype training column, while
# OverallCond / OverallQual (integer levels) are also listed in ordinal_cols, so
# those columns go through both the numeric and the ordinal branch. Confirm whether
# this duplication is intended; the same applies to any numeric column in onehot_cols.
full_preprocessing = ColumnTransformer([
    ("num_pipe", numeric_pipeline, X_num.columns),
    ("ordinal", ordinal_pipeline, ordinal_cols),
    ("onehot", onehot_pipeline, onehot_cols),
])



In [None]:
# === Full pipeline: preprocessing + DecisionTreeRegressor ===
# The step name doubles as the prefix of every tuned hyperparameter below.
tree_step = "decisiontreeregressor"
full_pipeline = Pipeline([
    ("preprocessor", full_preprocessing),
    (tree_step, DecisionTreeRegressor(random_state=42)),
])

# === Hyperparameter Grid ===
# 5 * 4 * 4 * 3 = 240 candidate trees.
tree_options = {
    "max_depth": [5, 10, 15, 20, None],          # maximum depth of the tree
    "min_samples_split": [2, 5, 10, 15],         # minimum samples to split a node
    "min_samples_leaf": [1, 2, 5, 10],           # minimum samples in a leaf
    "max_features": ["sqrt", "log2", None],      # features considered at each split
}
param_grid = {
    f"{tree_step}__{option}": values
    for option, values in tree_options.items()
}

# === GridSearchCV for Hyperparameter Tuning ===
# 5-fold cross-validation, scored by R^2, using all available CPU cores.
search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# === Model Training ===
search.fit(X_train, y_train)

# === Results ===
print("Best Parameters:", search.best_params_)
print("Best Score (R²):", search.best_score_)

# === Feature Importance Analysis ===
# Pull the fitted preprocessor and the fitted tree out of the best pipeline
best_pipeline = search.best_estimator_
best_preprocessor = best_pipeline.named_steps['preprocessor']
best_model = best_pipeline.named_steps['decisiontreeregressor']

# One importance value per column produced by the preprocessor
feature_importances = best_model.feature_importances_

# Per-group input column names (kept for any later cell that refers to them)
numeric_features = X_num.columns.tolist()
ordinal_features = ordinal_cols
onehot_features = best_preprocessor.transformers_[2][1] \
    .named_steps['onehot'].get_feature_names_out(onehot_cols).tolist()

# Take the output column names from the fitted preprocessor itself instead of
# concatenating the lists above. The names come back prefixed with the transformer
# name (e.g. "num_pipe__...", "ordinal__..."), so a column that is routed through
# more than one branch keeps a distinct label per branch; with bare column names
# such duplicates would overwrite each other when building the dictionary below.
all_feature_names = best_preprocessor.get_feature_names_out().tolist()

# zip() would silently truncate on a length mismatch and mislabel importances
assert len(all_feature_names) == len(feature_importances), (
    f"{len(all_feature_names)} feature names vs "
    f"{len(feature_importances)} importances"
)

# Create a dictionary for feature importances
importance_dict = dict(zip(all_feature_names, feature_importances))

# Sort and display feature importances, most important first
sorted_importances = sorted(importance_dict.items(), key=lambda item: item[1], reverse=True)
print("Feature Importances (sorted):")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")


## 3. Error analysis

In [20]:
from sklearn.metrics import r2_score


# Predict on the same rows the search was fitted on
y_train_pred = search.predict(X_train)

# R² of those predictions against the training targets
r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
print(f"R² Score : {r2}")

R² Score : 0.979832375807639


In [21]:
# Predict on the held-out test rows
y_test_pred = search.predict(X_test)

# R² against the test targets; compare with the training R² to gauge overfitting
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print(f"R² Score : {r2}")

R² Score : 0.7215943880705856
