# Housing Project: Regression
 

## 1. Preprocessing Pipeline

In [30]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor


path = r"C:\Users\Aida\OneDrive\Documents\Bootcamp_WBS\Primer\Python\WBS_DATA\8_SUP_ML\Regression\Data\housing_iteration_6_regression.csv"
data = pd.read_csv(path)


# X: All columns except 'SalePrice' (features)
X = data.drop(columns=['SalePrice'])

# y: The 'SalePrice' column (target)
y = data['SalePrice']

# data splitting
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)


In [31]:

# Build preprocessor
X_cat = X_train.select_dtypes(exclude="number").copy()
X_num = X_train.select_dtypes(include="number").copy()

# We have to take extra steps to maintain feature names if we want to know their coefficients
# Encoders and ColumnTransformers will remove feature names unless .set_output is specified

# Step 1: Defining ordinal & onehot columns
ordinal_cols = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'KitchenQual', 'FireplaceQu', 
    'PoolQC', 'HeatingQC', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive','LandSlope', 'LotShape', 'OverallCond', 'OverallQual'
]

onehot_cols = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig', 
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 
    'CentralAir', 'Electrical', 'GarageType', 'Fence', 'SaleType', 'SaleCondition'
]

# Step 2: Defining the categorical encoder (with "N_A" for missing values)
# Corrected ordinal_categories
ordinal_categories = [
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # ExterQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # ExterCond
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # BsmtQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # BsmtCond
    ['Gd', 'Av', 'Mn', 'No'],        # BsmtExposure
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # KitchenQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # FireplaceQu
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # PoolQC
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # HeatingQC
    ['Fin', 'RFn', 'Unf'],           # GarageFinish
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # GarageQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # GarageCond
    ['Y', 'P', 'N'],                 # PavedDrive
    ['Gtl', 'Mod', 'Sev'],           # LandSlope
    ['Reg', 'IR1', 'IR2', 'IR3'],    # LotShape
    [10, 9, 8, 7, 6, 5, 4, 3, 2, 1], # OverallCond
    [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]  # OverallQual
]

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder


# Ordinal pipeline
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Fill missing values
    ('ordinal', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))  # Ordinal encoding
])

# One-hot pipeline
onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="constant", fill_value="N_A")),  # Fill missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encoding
])

# Numerical pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
    ('scaler', StandardScaler())  # Standard scaling
])

# Combine pipelines with ColumnTransformer
full_preprocessing = ColumnTransformer(transformers=[
    ("num_pipe", numeric_pipeline, X_num.columns),       # Apply numerical pipeline to numerical columns
    ("ordinal", ordinal_pipeline, ordinal_cols),        # Apply ordinal pipeline to ordinal columns
    ("onehot", onehot_pipeline, onehot_cols)            # Apply one-hot pipeline to categorical columns
])

# Fit and transform the full preprocessing pipeline
X_transformed_full = full_preprocessing.fit_transform(X_train)

# Display the shape of the transformed data
print(f"Transformed dataset shape: {X_transformed_full.shape}")

Transformed dataset shape: (1168, 238)


## 2. Predicting with LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# === LINEAR REGRESSION MODEL ===
# Include the preprocessing pipeline and Linear Regression model
full_pipeline = make_pipeline(
    full_preprocessing,  # Preprocessing steps
    LinearRegression()   # Linear Regression model
)

# === HYPERPARAMETER GRID ===
param_grid = {
    "linearregression__fit_intercept": [True, False],  # Whether to calculate the intercept
}

# === GRID SEARCH CV ===
search = GridSearchCV(
    full_pipeline,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    verbose=1,
    scoring="r2",  # Optimize for R^2 score
    n_jobs=-1  # Use all CPU cores
)

# === FITTING THE MODEL ===
search.fit(X_train, y_train)

# === RESULTS ===
print("Best Parameters:", search.best_params_)
print("Best Score (R^2):", search.best_score_)

# === COEFFICIENT ANALYSIS ===
# Extract the best Linear Regression model
best_model = search.best_estimator_.named_steps['linearregression']
coefficients = best_model.coef_

# Map coefficients to feature names
# Retrieve feature names from the preprocessing pipeline
preprocessor = search.best_estimator_.named_steps['columntransformer']

# Feature names from numerical, ordinal, and one-hot encoding
all_feature_names = (
    X_num.columns.tolist() +  # Numerical feature names
    ordinal_cols +  # Ordinal feature names
    list(preprocessor.named_transformers_['onehot'].get_feature_names_out(onehot_cols))  # One-hot encoded names
)

# Create a dictionary of coefficients
coefficients_dict = dict(zip(all_feature_names, coefficients))

# Sort coefficients by their absolute values
sorted_coefficients = sorted(coefficients_dict.items(), key=lambda x: abs(x[1]), reverse=True)

# Display sorted coefficients
print("\nFeature Coefficients (sorted by absolute value):")
for feature, coef in sorted_coefficients:
    print(f"{feature}: {coef}")


## 3. Error analysis

In [34]:
from sklearn.metrics import r2_score


# Make predictions on the training data
y_train_pred = search.predict(X_train)

# Calculate accuracy score
r2 = r2_score(y_train, y_train_pred )
print(f"R² Score: {r2}")

R² Score: 0.9288214609265973


In [35]:
y_test_pred = search.predict(X_test)  # Predictions on test data
r2 = r2_score(y_test, y_test_pred) 
print(f"R² Score: {r2}")   # Compare to test labels

R² Score: -5.197316712944098e+17
