<a href="https://colab.research.google.com/github/EduardoAve/Data-science-portfolio/blob/main/house-price-regression/regression_house_prices_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
"""
Colab Notebook for Regression - Predicting House Prices (Ames Housing)

Objective: Predict house sale prices using regression techniques.
Load Method: Manual (upload 'train.csv' to Colab).
Includes loading, inspection, EDA, Preprocessing, Modeling, and Evaluation.
"""

# ==============================================================================
# 1. LIBRARY IMPORTS AND SETUP
# ==============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import re # Import re for renaming columns
import time # To time processes

# Scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Notebook-wide presentation defaults (plot theme, pandas display limits,
# float formatting) followed by a confirmation message.
sns.set_style('whitegrid')

# Let pandas print up to 100 rows / 100 columns before truncating a frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Render floats with four decimals so metric tables line up.
pd.set_option('display.float_format', '{:.4f}'.format)

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

print("Libraries imported.")


Libraries imported.


In [18]:

# ==============================================================================
# 2. DATA LOADING AND INITIAL INSPECTION (Manual Upload)
# ==============================================================================
# --- Instructions ---
# Upload the 'train.csv' file from the Ames Housing dataset using the "Files"
# panel on the left in Colab before running this cell.
#
# The file is looked up in the notebook's current working directory (expected
# to be /content/ on Colab), and every message reports that actual directory.
#
# Names defined by this cell and used by later cells:
#   file_name   - name of the CSV file to load
#   df_train    - the loaded DataFrame, or None when loading failed
#   data_loaded - True only when df_train was loaded successfully

file_name = 'train.csv'
data_dir = os.getcwd()
file_path = os.path.join(data_dir, file_name)

# Defaults are set before any I/O so they exist whichever path fails below.
data_loaded = False
df_train = None

# --- List available CSV files before attempting the load (diagnostic only) ---
print(f"\n--- Listing CSV files in {data_dir} (before loading): ---")
try:
    csv_files = sorted(
        entry for entry in os.listdir(data_dir) if entry.lower().endswith('.csv')
    )
    if csv_files:
        for csv_name in csv_files:
            size_kb = os.path.getsize(os.path.join(data_dir, csv_name)) / 1024
            print(f"{csv_name}  ({size_kb:,.0f} KB)")
    else:
        print("(no CSV files found)")
except OSError as e:
    # The listing is best-effort; a failure here must not block the load.
    print(f"Could not list files in {data_dir}: {e}")
print("-------------------------------------------------------------")
# --- End Verification ---

print(f"\nAttempting to load '{file_name}'...")
try:
    if os.path.exists(file_path):
        print(f"File '{file_name}' found. Loading with pandas...")
        df_train = pd.read_csv(file_path)
        print(f"\nDataFrame 'df_train' loaded successfully.")
        data_loaded = True
    else:
        print(f"\nError: File '{file_name}' not found in {data_dir}.")
except Exception as e:
    # Report the failure with a traceback but keep the notebook running;
    # downstream cells check `data_loaded` and skip themselves.
    print(f"\nAn unexpected error occurred while loading or inspecting the CSV: {e}")
    import traceback
    traceback.print_exc()
    df_train = None
    data_loaded = False

if data_loaded:
    print("\nInitial data inspection completed.")
    print(f"Initial dimensions: {df_train.shape}")
    null_counts = df_train.isnull().sum()
    print(f"Columns with nulls initially (>0): {len(null_counts[null_counts > 0])}")
else:
    print("\nCould not complete initial data inspection.")




--- Listing CSV files in /content/ (before loading): ---
-rw-r--r-- 1 root root 450K Apr 20 16:45 /content/train.csv
-------------------------------------------------------------

Attempting to load 'train.csv'...
File 'train.csv' found. Loading with pandas...

DataFrame 'df_train' loaded successfully.

Initial data inspection completed.
Initial dimensions: (1460, 81)
Columns with nulls initially (>0): 19


In [19]:

# ==============================================================================
# 3. EXPLORATORY DATA ANALYSIS (EDA)
# ==============================================================================
# Inspects the target column, log-transforms it when it is strongly right-skewed,
# and records which target column later sections should model.
#
# Names set here for later cells: target_col, target_col_log, target_to_use, eda_done.
print("\n--- Starting Section 3: Exploratory Data Analysis (EDA) ---")

if not (data_loaded and df_train is not None):
    print("Skipping Section 3 because data was not loaded.")
    eda_done = False
else:
    # --- 3.1 Target Variable Analysis: SalePrice ---
    target_col = 'SalePrice'
    target_col_log = None          # stays None unless the log transform is applied
    target_to_use = target_col     # default: model the raw target

    if target_col not in df_train.columns:
        print(f"Error: Target column '{target_col}' not found.")
        target_to_use = None
    else:
        print(f"\nAnalyzing target variable: '{target_col}'...")
        skewness = df_train[target_col].skew()
        print(f"Skewness of {target_col}: {skewness:.2f}")
        if skewness > 1:
            # Strong right skew: add a log1p-transformed copy and model that instead.
            print("Applying log transformation (log1p)...")
            target_col_log = 'SalePrice_log'
            df_train[target_col_log] = np.log1p(df_train[target_col])
            target_to_use = target_col_log
            print(f"Skewness of {target_col_log}: {df_train[target_col_log].skew():.2f}")
        print(f"Will use '{target_to_use}' for correlation analysis and modeling.")
        # (Plots omitted for brevity)

    # --- 3.2 Numerical Feature Correlation with Target ---
    # (Calculation and plots omitted for brevity)
    target_is_usable = bool(target_to_use) and target_to_use in df_train.columns
    if target_is_usable:
        print(f"\nCorrelation with '{target_to_use}' calculated (plots omitted).")
    else:
        print("Cannot calculate correlation (target not defined).")

    # --- 3.3 Categorical Feature Analysis vs Target ---
    # (Plots omitted for brevity)
    print(f"\n--- Categorical Variable Analysis vs '{target_to_use}' performed (plots omitted). ---")

    eda_done = True




--- Starting Section 3: Exploratory Data Analysis (EDA) ---

Analyzing target variable: 'SalePrice'...
Skewness of SalePrice: 1.88
Applying log transformation (log1p)...
Skewness of SalePrice_log: 0.12
Will use 'SalePrice_log' for correlation analysis and modeling.

Correlation with 'SalePrice_log' calculated (plots omitted).

--- Categorical Variable Analysis vs 'SalePrice_log' performed (plots omitted). ---


In [20]:

# ==============================================================================
# 4. PREPROCESSING AND FEATURE ENGINEERING (Implemented)
# ==============================================================================
# Builds the modeling matrices from df_train: drops 'Id', fills missing values,
# adds engineered features, log-transforms GrLivArea when skewed, one-hot encodes
# object columns, splits 80/20, and standard-scales the numerical columns.
#
# Names set here for later cells: X_train, X_test, y_train, y_test, scaler,
# preprocessing_done.
#
# NOTE(review): the LotFrontage medians and the Electrical mode are computed on
# the full dataset before the train/test split, so test rows contribute to the
# imputation statistics. Moving imputation after the split (or into a Pipeline)
# would remove this mild leakage.
print("\n--- Starting Section 4: Preprocessing and Feature Engineering ---")

# Initialize variables to be created in this section
X_train, X_test, y_train, y_test = None, None, None, None
preprocessing_done = False
scaler = None # To store the scaler

if eda_done and df_train is not None and target_to_use is not None:
    # Work on a copy so df_train stays untouched and this cell can be re-run.
    df_processed = df_train.copy()
    print(f"Using df_train for preprocessing. Dimensions: {df_processed.shape}")

    # --- 4.1 Drop 'Id' Column ---
    if 'Id' in df_processed.columns:
        df_processed = df_processed.drop(columns='Id')
        print("- Column 'Id' dropped.")

    # --- 4.2 Handle Missing Values (Implemented) ---
    # Every fill below assigns the result back to the column. Calling
    # `df[col].fillna(..., inplace=True)` is chained assignment: it is deprecated
    # in recent pandas and does not modify the frame under Copy-on-Write.
    print("\nHandling missing values...")
    # Fill categorical NaNs meaning 'None'
    cols_nan_means_none_cat = [
        'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
        'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature','MasVnrType'
    ]
    for col in cols_nan_means_none_cat:
        if col in df_processed.columns and df_processed[col].dtype == 'object':
            df_processed[col] = df_processed[col].fillna('None')
    # Fill numerical NaNs meaning 0
    cols_nan_means_zero = ['GarageYrBlt', 'MasVnrArea']
    for col in cols_nan_means_zero:
        if col in df_processed.columns:
            df_processed[col] = df_processed[col].fillna(0)
    if 'GarageYrBlt' in df_processed.columns:
        df_processed['GarageYrBlt'] = df_processed['GarageYrBlt'].astype(int)
    # Impute LotFrontage with Neighborhood median
    if 'LotFrontage' in df_processed.columns and df_processed['LotFrontage'].isnull().any():
        df_processed['LotFrontage'] = df_processed.groupby('Neighborhood')['LotFrontage'].transform(
            lambda x: x.fillna(x.median())
        )
        # Neighborhoods with no LotFrontage at all still hold NaN; use the global median.
        if df_processed['LotFrontage'].isnull().any():
            df_processed['LotFrontage'] = df_processed['LotFrontage'].fillna(
                df_processed['LotFrontage'].median()
            )
    # Impute Electrical with mode
    if 'Electrical' in df_processed.columns and df_processed['Electrical'].isnull().any():
        mode_electrical = df_processed['Electrical'].mode()[0]
        df_processed['Electrical'] = df_processed['Electrical'].fillna(mode_electrical)
    remaining_nulls = df_processed.isnull().sum().sum()
    if remaining_nulls == 0:
        print("All primary missing values handled!")
    else:
        print(f"Warning: {remaining_nulls} missing values remain.")


    # --- 4.3 Feature Engineering (Implemented) ---
    print("\nCreating new features...")
    try:
        df_processed['TotalSF'] = df_processed['TotalBsmtSF'] + df_processed['1stFlrSF'] + df_processed['2ndFlrSF']
        # clip(lower=0) floors negative ages (sold before the recorded build/remodel year) at 0.
        df_processed['HouseAge'] = (df_processed['YrSold'] - df_processed['YearBuilt']).clip(lower=0)
        df_processed['IsRemodeled'] = (df_processed['YearRemodAdd'] != df_processed['YearBuilt']).astype(int)
        df_processed['YearsSinceRemod'] = (df_processed['YrSold'] - df_processed['YearRemodAdd']).clip(lower=0)
        print("- Features created: 'TotalSF', 'HouseAge', 'IsRemodeled', 'YearsSinceRemod'.")
    except Exception as e:
        # Best-effort: report the problem and continue with whatever was created.
        print(f"Error creating features: {e}")


    # --- 4.4 Transform Skewed Numerical Features (Implemented) ---
    print("\nTransforming skewed numerical features...")
    # Log transform GrLivArea if needed
    skewed_col = 'GrLivArea'
    grlivarea_log_col = skewed_col # Default if not transformed
    if skewed_col in df_processed.columns:
        skewness_grliv = df_processed[skewed_col].skew()
        if abs(skewness_grliv) > 0.75:
            print(f"- Applying log1p transformation to '{skewed_col}' (Skewness: {skewness_grliv:.2f})")
            grlivarea_log_col = skewed_col + '_log'
            df_processed[grlivarea_log_col] = np.log1p(df_processed[skewed_col])
            df_processed = df_processed.drop(columns=skewed_col) # Drop original
            print(f"  Original column '{skewed_col}' dropped.")
        else:
            print(f"- '{skewed_col}' not skewed enough.")
    # Note: Other numerical features could be checked and transformed here too.

    # --- 4.5 Encode Categorical Variables ---
    print("\nEncoding categorical variables...")
    if 'MSSubClass' in df_processed.columns:
        # Cast to string so MSSubClass is one-hot encoded with the object columns.
        df_processed['MSSubClass'] = df_processed['MSSubClass'].astype(str)
        print("- Converted 'MSSubClass' to string type.")
    # Identify remaining object columns for One-Hot Encoding
    categorical_cols = df_processed.select_dtypes(include='object').columns.tolist()
    print(f"- {len(categorical_cols)} object columns identified for One-Hot Encoding.")
    # Apply get_dummies
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True, dummy_na=False)
    print(f"- Applied One-Hot Encoding. New dimensions: {df_processed.shape}")
    print("Note: Ordinal encoding for features like quality ratings was not implemented for simplicity.")


    # --- 4.6 Separate Features (X) and Target (y) ---
    print("\nSeparating features (X) and target variable (y)...")
    target_final = target_to_use # 'SalePrice_log' when the log transform was applied in Section 3
    if target_final in df_processed.columns:
        y = df_processed[target_final]
        # Drop both original and log-transformed target from X
        cols_to_drop_from_X = [target_col]
        if target_col_log and target_col_log in df_processed.columns:
            cols_to_drop_from_X.append(target_col_log)
        cols_to_drop_from_X = [col for col in cols_to_drop_from_X if col in df_processed.columns]
        X = df_processed.drop(columns=cols_to_drop_from_X)
        print(f"- Target variable 'y' assigned from '{target_final}'.")
        print(f"- Features 'X' created with {X.shape[1]} columns.")
        features_separated = True
    else:
        print(f"Error: Final target column '{target_final}' not found.")
        features_separated = False


    # --- 4.7 Split into Training and Test Sets ---
    if features_separated:
        print("\nSplitting data into training and test sets...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Take explicit copies so the scaled values written below go into
        # independent frames rather than into slices of X.
        X_train, X_test = X_train.copy(), X_test.copy()
        print(f"- X_train shape: {X_train.shape}")
        print(f"- X_test shape: {X_test.shape}")
        split_done = True
    else:
        print("Skipping train/test split.")
        split_done = False


    # --- 4.8 Scale Numerical Features ---
    if split_done:
        print("\nScaling numerical features...")
        # Identify numerical columns AFTER encoding (select only number types)
        numerical_cols_final = X_train.select_dtypes(include=np.number).columns.tolist()
        print(f"- {len(numerical_cols_final)} numerical columns identified for scaling.")
        scaler = StandardScaler() # Initialize scaler
        # Only scale columns that have no missing values in the training split
        cols_to_scale = [col for col in numerical_cols_final if col in X_train.columns and X_train[col].isnull().sum() == 0]
        if len(cols_to_scale) < len(numerical_cols_final):
            print(f"Warning: Skipping {len(numerical_cols_final)-len(cols_to_scale)} numerical columns from scaling (possibly all NaNs or non-numeric).")

        if cols_to_scale:
            print("- Applying StandardScaler (fitting on train, transforming train and test)...")
            # Fit on the training split only; the test split reuses those statistics.
            X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
            X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
            print("- StandardScaler applied.")
            scaling_done = True
        else:
            print("- No valid numerical columns found to scale.")
            scaling_done = False
    else:
        print("Skipping scaling.")
        scaling_done = False

    preprocessing_done = features_separated and split_done and scaling_done
    if preprocessing_done:
        print("\nPreprocessing complete! Data is ready for modeling.")

else:
    print("Skipping Section 4 because data was not loaded or initial EDA failed.")
    preprocessing_done = False



--- Starting Section 4: Preprocessing and Feature Engineering ---
Using df_train for preprocessing. Dimensions: (1460, 82)
- Column 'Id' dropped.

Handling missing values...
All primary missing values handled!

Creating new features...
- Features created: 'TotalSF', 'HouseAge', 'IsRemodeled', 'YearsSinceRemod'.

Transforming skewed numerical features...
- Applying log1p transformation to 'GrLivArea' (Skewness: 1.37)
  Original column 'GrLivArea' dropped.

Encoding categorical variables...
- Converted 'MSSubClass' to string type.
- 44 object columns identified for One-Hot Encoding.
- Applied One-Hot Encoding. New dimensions: (1460, 278)
Note: Ordinal encoding for features like quality ratings was not implemented for simplicity.

Separating features (X) and target variable (y)...
- Target variable 'y' assigned from 'SalePrice_log'.
- Features 'X' created with 276 columns.

Splitting data into training and test sets...
- X_train shape: (1168, 276)
- X_test shape: (292, 276)

Scaling nume

In [21]:


# ==============================================================================
# 5. REGRESSION MODELING (Implemented)
# ==============================================================================
# Fits each candidate regressor on (X_train, y_train) and keeps the fitted
# estimators in `trained_models`; `model_training_done` tells later cells
# whether at least one model was fitted.
print("\n--- Starting Section 5: Regression Modeling ---")

# Dictionary to store trained models
trained_models = {}
model_training_done = False

if not preprocessing_done:
    print("Skipping section because preprocessing was not completed.")
else:
    print("Training regression models...")
    # Training needs both splits to exist and to contain rows.
    training_data_missing = (
        X_train is None or y_train is None or X_train.empty or y_train.empty
    )
    if training_data_missing:
        print("Error: X_train or y_train not available or empty.")
    else:
        models_to_train = {
            'LinearRegression': LinearRegression(),
            'Ridge': Ridge(random_state=42),
            # Small alpha so Lasso does not shrink most coefficients to zero
            'Lasso': Lasso(random_state=42, alpha=0.0005),
            'RandomForest': RandomForestRegressor(
                random_state=42,
                n_estimators=100,
                n_jobs=-1,
                max_depth=15,
                min_samples_split=5,
                min_samples_leaf=3,
            ),
            'GradientBoosting': GradientBoostingRegressor(
                random_state=42,
                n_estimators=100,
                learning_rate=0.1,
                max_depth=3,
            ),
        }

        for name, model in models_to_train.items():
            # A failure in one model is reported and must not stop the others.
            try:
                print(f"\nTraining model: {name}...")
                start_time = time.time()
                model.fit(X_train, y_train)
                end_time = time.time()
                trained_models[name] = model
                print(f"{name} trained in {end_time - start_time:.2f} seconds.")
            except Exception as e:
                print(f"Error training {name}: {e}")

        model_training_done = bool(trained_models)
        if model_training_done:
            print("\n--- Model training finished ---")
        else:
            print("\n--- No models were successfully trained ---")



--- Starting Section 5: Regression Modeling ---
Training regression models...

Training model: LinearRegression...
LinearRegression trained in 1.07 seconds.

Training model: Ridge...
Ridge trained in 0.06 seconds.

Training model: Lasso...
Lasso trained in 0.60 seconds.

Training model: RandomForest...
RandomForest trained in 2.79 seconds.

Training model: GradientBoosting...
GradientBoosting trained in 1.16 seconds.

--- Model training finished ---


In [22]:


# ==============================================================================
# 6. MODEL EVALUATION (Implemented)
# ==============================================================================
# Scores every fitted model on the held-out test split, on the scale the model
# was trained on and on the original price scale, then collects the metrics in
# `results_df` sorted by RMSE on the modeling scale.
#
# Names set here for later cells: results, results_df, evaluation_completed.
# The metric keys 'RMSE_log' / 'MAE_log' always refer to the modeling scale and
# keep that name even when the target was not log-transformed, because the
# conclusions cell reads those keys.
print("\n--- Starting Section 6: Model Evaluation ---")

results = {}
evaluation_completed = False
results_df = None # Initialize results dataframe

if model_training_done and trained_models and X_test is not None and y_test is not None:
    print("Evaluating models on the test set (X_test, y_test)...")

    # Section 3 only applies log1p when the target is strongly skewed. Invert
    # with expm1 only in that case; applying expm1 to raw prices overflows.
    target_is_log = target_col_log is not None and target_to_use == target_col_log
    model_scale_label = "log scale" if target_is_log else "model scale"

    # Bring y_test to the original scale for interpretable metrics
    try:
        if hasattr(y_test, 'shape'):
            if target_is_log:
                y_test_orig = np.expm1(y_test)
                print("y_test inverse-transformed to original scale for MAE/RMSE calculation.")
            else:
                y_test_orig = y_test
                print("Target was not log-transformed; y_test is already on the original scale.")
            y_test_orig_available = True
        else:
            print("Warning: y_test is not available or has wrong type. Cannot calculate original scale metrics.")
            y_test_orig_available = False
    except Exception as e:
        print(f"Error inverse-transforming y_test: {e}. Original scale metrics will be unavailable.")
        y_test_orig_available = False

    for name, model in trained_models.items():
        try:
            print(f"\nEvaluating {name}...")
            # Predictions on the modeling scale (log scale when target_is_log)
            y_pred_log = model.predict(X_test)

            # Metrics on the modeling scale
            rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))
            mae_log = mean_absolute_error(y_test, y_pred_log)
            r2 = r2_score(y_test, y_pred_log)
            print(f"- RMSE ({model_scale_label}): {rmse_log:.4f}")
            print(f"- MAE ({model_scale_label}): {mae_log:.4f}")
            print(f"- R2 Score: {r2:.4f}")

            # Metrics on the original scale (dollars, more interpretable).
            # 'N/A' is kept as the placeholder the conclusions cell checks for.
            rmse_orig = 'N/A'
            mae_orig = 'N/A'
            if y_test_orig_available:
                try:
                    y_pred_orig = np.expm1(y_pred_log) if target_is_log else y_pred_log
                    rmse_orig = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
                    mae_orig = mean_absolute_error(y_test_orig, y_pred_orig)
                    print(f"- RMSE (original scale): ${rmse_orig:,.0f}")
                    print(f"- MAE (original scale): ${mae_orig:,.0f}")
                except Exception as e_orig:
                    print(f"Error calculating metrics on original scale: {e_orig}")
            else:
                print("- Cannot calculate original scale metrics.")

            # Store results
            results[name] = {'RMSE_log': rmse_log, 'MAE_log': mae_log, 'R2': r2, 'RMSE_orig': rmse_orig, 'MAE_orig': mae_orig}

        except Exception as e:
            # One failing model is reported and skipped; the rest are still evaluated.
            print(f"Error evaluating {name}: {e}")

    # Convert results to DataFrame for comparison
    if results:
        # Sort by RMSE on the modeling scale (common competition metric)
        results_df = pd.DataFrame(results).T.sort_values(by='RMSE_log')
        print("\n--- Model Comparison (sorted by RMSE_log) ---")
        # Display copy: dollar metrics as comma-separated integers; results_df keeps raw values
        display_df = results_df.copy()
        for col in ['RMSE_orig', 'MAE_orig']:
            display_df[col] = pd.to_numeric(display_df[col], errors='coerce').map('{:,.0f}'.format)
        print(display_df)
        evaluation_completed = True
    else:
        print("\nNo evaluation results generated.")

else:
    print("Skipping section because modeling was not completed or test data is missing.")




--- Starting Section 6: Model Evaluation ---
Evaluating models on the test set (X_test, y_test)...
y_test inverse-transformed to original scale for MAE/RMSE calculation.

Evaluating LinearRegression...
- RMSE (log scale): 0.1579
- MAE (log scale): 0.0941
- R2 Score: 0.8663
- RMSE (original scale): $24,683
- MAE (original scale): $15,289

Evaluating Ridge...
- RMSE (log scale): 0.1366
- MAE (log scale): 0.0936
- R2 Score: 0.9000
- RMSE (original scale): $25,844
- MAE (original scale): $16,236

Evaluating Lasso...
- RMSE (log scale): 0.1387
- MAE (log scale): 0.0917
- R2 Score: 0.8970
- RMSE (original scale): $27,441
- MAE (original scale): $16,009

Evaluating RandomForest...
- RMSE (log scale): 0.1477
- MAE (log scale): 0.0983
- R2 Score: 0.8831
- RMSE (original scale): $29,948
- MAE (original scale): $17,545

Evaluating GradientBoosting...
- RMSE (log scale): 0.1421
- MAE (log scale): 0.0928
- R2 Score: 0.8918
- RMSE (original scale): $31,815
- MAE (original scale): $17,040

--- Model

In [23]:

# ==============================================================================
# 7. CONCLUSIONS (Implemented)
# ==============================================================================
# Prints a project summary. The best model, the model ranking and the comparison
# against the LinearRegression baseline are all derived from `results_df`, so
# the text always matches the metrics computed in Section 6.
print("\n--- Starting Section 7: Conclusions ---")


def _is_numeric_metric(value):
    """Return True when `value` is a real number rather than the 'N/A' placeholder."""
    return isinstance(value, (int, float, np.integer, np.floating))


if evaluation_completed and results_df is not None:
    print("Summary of the house price prediction project:")
    print("- Data preprocessing involved handling numerous missing values (imputing based on meaning), feature engineering (TotalSF, Age, Remodeling flags), log-transforming the target variable ('SalePrice') and skewed features ('GrLivArea'), and One-Hot Encoding categorical features.")
    print("- Five different regression models were trained: Linear Regression, Ridge, Lasso, Random Forest, and Gradient Boosting.")
    print("- Models were evaluated using RMSE and MAE (on log and original scales) and R2 score.")

    # Rank locally instead of relying on results_df having been sorted earlier.
    ranked_results = results_df.sort_values(by='RMSE_log')
    model_ranking = ranked_results.index.tolist()  # best -> worst by RMSE_log

    # Identify best model based on RMSE_log
    best_model_name = model_ranking[0]
    best_metrics = ranked_results.loc[best_model_name]

    print(f"\nBest performing model (based on lowest RMSE_log): {best_model_name}")
    print("Key Performance Metrics for Best Model:")
    print(f"- RMSE (log scale): {best_metrics['RMSE_log']:.4f}")
    print(f"- R2 Score: {best_metrics['R2']:.4f}")
    # Original-scale metrics may hold the 'N/A' placeholder; format only real numbers.
    if _is_numeric_metric(best_metrics['RMSE_orig']):
        print(f"- RMSE (original scale): ${best_metrics['RMSE_orig']:,.0f}")
    else:
        print(f"- RMSE (original scale): {best_metrics['RMSE_orig']}")
    if _is_numeric_metric(best_metrics['MAE_orig']):
        print(f"- MAE (original scale): ${best_metrics['MAE_orig']:,.0f} (Average prediction error in dollars)")
    else:
        print(f"- MAE (original scale): {best_metrics['MAE_orig']}")

    print("\nInterpretation:")
    print(f"- The {best_model_name} model achieved the best predictive performance on the log-transformed target, explaining {best_metrics['R2']:.1%} of the variance.")
    if _is_numeric_metric(best_metrics['MAE_orig']):
        print(f"- In practical terms, the model's predictions are, on average, about ${best_metrics['MAE_orig']:,.0f} off the actual sale price.")
    print(f"- Ranking by RMSE_log (best to worst): {', '.join(model_ranking)}.")
    # Compare every model with the plain linear baseline using the computed metrics.
    baseline_name = 'LinearRegression'
    if baseline_name in ranked_results.index:
        baseline_rmse = ranked_results.loc[baseline_name, 'RMSE_log']
        beat_baseline = [
            model_name for model_name in model_ranking
            if model_name != baseline_name
            and ranked_results.loc[model_name, 'RMSE_log'] < baseline_rmse
        ]
        if beat_baseline:
            print(f"- Models with a lower RMSE_log than the unregularized {baseline_name} baseline: {', '.join(beat_baseline)}.")
        else:
            print(f"- No model achieved a lower RMSE_log than the unregularized {baseline_name} baseline.")

    top_models = ', '.join(model_ranking[:3])
    print("\nLimitations & Next Steps:")
    print("- Preprocessing: Ordinal features were treated as nominal (One-Hot Encoded); using Ordinal Encoding with correct mapping could improve performance. More feature engineering could be explored.")
    print(f"- Modeling: Only default or basic hyperparameters were used. Hyperparameter tuning (e.g., using GridSearchCV or RandomizedSearchCV) for the top models ({top_models}) is likely to yield significant improvements.")
    print("- Feature Importance: Analysis of feature importances (especially for tree models) was not performed but would provide insights into price drivers.")
    print("- Advanced Models: Techniques like XGBoost, LightGBM, or stacking/ensembling could be explored for potentially higher accuracy.")

else:
    print("Analysis did not complete successfully or evaluation results are missing. Conclusions cannot be finalized.")


# End of Regression Notebook
print("\n--- End of Regression Notebook ---")



--- Starting Section 7: Conclusions ---
Summary of the house price prediction project:
- Data preprocessing involved handling numerous missing values (imputing based on meaning), feature engineering (TotalSF, Age, Remodeling flags), log-transforming the target variable ('SalePrice') and skewed features ('GrLivArea'), and One-Hot Encoding categorical features.
- Five different regression models were trained: Linear Regression, Ridge, Lasso, Random Forest, and Gradient Boosting.
- Models were evaluated using RMSE and MAE (on log and original scales) and R2 score.

Best performing model (based on lowest RMSE_log): Ridge
Key Performance Metrics for Best Model:
- RMSE (log scale): 0.1366
- R2 Score: 0.9000
- RMSE (original scale): $25,844
- MAE (original scale): $16,236 (Average prediction error in dollars)

Interpretation:
- The Ridge model achieved the best predictive performance on the log-transformed target, explaining 90.0% of the variance.
- In practical terms, the model's prediction