In [1]:
import pandas as pd
import plotly.express as px
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import  fetch_california_housing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Read and clean the data

In [2]:
# read the data
data = pd.read_excel('Data/HousePricePrediction.xlsx')
data = data.dropna()
data = data.drop(['Id'], axis=1)
data.head()


Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


### Numerical Dataframe & Categorical dataframe

In [3]:
# numerical data
numerical_data = data.select_dtypes(include=['int64', 'float64'])
# categorical data
categorical_data = data[['MSZoning', 'LotConfig','BldgType','Exterior1st']]

In [4]:
# Calculate the correlation matrix
correlation_matrix = numerical_data.corr()
correlation_matrix.sort_values(by='SalePrice', ascending=False, inplace=True)
correlation_matrix

Unnamed: 0,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice
SalePrice,-0.084284,0.263843,-0.077856,0.522897,0.507101,-0.011378,0.613581,1.0
TotalBsmtSF,-0.238518,0.260833,-0.171098,0.391452,0.291066,0.10481,1.0,0.613581
YearBuilt,0.02785,0.014228,-0.375983,1.0,0.592855,-0.049107,0.391452,0.522897
YearRemodAdd,0.040581,0.013788,0.073741,0.592855,1.0,-0.067759,0.291066,0.507101
LotArea,-0.139781,1.0,-0.005636,0.014228,0.013788,0.11117,0.260833,0.263843
BsmtFinSF2,-0.065649,0.11117,0.040229,-0.049107,-0.067759,1.0,0.10481,-0.011378
OverallCond,-0.059316,-0.005636,1.0,-0.375983,0.073741,0.040229,-0.171098,-0.077856
MSSubClass,1.0,-0.139781,-0.059316,0.02785,0.040581,-0.065649,-0.238518,-0.084284


### Drop irrelevent coloumns 

In [5]:
categorical_data.columns

Index(['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st'], dtype='object')

In [6]:
numerical_data.columns

Index(['MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'BsmtFinSF2', 'TotalBsmtSF', 'SalePrice'],
      dtype='object')

In [7]:
from scipy.stats import chi2_contingency

# Calculate chi-square test for each categorical column
chi2_results = {}
for column in categorical_data.columns:
    contingency_table = pd.crosstab(categorical_data[column], categorical_data['Exterior1st'])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    chi2_results[column] = {'chi2': chi2, 'p_value': p_value}

# Print chi-square test results
for column, result in chi2_results.items():
    print(f"\nChi-square test for {column}:")
    print(f"Chi-square statistic: {result['chi2']:.4f}")
    print(f"P-value: {result['p_value']:.4f}")



Chi-square test for MSZoning:
Chi-square statistic: 240.3641
P-value: 0.0000

Chi-square test for LotConfig:
Chi-square statistic: 72.0282
P-value: 0.0733

Chi-square test for BldgType:
Chi-square statistic: 210.1977
P-value: 0.0000

Chi-square test for Exterior1st:
Chi-square statistic: 20440.0000
P-value: 0.0000


### Choose important variables to be trained and tested

In [None]:
'YearRemodAdd', 'LotArea',

In [8]:
data_chose = data[[ 'MSZoning',  'LotConfig', 'BldgType',
       'OverallCond','YearBuilt','MSSubClass', 'Exterior1st', 'BsmtFinSF2',
       'TotalBsmtSF', 'SalePrice']]
## Define features and target variable
x = data_chose.drop('SalePrice', axis=1)
y = data_chose['SalePrice']

### encoding categorical columns

In [17]:

# Apply pd.get_dummies() only to the existing columns
X_encoded = pd.get_dummies(x, columns=categorical_data.columns)
X_encoded

Unnamed: 0,OverallCond,YearBuilt,MSSubClass,BsmtFinSF2,TotalBsmtSF,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,...,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing
0,5,2003,60,0.0,856.0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1,8,1976,20,0.0,1262.0,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2,5,2001,60,0.0,920.0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,5,1915,70,0.0,756.0,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
4,5,2000,60,0.0,1145.0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,5,1999,60,0.0,953.0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1456,6,1978,20,163.0,1542.0,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
1457,9,1941,70,0.0,1152.0,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
1458,6,1950,20,1029.0,1078.0,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False


In [18]:
# apply pca to the data
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_encoded)
X_pca

array([[-202.38712422,  -38.23847171,    3.76690892],
       [ 203.15711199,  -55.21693413,  -32.65549508],
       [-138.54437171,  -41.02440802,    4.60757274],
       ...,
       [  91.23300133,  -50.23088723,    9.19294682],
       [  64.40819181,  981.14783225,  -25.42860532],
       [ 209.73783935,  234.90429015,  -30.36455868]])

### Perform Train Test Split

In [19]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# normalize the data

scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

## Machine Learning models testing

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Assuming X_encoded and y are already defined

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features
scaler_standard = StandardScaler()
X_train_scaled = scaler_standard.fit_transform(X_train)
X_test_scaled = scaler_standard.transform(X_test)

# Normalize the data
scaler_minmax = MinMaxScaler()
X_train_normalized = scaler_minmax.fit_transform(X_train)
X_test_normalized = scaler_minmax.transform(X_test)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}

# Function to evaluate model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mae, mse, r2

# Run models and print results
for name, model in models.items():
    print(f"\n{name}:")
    
    # Standardized data
    mae_scaled, mse_scaled, r2_scaled = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    print(f"Standardized - MAE: {mae_scaled:.2f}, MSE: {mse_scaled:.2f}, R2: {r2_scaled:.2f}")
    
    # Normalized data
    mae_normalized, mse_normalized, r2_normalized = evaluate_model(model, X_train_normalized, X_test_normalized, y_train, y_test)
    print(f"Normalized - MAE: {mae_normalized:.2f}, MSE: {mse_normalized:.2f}, R2: {r2_normalized:.2f}")


Linear Regression:
Standardized - MAE: 36345.20, MSE: 3146433243.18, R2: 0.59
Normalized - MAE: 40756.74, MSE: 3593364151.93, R2: 0.53

Random Forest:
Standardized - MAE: 25098.15, MSE: 1630467828.81, R2: 0.79
Normalized - MAE: 25043.75, MSE: 1627276556.07, R2: 0.79

Support Vector Machine:
Standardized - MAE: 59542.86, MSE: 7857893106.68, R2: -0.02
Normalized - MAE: 59545.34, MSE: 7857726007.48, R2: -0.02

K-Nearest Neighbors:
Standardized - MAE: 33956.92, MSE: 3094162179.82, R2: 0.60
Normalized - MAE: 33595.66, MSE: 3125403284.48, R2: 0.59

Decision Tree:
Standardized - MAE: 32997.45, MSE: 2803660836.55, R2: 0.63
Normalized - MAE: 32800.87, MSE: 2786184329.70, R2: 0.64


In [15]:
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Assuming X_encoded and y are already defined

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}

# Define the cross-validation method
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation for each model
for name, model in models.items():
    # Calculate MAE scores
    mae_scores = cross_val_score(model, X_scaled, y, scoring='neg_mean_absolute_error', cv=cv)
    mae_scores = -mae_scores  # sklearn returns negative MAE, so we negate it
    
    # Calculate MSE scores
    mse_scores = cross_val_score(model, X_scaled, y, scoring='neg_mean_squared_error', cv=cv)
    mse_scores = -mse_scores  # sklearn returns negative MSE, so we negate it
    
    # Calculate R2 scores
    r2_scores = cross_val_score(model, X_scaled, y, scoring='r2', cv=cv)
    
    print(f"\n{name}:")
    print(f"MAE: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")
    print(f"MSE: {mse_scores.mean():.2f} (+/- {mse_scores.std() * 2:.2f})")
    print(f"R2: {r2_scores.mean():.2f} (+/- {r2_scores.std() * 2:.2f})")


Linear Regression:
MAE: 42472501735825904.00 (+/- 109894658576726752.00)
MSE: 986087575396164867510986259017236480.00 (+/- 2563945843659643917728900761939607552.00)
R2: -152692332388403915048091648.00 (+/- 403244565560518121141305344.00)

Random Forest:
MAE: 24375.22 (+/- 2594.81)
MSE: 1651055773.50 (+/- 743135883.72)
R2: 0.73 (+/- 0.16)

Support Vector Machine:
MAE: 55631.24 (+/- 5669.93)
MSE: 6629675171.60 (+/- 1731315131.61)
R2: -0.05 (+/- 0.04)

K-Nearest Neighbors:
MAE: 32658.99 (+/- 3211.74)
MSE: 2741292719.60 (+/- 853414722.66)
R2: 0.56 (+/- 0.09)

Decision Tree:
MAE: 32533.06 (+/- 6548.29)
MSE: 3073814659.68 (+/- 1918398396.60)
R2: 0.51 (+/- 0.28)


Let's analyze these cross-validation results for each model:

Linear Regression:
The results for Linear Regression are extremely large and negative, which indicates a serious problem. This could be due to:

Multicollinearity in the data
Outliers significantly affecting the model
Numerical instability or overflow issues
This model is clearly not suitable for the task in its current form.


Random Forest:

MAE: 24375.22 (+/- 2594.81)
MSE: 1651055773.50 (+/- 743135883.72)
R2: 0.73 (+/- 0.16)
This is the best performing model. It has the lowest MAE and MSE, and the highest R2 score. The R2 of 0.73 indicates that the model explains about 73% of the variance in the target variable. The relatively small standard deviations suggest consistent performance across folds.


Support Vector Machine (SVM):

MAE: 55631.24 (+/- 5669.93)
MSE: 6629675171.60 (+/- 1731315131.61)
R2: -0.05 (+/- 0.04)
The negative R2 score indicates that this model performs worse than a horizontal line. It's not suitable for this problem without significant tuning or feature engineering.


K-Nearest Neighbors:

MAE: 32658.99 (+/- 3211.74)
MSE: 2741292719.60 (+/- 853414722.66)
R2: 0.56 (+/- 0.09)
This model performs reasonably well, explaining about 56% of the variance. It's not as good as Random Forest but better than SVM and Decision Tree.


Decision Tree:

MAE: 32533.06 (+/- 6548.29)
MSE: 3073814659.68 (+/- 1918398396.60)
R2: 0.51 (+/- 0.28)
The Decision Tree model explains about 51% of the variance. However, it has the highest standard deviation in R2 score, indicating inconsistent performance across folds.



Recommendations based on these results:

Random Forest is clearly the best model and should be your primary choice. It performs well and consistently.
The Linear Regression results indicate a problem that needs investigation. Check for multicollinearity, outliers, or data preprocessing issues.
The SVM model is not suitable in its current form. It might require extensive feature scaling, kernel selection, or hyperparameter tuning.
K-Nearest Neighbors performs decently and could be a secondary model to consider.
The Decision Tree model, while performing okay on average, shows high variability across folds. This suggests it might be overfitting to specific subsets of the data.
Given that tree-based models (Random Forest and Decision Tree) are performing well, your data likely has complex, non-linear relationships. Feature engineering to capture these relationships might improve performance further.
For the Random Forest model, you could try increasing the number of trees (n_estimators) and perform hyperparameter tuning to potentially improve its performance even more.

Remember, these results give you a more reliable estimate of how these models would perform on unseen data, which is crucial for real-world applications.

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Function to remove outliers using IQR method
def remove_outliers(X, y):
    Q1 = y.quantile(0.25)
    Q3 = y.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    mask = (y >= lower_bound) & (y <= upper_bound)
    return X[mask], y[mask]

# Remove outliers
X_encoded_no_outliers, y_no_outliers = remove_outliers(X_encoded, y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded_no_outliers, y_no_outliers, test_size=0.2, random_state=42)

# Standardize the features
scaler_standard = StandardScaler()
X_train_scaled = scaler_standard.fit_transform(X_train)
X_test_scaled = scaler_standard.transform(X_test)

# Normalize the data
scaler_minmax = MinMaxScaler()
X_train_normalized = scaler_minmax.fit_transform(X_train)
X_test_normalized = scaler_minmax.transform(X_test)

# Define models
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Lasso': Lasso(alpha=1.0, random_state=42)
}

# Function to evaluate model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, mae, r2

# Run models and print results
for name, model in models.items():
    print(f"\n{name}:")
    
    # Standardized data
    mse_scaled, mae_scaled, r2_scaled = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    print(f"Standardized - MSE: {mse_scaled:.2f}, MAE: {mae_scaled:.2f}, R2: {r2_scaled:.2f}")
    
    # Normalized data
    mse_normalized, mae_normalized, r2_normalized = evaluate_model(model, X_train_normalized, X_test_normalized, y_train, y_test)
    print(f"Normalized - MSE: {mse_normalized:.2f}, MAE: {mae_normalized:.2f}, R2: {r2_normalized:.2f}")

# Create a DataFrame with the results
results = []
for name, model in models.items():
    mse_scaled, mae_scaled, r2_scaled = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    mse_normalized, mae_normalized, r2_normalized = evaluate_model(model, X_train_normalized, X_test_normalized, y_train, y_test)
    results.append({
        'Model': name,
        'Standardized MSE': mse_scaled,
        'Standardized MAE': mae_scaled,
        'Standardized R2': r2_scaled,
        'Normalized MSE': mse_normalized,
        'Normalized MAE': mae_normalized,
        'Normalized R2': r2_normalized
    })

results_df = pd.DataFrame(results)
print("\nResults Summary:")
print(results_df.to_string(index=False))


Random Forest:
Standardized - MSE: 849094074.51, MAE: 21083.90, R2: 0.73
Normalized - MSE: 858982695.26, MAE: 21188.94, R2: 0.73

Gradient Boosting:
Standardized - MSE: 857887594.20, MAE: 21591.95, R2: 0.73
Normalized - MSE: 858914256.49, MAE: 21613.59, R2: 0.73

Decision Tree:
Standardized - MSE: 1311002645.74, MAE: 25955.44, R2: 0.59
Normalized - MSE: 1293444252.88, MAE: 25590.08, R2: 0.59

K-Nearest Neighbors:
Standardized - MSE: 1163676553.62, MAE: 25279.52, R2: 0.63
Normalized - MSE: 1224150067.38, MAE: 25429.62, R2: 0.62

Lasso:


  model = cd_fast.enet_coordinate_descent(


Standardized - MSE: 1325509768.03, MAE: 28313.01, R2: 0.58
Normalized - MSE: 1325514374.17, MAE: 28311.11, R2: 0.58

Results Summary:
              Model  Standardized MSE  Standardized MAE  Standardized R2  Normalized MSE  Normalized MAE  Normalized R2
      Random Forest      8.490941e+08      21083.902044         0.733058    8.589827e+08    21188.936604       0.729949
  Gradient Boosting      8.578876e+08      21591.945400         0.730293    8.589143e+08    21613.587814       0.729971
      Decision Tree      1.311003e+09      25955.439286         0.587841    1.293444e+09    25590.082143       0.593361
K-Nearest Neighbors      1.163677e+09      25279.522143         0.634158    1.224150e+09    25429.615714       0.615146
              Lasso      1.325510e+09      28313.011425         0.583280    1.325514e+09    28311.107189       0.583279


  model = cd_fast.enet_coordinate_descent(


In [None]:
# add prediction to the train data and test data to see if the model is predicting well

lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)
pred = lin_reg.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", lin_reg.score(X_test_scaled, y_test))


In [None]:
pred_train = lin_reg.predict(X_train_scaled)

print("MAE", mean_absolute_error(pred_train, y_train))
print("RMSE", mean_squared_error(pred_train, y_train, squared=False))
print("R2 score", lin_reg.score(X_train_scaled, y_train))

## Ensemble Methods

In [30]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Create the ensemble models create a list of models
random_forest = RandomForestRegressor()
adaboost = AdaBoostRegressor()
gradient_boosting = GradientBoostingRegressor()
bagging_regressor = BaggingRegressor(base_estimator=RandomForestRegressor(), n_estimators=10, bootstrap=True, random_state=42)
pasting_reg = BaggingRegressor(
  estimator=DecisionTreeRegressor(max_depth=20),
  n_estimators=100,
  max_samples=1000,
  bootstrap=False  # This ensures that no bootstrap is done, which is characteristic of Pasting
)

# Train the ensemble models
random_forest.fit(X_train_scaled, y_train)
adaboost.fit(X_train_scaled, y_train)
gradient_boosting.fit(X_train_scaled, y_train)
bagging_regressor.fit(X_train_scaled, y_train)
pasting_reg.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_random_forest = random_forest.predict(X_test_scaled)
y_pred_adaboost = adaboost.predict(X_test_scaled)
y_pred_gradient_boosting = gradient_boosting.predict(X_test_scaled)
y_pred_bagging = bagging_regressor.predict(X_test_scaled)
pred = pasting_reg.predict(X_test_scaled)

# Print the results
print("Random Forest:")
print(f"MSE: {mean_squared_error(y_test, y_pred_random_forest):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_random_forest):.4f}")
print(f"R^2: {r2_score(y_test, y_pred_random_forest):.4f}")

print("\nAdaBoost:")
print(f"MSE: {mean_squared_error(y_test, y_pred_adaboost):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_adaboost):.4f}")
print(f"R^2: {r2_score(y_test, y_pred_adaboost):.4f}")

print("\nGradient Boosting:")
print(f"MSE: {mean_squared_error(y_test, y_pred_gradient_boosting):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_gradient_boosting):.4f}")
print(f"R^2: {r2_score(y_test, y_pred_gradient_boosting):.4f}")

print("\nBagging Regressor:")
print(f"MSE: {mean_squared_error(y_test, y_pred_bagging):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_bagging):.4f}")
print(f"R^2: {r2_score(y_test, y_pred_bagging):.4f}")

print("\nPasting Regressor:")
print("MAE:", mean_absolute_error(pred, y_test))
print("RMSE:", mean_squared_error(pred, y_test, squared=False))
print("R2 score:", pasting_reg.score(X_test_scaled, y_test))




Random Forest:
MSE: 1608095481.4361
MAE: 25353.8866
R^2: 0.7903

AdaBoost:
MSE: 2897093795.9062
MAE: 43203.3078
R^2: 0.6223

Gradient Boosting:
MSE: 1295942946.6794
MAE: 24308.3558
R^2: 0.8310

Bagging Regressor:
MSE: 1508779872.9491
MAE: 24325.2875
R^2: 0.8033

Pasting Regressor:
MAE: 26998.06043969391
RMSE: 43540.71993519721
R2 score: 0.752840460592287
