In [10]:
import pandas as pd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import numpy as np

In [11]:
# Load the data
data = pd.read_csv('/content/sqm_data.csv')

# Drop specified columns
data.drop(columns=['id', 'type', 'propertytype', 'flatnumber', 'state', 'geom',
                   'street', 'property_id-2','streetnumber', 'suburb', 'latitude', 'longitude'], inplace=True)

# Group by 'property_id' and calculate the mean price
price_mean_by_property = data.groupby('property_id')['price'].transform('mean')

# Fill NaN values in 'price' with the calculated mean for each 'property_id'
data['price'].fillna(price_mean_by_property, inplace=True)

# Drop rows where 'price' has NaN values
data.dropna(subset=['price'], inplace=True)

# Fill NaN values in 'carspaces' with 0
data['carspaces'].fillna(0, inplace=True)

# Drop rows where 'area' column has NaN values
data.dropna(subset=['area'], inplace=True)

# One-Hot Encoding for 'streettype'
data = pd.get_dummies(data, columns=['streettype'], prefix='streettype', drop_first=True)

# Convert 'date' to datetime and extract 'year', 'month', and 'day'
data['date'] = pd.to_datetime(data['date'], errors='coerce')
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

# Drop the 'date' column
data.drop(columns=['date'], inplace=True)

# Standardize 'price' and 'area'
scaler = StandardScaler()
data[['price', 'area']] = scaler.fit_transform(data[['price', 'area']])

# Target encoding for 'property_id'
category_means = data.groupby('property_id')['price'].mean()
data['property_id_target_encoded'] = data['property_id'].map(category_means)

# Drop original 'property_id' column after encoding
data.drop(columns=['property_id'], inplace=True)

# Split the data
X = data.drop(columns=['price'])
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['price'].fillna(price_mean_by_property, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['carspaces'].fillna(0, inplace=True)
  data['date'] = pd.to_datetime(data['date'], errors='coerce')


In [14]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

# Assuming the previous steps to process and split the data are done
# Your X_train, X_test, y_train, y_test are ready

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

# Dictionary to store results
results = {}

# Fit each model and calculate performance metrics
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions for test data
    y_pred_test_scaled = model.predict(X_test)
    # Predictions for train data
    y_pred_train_scaled = model.predict(X_train)

    # Inverse transform predictions and actual prices for test data
    y_pred_test_original = scaler.inverse_transform(
        np.column_stack((y_pred_test_scaled, np.zeros(len(y_pred_test_scaled))))
    )[:, 0]
    y_test_original = scaler.inverse_transform(
        np.column_stack((y_test.values, np.zeros(len(y_test))))
    )[:, 0]

    # Inverse transform predictions and actual prices for train data
    y_pred_train_original = scaler.inverse_transform(
        np.column_stack((y_pred_train_scaled, np.zeros(len(y_pred_train_scaled))))
    )[:, 0]
    y_train_original = scaler.inverse_transform(
        np.column_stack((y_train.values, np.zeros(len(y_train))))
    )[:, 0]

    # Calculate metrics for test data
    mse_test = mean_squared_error(y_test_original, y_pred_test_original)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test_original, y_pred_test_original)
    mae_test = mean_absolute_error(y_test_original, y_pred_test_original)

    # Calculate metrics for train data
    mse_train = mean_squared_error(y_train_original, y_pred_train_original)
    r2_train = r2_score(y_train_original, y_pred_train_original)

    # Store the results
    results[model_name] = {
        "MSE Test": mse_test,
        "RMSE Test": rmse_test,
        "R^2 Test": r2_test,
        "MAE Test": mae_test,
        "R^2 Train": r2_train
    }

# Print results
print("Performance metrics for each model:")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value}")


Performance metrics for each model:

Linear Regression:
  MSE Test: 32001173325.237865
  RMSE Test: 178888.71771366094
  R^2 Test: 0.8160020943003649
  MAE Test: 109239.97202797746
  R^2 Train: 0.814986323399788

Random Forest:
  MSE Test: 20290710749.726017
  RMSE Test: 142445.46587984476
  R^2 Test: 0.8833340188760435
  MAE Test: 73266.86827658988
  R^2 Train: 0.975453346560144

Gradient Boosting:
  MSE Test: 19699994738.339184
  RMSE Test: 140356.66973229017
  R^2 Test: 0.8867304727452114
  MAE Test: 79341.42287314447
  R^2 Train: 0.9095336472209172

XGBoost:
  MSE Test: 51047869502.64581
  RMSE Test: 225937.755814839
  R^2 Test: 0.7064888532850291
  MAE Test: 80167.5287494293
  R^2 Train: 0.9848576111098014


In [17]:
# Initialize a dictionary to store actual vs predicted values for each model
predictions_vs_actual = {}

# Predicting actual prices for each model and comparing them
for model_name, model in models.items():
    # Predict on the test data
    y_pred_test_scaled = model.predict(X_test)

    # Inverse transform predictions and actual prices to get back to original scale
    y_pred_test_original = scaler.inverse_transform(
        np.column_stack((y_pred_test_scaled, np.zeros(len(y_pred_test_scaled))))
    )[:, 0]
    y_test_original = scaler.inverse_transform(
        np.column_stack((y_test.values, np.zeros(len(y_test))))
    )[:, 0]

    # Store the results in the dictionary
    predictions_vs_actual[model_name] = {
        "Actual Prices": y_test_original.tolist(),
        "Predicted Prices": y_pred_test_original.tolist()
    }

# Print sample results (10 values) for each model
for model_name, values in predictions_vs_actual.items():
    print(f"\n{model_name}:")
    print("Sample of 10 Actual Prices vs Predicted Prices:")
    for actual, predicted in zip(values["Actual Prices"][:10], values["Predicted Prices"][:10]):
        print(f"  Actual: {actual:.2f}, Predicted: {predicted:.2f}")



Linear Regression:
Sample of 10 Actual Prices vs Predicted Prices:
  Actual: 420000.00, Predicted: 763147.69
  Actual: 715000.00, Predicted: 789303.34
  Actual: 250000.00, Predicted: 406026.40
  Actual: 120000.00, Predicted: 79483.96
  Actual: 312500.00, Predicted: 284921.37
  Actual: 347000.00, Predicted: 335444.18
  Actual: 253000.00, Predicted: 205063.49
  Actual: 375000.00, Predicted: 506500.07
  Actual: 235000.00, Predicted: 214339.15
  Actual: 402000.00, Predicted: 387761.63

Random Forest:
Sample of 10 Actual Prices vs Predicted Prices:
  Actual: 420000.00, Predicted: 589556.40
  Actual: 715000.00, Predicted: 742503.88
  Actual: 250000.00, Predicted: 324675.00
  Actual: 120000.00, Predicted: 141030.00
  Actual: 312500.00, Predicted: 370545.50
  Actual: 347000.00, Predicted: 318471.60
  Actual: 253000.00, Predicted: 250687.16
  Actual: 375000.00, Predicted: 416730.00
  Actual: 235000.00, Predicted: 253145.50
  Actual: 402000.00, Predicted: 400125.57

Gradient Boosting:
Sample of