In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Location of the cleaned resale dataset. The default is the original absolute
# local path; set the RESALE_DATA_PATH environment variable to load the
# workbook from somewhere else without editing this cell.
DATA_PATH = os.environ.get(
    "RESALE_DATA_PATH",
    r"C:\My Folder\Tuts\Python\Project\Project 6 - Singapore Flat Resale Prices\Code\singapore_flat_resale_cleaned_data.xlsx",
)
df = pd.read_excel(DATA_PATH)

In [None]:
def machine_learning_delivery_date(df, algorithm, test_size=0.2, random_state=None):
    """Fit one regressor on a random train/test split of ``df`` and score it.

    NOTE(review): the name mentions "delivery_date", but the target modelled
    here is the ``resale_price`` column. The name is kept unchanged so that
    existing callers continue to work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``resale_price`` column, which is used as the target.
        Every other column is used as a feature.
    algorithm : callable
        Called with no arguments; must return an estimator exposing
        ``fit``/``predict`` (e.g. a scikit-learn regressor class).
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default None
        Seed for the train/test split and, when the estimator exposes a
        ``random_state`` parameter, for the estimator as well. ``None`` keeps
        the original unseeded behaviour, so results differ between runs.

    Returns
    -------
    dict
        Keys ``'Algorithm'`` (estimator class name), ``'R2'``,
        ``'Mean Absolute Error'``, ``'Mean Squared Error'``,
        ``'Root Mean Squared Error'`` (all computed on the held-out rows)
        and ``'Model'`` (the fitted estimator).
    """
    # `columns=` already selects the axis, so no separate `axis` argument is needed.
    x = df.drop(columns=['resale_price'])
    y = df['resale_price']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    estimator = algorithm()
    # Seed the estimator too, but only when it actually has such a parameter;
    # otherwise set_params would raise for estimators without randomness.
    if random_state is not None and 'random_state' in estimator.get_params():
        estimator.set_params(random_state=random_state)

    model = estimator.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Use the class name directly instead of parsing the class's string repr.
    metrics = {'Algorithm': type(estimator).__name__,
               'R2': r2,
               'Mean Absolute Error': mae,
               'Mean Squared Error': mse,
               'Root Mean Squared Error': rmse,
               'Model': model}

    return metrics

In [None]:
# Candidate regressor classes; each one is trained and scored with its
# default hyper-parameters on the same dataframe.
regressors = [
    LinearRegression,
    DecisionTreeRegressor,
    ExtraTreesRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
]

# One metrics dict per algorithm, in the order listed above.
results = []
for regressor in regressors:
    result = machine_learning_delivery_date(df, regressor)
    results.append(result)
    print(result)



In [None]:
# Tabulate the per-algorithm metrics and draw one bar chart per metric
# on a 2x2 grid so the algorithms can be compared side by side.
results_df = pd.DataFrame(results)
metrics_to_plot = [
    'R2',
    'Mean Absolute Error',
    'Mean Squared Error',
    'Root Mean Squared Error',
]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()

# Pair each subplot with the metric it displays.
for ax, metric in zip(axes, metrics_to_plot):
    ax.bar(results_df['Algorithm'], results_df[metric])
    ax.set_title(metric)
    ax.set_xlabel('Algorithm')
    ax.set_ylabel(metric)
    # Algorithm names are long; tilt them so they do not overlap.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# The Random Forest algorithm clearly gives the best accuracy compared to the other algorithms.
# It is also lower in Mean Absolute Error, Mean Squared Error and Root Mean Squared Error than the other algorithms,
# so let us choose the Random Forest algorithm.

# We shall now fine-tune the algorithm for better accuracy.

In [None]:
# Fine-tuning takes too long to execute, so let us take the Random Forest algorithm as the final model.

In [None]:
# No earlier cell assigns `best_model`, so on a fresh kernel this cell used to
# fail with a NameError. Select the highest-R2 model from `results` here
# (the first one wins on ties) so the cell runs on its own.
best_model = max(results, key=lambda entry: entry['R2'])['Model']

# A single sample given positionally.
# Assumes the values follow the order of the training feature columns
# (every column of `df` except the target) - TODO confirm.
user_data = np.array([[1, 0, 10.4880884817015, 17, 2.23606797749979, 67, 12, 3.46410161513775, 1990]])

# The models were fitted on a DataFrame, so predict on a DataFrame with the
# same column names; a bare ndarray makes scikit-learn warn about missing
# feature names.
feature_columns = df.drop(columns=['resale_price']).columns
user_frame = pd.DataFrame(user_data, columns=feature_columns)

y_predict = best_model.predict(user_frame)
y_predict[0]

In [None]:
import joblib  # also imported in the first cell; repeating it here is harmless

# Scan the collected results and keep the model with the strictly highest R2
# (the first one wins on ties).
best_r2, best_model = -np.inf, None
for result in results:
    score = result['R2']
    if not score > best_r2:
        continue
    best_r2, best_model = score, result['Model']

# Persist the winner only if at least one result beat the initial -inf score.
# NOTE(review): the file name spells "reasle" (probably meant "resale"); it is
# left untouched because other code may load the model by this exact name.
if best_model is not None:
    joblib.dump(best_model, 'singapore_flat_reasle_model.pkl')
    print(f'Best model: {best_model} has been saved')


In [None]:
# A single sample given positionally.
# Assumes the values follow the order of the training feature columns
# (every column of `df` except the target) - TODO confirm.
user_data = np.array([[1, 0, 10.4880884817015, 17, 2.23606797749979, 67, 12, 3.46410161513775, 1990]])

# The models were fitted on a DataFrame, so predict on a DataFrame with the
# same column names; a bare ndarray makes scikit-learn warn about missing
# feature names.
feature_columns = df.drop(columns=['resale_price']).columns
user_frame = pd.DataFrame(user_data, columns=feature_columns)

y_predict = best_model.predict(user_frame)
y_predict[0]