<a href="https://colab.research.google.com/github/Brandon1219/Machine-learning-project/blob/main/Machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**PROJECT DISSERTATION**

Import Requirement

In [12]:
import pandas as pd
import numpy as np

Import Data

In [13]:
data = pd.read_csv("Final_Dis_geo.csv")

# 1. preprocess

## 1.1 Outliner removal

In [None]:
# find the outliers
upper_limit = data['Price'].mean() + 3*  data['Price'].std()
lower_limit = data['Price'].mean() - 3*  data['Price'].std()
print('Upper limit:', upper_limit)
print('Lower limit:', lower_limit)
print('Original number of data:', len(data))

In [None]:
# Remove the outliers
data = data.loc[(data['Price'] < upper_limit) & (data['Price'] > lower_limit)]
print('new number of data:', len(data))

## 1.2 Split data (into train data and test data)

In [19]:
from sklearn.model_selection import train_test_split
x = data.drop(['Price'], axis=1)
y = data['Price']

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
train_data = x_train.join(y_train)

# 2. Information

## 2.1 Basic information for the dataset (for eaxmination purpose)

### 2.1.1 data description

In [None]:
data.describe()

### 2.1.2 Information of attributes

In [None]:
data.info()

### 2.1.3 train dataset

In [None]:
train_data

## 2.2 visualization (Charts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### 2.2.1 Data collection

In [None]:
### Number of Transactions Collected from Each District

# Data
districts = ["Sanmin", "Sinsing", "Cianjin", "Lingya", "Cianjhen", "Yancheng",
             "Zuoying", "Gushan", "Nanzih", "Siaogang"]
entries = [37954, 6678, 6287, 20338, 16077, 2395, 25843, 19821, 30208, 12530]

# Create the bar chart
plt.figure(figsize=(10, 6))
plt.bar(districts, entries, color='lightgreen')
plt.xticks(rotation=45)
plt.xlabel('District')
plt.ylabel('Number of Transactions')
plt.title('Number of Transactions Collected from Each District')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Display the chart
plt.show()

### 2.2.2 Attributes Details

In [None]:
### Price chart
numeric_col = ['Price']
data.boxplot(numeric_col)

In [None]:
### Variable info in Train Data
train_data.hist(figsize= (20,16))

### 2.2.3 Attribute correlation

In [None]:
### Correlation list
train_data.corr()

In [None]:
### Attribute Correlation Chart
plt.figure(figsize=(20, 12))
sns.heatmap(train_data.corr(), annot=True, cmap="YlGnBu", annot_kws={"fontsize": 8})
plt.show()

In [None]:
###  Distribution of transactions
plt.figure(figsize=(15,8))
sns.scatterplot(x="Latitude", y="Longitude", data=train_data, hue="Price", palette="coolwarm")

In [None]:
### Correlation of Attributes to House Price: Comparative Analysis

# Define the attributes and their corresponding correlation values
attributes = ['Date', 'Size', 'Main Building Ratio', 'Lowrise Apt', 'Midrise Apt','Highrise Apt', 'Studio', 'Age', 'Floor Level', 'Building Height',
              'Land', 'Building', 'Parking', 'Bedroom', 'Livingroom', 'Bathroom','Mng', 'Lift', 'Residential', 'Commerce', 'Industry']
correlation_values = [0.13, 0.85, -0.43, -0.33, -0.14, 0.45, -0.22, -0.52, 0.32, 0.46,
                      -0.1, 0.012, 0.67, 0.38, 0.31, 0.35, 0.34, 0.2, 0.17, -0.14, -0.023]

# Sort the attributes and correlation values by their absolute values in descending order
sorted_indices = np.argsort(np.abs(correlation_values))[::-1]
sorted_attributes = [attributes[i] for i in sorted_indices]
sorted_correlation_values = [correlation_values[i] for i in sorted_indices]

# Take the absolute values of correlation values
abs_correlation_values = [abs(val) for val in sorted_correlation_values]

# Create a list of colors based on correlation values
colors = ['blue' if val >= 0 else 'darkblue' for val in sorted_correlation_values]

# Create a bar chart
plt.figure(figsize=(10, 6))
bars = plt.bar(sorted_attributes, abs_correlation_values, color=colors)
plt.xlabel('Attributes')
plt.ylabel('Absolute Correlation')
plt.title('Absolute Correlation of Attributes to House Price')

# Add a legend for the colors
legend_labels = ['Positive Correlation', 'Negative Correlation']
legend_handles = [plt.Rectangle((0, 0), 1, 1, color='blue'), plt.Rectangle((0, 0), 1, 1, color='darkblue')]
plt.legend(legend_handles, legend_labels)

# Customize the appearance of negative correlation bars
for bar, color in zip(bars, colors):
    if color == 'darkblue':
        bar.set_color('darkblue')  # Change the color of negative correlation bars

plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
### MAPE vs Number of Attributes

mape_values = [18.07, 17.75, 17.72, 17.67, 17.86, 18.26, 18.25, 18.49, 19.49, 19.24, 19.54, 19.82, 19.79, 20.89, 20.52, 20.12, 21.24, 21.89, 23.37]
attribute_numbers = list(range(21, 2, -1))

plt.figure(figsize=(10, 5))  # Adjust the figsize to make the chart wider
plt.plot(attribute_numbers, mape_values, marker='o')
plt.xlabel('Number of Attributes')
plt.ylabel('MAPE(%)')
plt.title('MAPE vs Number of Attributes')
plt.xticks(attribute_numbers)
plt.ylim(17, 23)  # Set the maximum y-axis value to 23
plt.grid(True)
plt.show()

In [None]:
### Correlation between attributes and price

attributes = [
    "Latitude", "Longitude", "Date", "Size", "Main Building Ratio", "Lowrise Apt",
    "Midrise Apt", "Highrise Apt", "Studio", "Age", "Floor Level", "Building Height",
    "Land", "Building", "Parking", "Bedroom", "Livingroom", "Bathroom", "Mng", "Lift",
    "Residential", "Commerce", "Industry", "Sanmin", "Sinsing", "Cianjin", "Lingya",
    "Cianjhen", "Yancheng", "Zuoying", "Gushan", "Nanzih", "Siaogang"
]
correlations = [
    -0.484070, -0.193322, 0.127965, 0.878761, -0.362428, -0.278980,
    -0.124519, 0.380570, -0.171659, -0.481923, 0.363415, 0.477665,
    -0.088906, 0.069419, 0.663959, 0.380452, 0.256067, 0.443533, 0.283344,
    0.191338, 0.142253, -0.103103, -0.014118, -0.114364, -0.041034, 0.010374,
    -0.010393, -0.010514, -0.050272, 0.084569, 0.279929, -0.119606, -0.101772
]

# Create the bar chart
plt.figure(figsize=(12, 6))
plt.bar(attributes, correlations, color='skyblue')
plt.xticks(rotation=90)
plt.xlabel('Attributes')
plt.ylabel('Correlation with Price')
plt.title('Correlation between Attributes and Price')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Display the chart
plt.show()

# 3 Machine learning models

## 3.1 Linear Regression

Import Requirements

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

In [40]:
### Linear Regression ###

# Train a LinearRegression model
model = LinearRegression()

# Define the parameter grid for tuning
param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [1, -1],
    'positive': [True, False]
}

# Perform grid search to find the best parameter combination
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

# Get the best model with tuned parameters
best_model = grid_search.best_estimator_

# Predict on the test data using the best model
y_pred = best_model.predict(x_test)

# Calculate R-squared, MAE and MARE, RMSE
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = mean_absolute_percentage_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print results
print("R-squared:", r_squared)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

# Print the best parameter combination found
print("Best Parameters:", grid_search.best_params_)
print("Best Model:", best_model)

R-squared: 0.8640250553223399
MAE: 125.69758614073321
MARE: 0.3051512335057081
RMSE: 176.85159428189627
Best Parameters: {'copy_X': True, 'fit_intercept': False, 'n_jobs': 1, 'positive': False}
Best Model: LinearRegression(fit_intercept=False, n_jobs=1)


## 3.2 Multi-Linear Regression

Import Requirements

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV

In [41]:
### with best parameters

# Create polynomial features
poly = PolynomialFeatures(degree=2)  # Set the degree of the polynomial
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Initialize Linear Regression model
model = LinearRegression(fit_intercept=True, positive=False, copy_X=True, n_jobs=1)
model.fit(x_train_poly, y_train)

# Predict on the test data using the model
y_pred = model.predict(x_test_poly)

# calculate R-squared, MAE, MARE, RMSE
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print results
print("R-squared:", r_squared)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

R-squared: 0.9262566655747689
MAE: 86.7426686716513
MARE: 0.18868144773197848
RMSE: 130.23895993914107


### To find best parameters

In [None]:
### Multi-Linear Regression ### - Find best parameters

# Create polynomial features
poly = PolynomialFeatures(degree=2)  # Set the degree of the polynomial
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Initialize Linear Regression model
model = LinearRegression()

# Define the parameter grid for tuning
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
    'copy_X': [True, False],
    'n_jobs': [1, -1]
}

# Perform grid search to find the best parameter combination
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
grid_search.fit(x_train_poly, y_train)

# Predict on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_poly)

# calculate R-squared, MAE, MARE, RMSE
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print results
print("R-squared:", r_squared)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

# Print the best parameter combination found
print("Best Parameters:", grid_search.best_params_)

R-squared: 0.9278274664583999
MAE: 87.36575497154016
MARE: 0.23593425408538757
RMSE: 131.50026409454077
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'positive': False}


## 3.3 Random Forest

Import Requirements

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [None]:
### with best parameters

# Train a Random Forest model on the data with specified parameters
rf = RandomForestRegressor(n_estimators=300, max_depth=None, min_samples_split=5, min_samples_leaf=2)

# Fit the model to the training data
rf.fit(x_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print Results
print("R-squared:", r2)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

R-squared: 0.9339571817280206
MAE: 82.83741259247722
MARE: 17.293063715875544
RMSE: 124.21421996569441


### To find best parameters

In [None]:
### Random Forest ### - Find best parameters

# Train a Random Forest model on the data
rf = RandomForestRegressor()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search to find the best parameter combination
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

# Get the best model with tuned parameters
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print Results
print("R-squared:", r2)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

# Print the best parameter combination found
print("Best Parameters:", grid_search.best_params_)
print("Best Model:", best_model)

R-squared: 0.9549443664732158
MAE: 63.60980337675091
MARE: 17.84731390490965
RMSE: 101.06379251258646
Best Parameters: {'n_estimators': 300}
Best Model: RandomForestRegressor(n_estimators=300)


## 3.4 XGBoost

Import Requirements



In [42]:
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor

In [49]:
### with best parameters

# Convert the data to DMatrix format
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Initialize the XGBRegressor model
model = XGBRegressor(learning_rate=0.1, max_depth=7, subsample=1.0, colsample_bytree=0.8)

# Train the XGBRegressor model
model.fit(x_train, y_train)

# Make predictions on the test data
y_pred = model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Print Results
print("R-squared with best model:", r2)
print("Mean Absolute Error with best model:", mae)
print("Root Mean Squared Error with best model:", rmse)
print("Mean Absolute Percentage Error with best model:", mare)

R-squared with best model: 0.9556339039086914
Mean Absolute Error with best model: 66.31811649922325
Root Mean Squared Error with best model: 101.01948301393251
Mean Absolute Percentage Error with best model: 14.740174424926543


### To find best parameters

In [48]:
# Convert the data to DMatrix format
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Set the XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train the XGBoost model
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mse)

# Print Results
print("R-squared with best model:", r2)
print("Mean Absolute Error with best model:", mae)
print("Root Mean Squared Error with best model:", rmse)
print("Mean Absolute Percentage Error with best model:", mare)

R-squared with best model: 0.9319873584094023
Mean Absolute Error with best model: 85.32799784017048
Root Mean Squared Error with best model: 101.01948301393251
Mean Absolute Percentage Error with best model: 18.715123086464803


## 3.5 CART

Import Requirements

In [51]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [52]:
### with best parameters

# Train a CART model on the data
cart = DecisionTreeRegressor(max_depth = 15, min_samples_split = 20, min_samples_leaf = 8)
cart.fit(x_train, y_train)

# Make predictions on the test set
y_pred = cart.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Results
print("R-squared:", r2)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

R-squared: 0.9345800986488253
MAE: 80.59481526440551
MARE: 16.895704037343506
RMSE: 122.6689015332635


### To find best parameters

In [None]:
### CART ### - Find best parameters

# Define the parameter grid for tuning
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

# Initialize the CART model
cart = DecisionTreeRegressor()

# Perform grid search to find the best parameter combination
grid_search = GridSearchCV(cart, param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

# Get the best model with tuned parameters
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Results
print("R-squared:", r2)
print("MAE:", mae)
print("MARE:", mare)
print("RMSE:", rmse)

## 3.6 GRNN - Generalized Regression Neural Network (GRNN)

Install requirements

In [None]:
pip install pygam

In [55]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from pygam import LinearGAM
from sklearn.model_selection import ParameterGrid

### Model - with best parameters

In [56]:
### GRNN ### ---with best parameters

# Create the GRNN model
model = LinearGAM(lam=0.1, n_splines=30, max_iter=100, tol=0.0001, terms='auto')

# Train the model
model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Results
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mare)
print("RMSE:", rmse)

R-squared: 0.9100802478923622
Mean Absolute Error: 99.74424719403554
Mean Absolute Percentage Error: 23.090338518277246
RMSE: 143.81595990244438


## To find best parameters

In [None]:
# find best parameters

# Define the parameter grid for tuning
param_grid = {
    'lam': [0.1, 0.01, 0.001],
    'n_splines': [10, 20, 30],
    'max_iter': [100, 200, 300],
    'tol': [1e-4, 1e-5, 1e-6],
}

best_r2 = -np.inf
best_params = {}

# Iterate over the parameter combinations
for params in ParameterGrid(param_grid):
    # Create the GRNN model with the current parameter combination
    model = LinearGAM(**params)
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)

    # Calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # Check if the current model is the best
    if r2 > best_r2:
        best_r2 = r2
        best_params = params

# Rebuild the best model using the best parameters
best_model = LinearGAM(**best_params)
best_model.fit(x_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Calculate evaluation metrics with the best model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print the evaluation metrics and best parameters
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mare)
print("RMSE:", rmse)
print("Best Parameters:", best_params)


# 3.7 ANNs Artificial Neural Networks (ANNs)

Install requirements

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor

In [None]:
# With best parameters

# Create the ANNs model with specified parameters
model = MLPRegressor(hidden_layer_sizes=150, activation='relu', solver='adam', alpha=0.01, learning_rate='adaptive')

# Train the model
model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Resultes
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mare)
print("RMSE:", rmse)

R-squared: 0.8593829553423451
Mean Absolute Error: 130.36525924055027
Mean Absolute Percentage Error: 28.25327041940819
RMSE: 181.24978522281202


# 3.8 Gradient Boosting

Install requirements

In [59]:
pip install scikit-learn



In [60]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [61]:
# with best parameters

# Create the GradientBoostingRegressor model
model = GradientBoostingRegressor(n_estimators=300, max_depth=7, learning_rate=0.1)

# Train the model
model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Resultes
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mare)
print("RMSE:", rmse)

R-squared: 0.9599529070855344
Mean Absolute Error: 61.91162513111234
Mean Absolute Percentage Error: 13.80264555461643
RMSE: 95.9765272255871


### To find best parameters

In [None]:
# to find best parameters

# Create the GradientBoostingRegressor model
model = GradientBoostingRegressor()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Perform grid search to find the best parameter combination
grid_search = GridSearchCV(model, param_grid, scoring='r2', cv=5)
grid_search.fit(x_train, y_train)

# Get the best model with tuned parameters
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Resultes
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mare)
print("RMSE:", rmse)
print("Best Model:", best_model)

## 3.9 Ordinary Least Squares (OLS)

Install requirements

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression

In [None]:
# Create the LinearRegression model
model = LinearRegression()

# Train the model
model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Calculate R-squared, MAE, MARE, RMSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mare = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print Resultes
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mare)
print("RMSE:", rmse)

R-squared: 0.8316824862616343
Mean Absolute Error: 142.79021386701825
Mean Absolute Percentage Error: 31.023226870932337
RMSE: 198.3002086935617
