##### Setting Working Directory

In [1]:
import os
os.getcwd()

'd:\\Python_MachineLearning_Proj\\Regression'

In [2]:
os.chdir(r"C:\Users\thars\Downloads")

##### Train and Compare Multiple Machine Learning Models

In [3]:
# Import necessary libraries
import pandas as pd
data = pd.read_csv(r"C:\Users\thars\Downloads\Real_Estate.csv")

# display the first few rows
print(data.head())

             Transaction date  House age  Distance to the nearest MRT station  \
0  2012-09-02 16:42:30.519336       13.3                            4082.0150   
1  2012-09-04 22:52:29.919544       35.5                             274.0144   
2  2012-09-05 01:10:52.349449        1.1                            1978.6710   
3  2012-09-05 13:26:01.189083       22.2                            1055.0670   
4  2012-09-06 08:29:47.910523        8.5                             967.4000   

   Number of convenience stores   Latitude   Longitude  \
0                             8  25.007059  121.561694   
1                             2  25.012148  121.546990   
2                            10  25.003850  121.528336   
3                             5  24.962887  121.482178   
4                             6  25.011037  121.479946   

   House price of unit area  
0                  6.488673  
1                 24.970725  
2                 26.694267  
3                 38.091638  
4             

In [4]:
# Checking the information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Transaction date                     414 non-null    object 
 1   House age                            414 non-null    float64
 2   Distance to the nearest MRT station  414 non-null    float64
 3   Number of convenience stores         414 non-null    int64  
 4   Latitude                             414 non-null    float64
 5   Longitude                            414 non-null    float64
 6   House price of unit area             414 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 22.8+ KB


In [5]:
# Checking Null values 
data.isnull().sum()

Transaction date                       0
House age                              0
Distance to the nearest MRT station    0
Number of convenience stores           0
Latitude                               0
Longitude                              0
House price of unit area               0
dtype: int64

In [6]:
data.shape

(414, 7)

##### Data Preprocessing 

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import datetime

# Convert Transction date to datetime 
data["Transaction date"] = pd.to_datetime(data["Transaction date"])
data["Transaction year"] = data["Transaction date"].dt.year
data["Transaction month"] = data["Transaction date"].dt.month

# drop the original transction date column as we extracted the relevent features
data = data.drop("Transaction date", axis=1)   # or we can use data.drop(columns = ["Transaction date"])

# define the features and target varible
X = data.drop("House price of unit area", axis=1)
y = data["House price of unit area"]

# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

# Scale the features 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
X_train_scaled.shape

(331, 7)

In [10]:
X_test_scaled.shape

(83, 7)

##### Model Training and Comparision

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# initializing the models
models = {
    "LinearRegression" : LinearRegression(),
    "Decision TreeRegressor" : DecisionTreeRegressor(random_state=42),
    "Random Forest" : GradientBoostingRegressor(random_state=42),
    "Gradient Boosting" : GradientBoostingRegressor(random_state=42)
}

# Dictoniory to hold the evaluation metrics for each model
results = {}

#train and evaluate each model
for name, model in models.items():
    #training themodel
    model.fit(X_train_scaled, y_train)

    #making predictions on test set
    predictions = model.predict(X_test_scaled)

    #calulate evaluation metrics
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    #storing the metrics
    results[name] = {"MAE":mae, "R2":r2}

results_df = pd.DataFrame(results).T  #convets the results to the dataframe for better readiability
print(results_df)

                              MAE        R2
LinearRegression         9.748246  0.529615
Decision TreeRegressor  11.760342  0.204962
Random Forest           10.000117  0.476071
Gradient Boosting       10.000117  0.476071


In [16]:
results

{'LinearRegression': {'MAE': 9.748245872777915, 'R2': 0.5296147783457823},
 'Decision TreeRegressor': {'MAE': 11.760341959956165,
  'R2': 0.20496231879784121},
 'Random Forest': {'MAE': 10.000116968494513, 'R2': 0.4760708438289032},
 'Gradient Boosting': {'MAE': 10.000116968494513, 'R2': 0.4760708438289032}}