In [1]:
# We first import all the necessary libraries required
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
##sns.set()
# and of course the actual regression (machine learning) module
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

In [2]:
# Load the housing data; the first CSV column is used as the row index
data = pd.read_csv('Data.csv', index_col=0)

# Explore the data: first rows, dimensions, summary statistics, column dtypes
print(data.head())
print(data.shape)
print(data.describe())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
data.info()

                       date     price  bedrooms  bathrooms  sqft_living  \
id                                                                        
7129300520  20141013T000000  221900.0         3       1.00         1180   
6414100192  20141209T000000  538000.0         3       2.25         2570   
5631500400  20150225T000000  180000.0         2       1.00          770   
2487200875  20141209T000000  604000.0         4       3.00         1960   
1954400510  20150218T000000  510000.0         3       2.00         1680   

            sqft_lot  floors  waterfront  view  condition  grade  sqft_above  \
id                                                                             
7129300520      5650     1.0           0     0          3      7        1180   
6414100192      7242     2.0           0     0          3      7        2170   
5631500400     10000     1.0           0     0          3      6         770   
2487200875      5000     1.0           0     0          5      7        10

In [3]:
# Count the null entries in every column (all zeros means nothing to impute)
missing_per_column = data.isnull().sum()
print(missing_per_column)

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64


There were no missing values in our data.

In [4]:
# Split the frame by dtype for further analysis. Despite the "_cols" suffix,
# both results are DataFrames (sub-frames of `data`), not lists of column names.
NUMERIC_KINDS = ['int', 'float']
numerical_cols = data.select_dtypes(include=NUMERIC_KINDS)
categorical_cols = data.select_dtypes(include='object')

In [5]:
# Pairwise correlation between the numerical columns; show only the "price" column
correlation_matrix = numerical_cols.corr()
correlation_matrix["price"]

price            1.000000
bedrooms         0.308350
bathrooms        0.525138
sqft_living      0.702035
sqft_lot         0.089661
floors           0.256794
waterfront       0.266369
view             0.397293
condition        0.036362
grade            0.667434
sqft_above       0.605567
sqft_basement    0.323816
yr_built         0.054012
yr_renovated     0.126434
zipcode         -0.053203
lat              0.307003
long             0.021626
sqft_living15    0.585379
sqft_lot15       0.082447
Name: price, dtype: float64

In [7]:
# The target is the sale price; the predictors are the remaining numerical
# columns minus a handful that are deliberately left out of the model.
TARGET = 'price'
EXCLUDED_FEATURES = ['floors', 'waterfront', 'view', 'grade']
y = numerical_cols[TARGET]
X = numerical_cols.drop(columns=[TARGET] + EXCLUDED_FEATURES)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Next, we apply feature scaling to the independent variables. This puts features that are measured on very different scales (for example, square footage versus number of bedrooms) on a comparable scale.
We use standardization, where each feature is centered on its mean and rescaled to have a standard deviation of 1.

In [9]:
# Learn the scaling statistics from the training split only, so that no
# information from the test split leaks into the preprocessing step.
scaler = StandardScaler().fit(x_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(x_train)
X_test_scaled = scaler.transform(x_test)

We first try multiple linear regression and check its efficiency.

In [10]:
# statsmodels' OLS does not add an intercept on its own, so a constant
# column is added to the scaled training features first.
X_train_with_intercept = sm.add_constant(X_train_scaled)

# Build and fit the ordinary-least-squares model on the training split
ols_model = sm.OLS(endog=y_train, exog=X_train_with_intercept)
ols_result = ols_model.fit()

# Show the full regression report (coefficients, p-values, R-squared, ...)
regression_report = ols_result.summary()
print(regression_report)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.631
Model:                            OLS   Adj. R-squared:                  0.631
Method:                 Least Squares   F-statistic:                     2272.
Date:                Fri, 28 Jul 2023   Prob (F-statistic):               0.00
Time:                        12:31:16   Log-Likelihood:            -2.3719e+05
No. Observations:               17290   AIC:                         4.744e+05
Df Residuals:                   17276   BIC:                         4.745e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.378e+05   1670.575    321.906      0.0

The above table shows an Adj. R-squared of 0.631. This measures the goodness of fit of the model: it is acceptable but could be much better. Some variables have a p-value greater than 0.05 and are therefore not statistically significant at the 5% level; the remaining variables have p-values below 0.05, which shows that they are significant.

In [12]:
# Initialize the LinearRegression model
mlr_model = LinearRegression()

# Train the model on the scaled training data
mlr_model.fit(X_train_scaled, y_train)

# Make predictions on the scaled *test* data. Predicting on the training
# features here would yield one prediction per training row, which cannot be
# compared with y_test (different number of samples) in the metrics below.
y_pred = mlr_model.predict(X_test_scaled)

# Calculate the Mean Squared Error (MSE) on the held-out test split
mse = mean_squared_error(y_test, y_pred)

# Calculate the R-squared score on the held-out test split
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared score (R2):", r2)


Mean Squared Error (MSE): 56601838487.76647
R-squared score (R2): 0.6255915501615275


In [13]:
# Fit a second linear regression, this time on the unscaled features,
# and predict on the unscaled test split.
reg = LinearRegression()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

# Side-by-side table of true and predicted prices, indexed like y_test
rel = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
print(rel.round(3))

               Actual    Predicted
id                                
2591820310   365000.0   456658.433
7974200820   865000.0   824767.161
7701450110  1038000.0  1062480.099
9522300010  1490000.0  1538664.434
9510861140   711000.0   695585.251
...               ...          ...
6163900333   338000.0   473991.771
3528960020   673000.0   679941.608
1687000220   285000.0   471631.324
4141400030   605000.0   584743.908
1822500160   356500.0   504154.277

[4323 rows x 2 columns]


Since the R-squared is low, we try another machine learning technique. We build a decision tree. Decision trees are simple tree-like structures that break down data into smaller and smaller subsets by making decisions based on feature values. Each leaf node in the tree represents a prediction.

In [15]:
#Importing Decision Tree Regressor library 
from sklearn.tree import DecisionTreeRegressor

In [16]:
# Create the decision tree regressor. The seed is fixed (42, matching the
# train/test split and the XGBoost models) because tree construction is
# randomized, so an unseeded tree can give different scores on each run.
regressor = DecisionTreeRegressor(random_state=42)

# Train the model on the (unscaled) training data
regressor.fit(x_train, y_train)

# Make predictions on the test set
y_pred = regressor.predict(x_test)

In [17]:
# Score the most recent predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 53067103085.51625
R-squared: 0.6489730310092161


The R-squared of the decision tree is better than that of linear regression. However, it could still be improved.

Next, we try XGBoost.

In [18]:
# XGBoost provides the gradient-boosted tree regressor used below
import xgboost as xgb

# Baseline XGBoost regressor: squared-error objective, fixed seed
xgb_settings = {'objective': 'reg:squarederror', 'random_state': 42}
regressor = xgb.XGBRegressor(**xgb_settings)

# Fit on the training split, then predict on the held-out test split
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

# Evaluate on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 27232927230.859016
R-squared: 0.819860302395061


We tune the hyperparameters with a cross-validated grid search.

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9],
}

# Fresh, unfitted regressor that the search clones for each candidate
regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 5-fold cross-validated search on the training split, ranked by
# negative mean squared error
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(x_train, y_train)

# Keep the winning configuration and the corresponding fitted estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Best Hyperparameters:", best_params)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")




Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
Mean Squared Error: 25304526628.481472
R-squared: 0.8326162392955877
