In [1]:
! pip install scikit_learn




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split


In [3]:
# Load the real-estate dataset; the relative path is resolved against the working directory.
data = pd.read_csv(r"Real_Estate.csv")

In [4]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0
mean,18.405072,1064.468233,4.2657,24.973605,121.520268,29.102149
std,11.75767,1196.749385,2.880498,0.024178,0.026989,15.750935
min,0.0,23.38284,0.0,24.932075,121.473888,0.0
25%,9.9,289.3248,2.0,24.952422,121.496866,18.422493
50%,16.45,506.1144,5.0,24.974353,121.520912,30.39407
75%,30.375,1454.279,6.75,24.994947,121.544676,40.615184
max,42.7,6306.153,10.0,25.014578,121.565321,65.571716


In [6]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Transaction date                     414 non-null    object 
 1   House age                            414 non-null    float64
 2   Distance to the nearest MRT station  414 non-null    float64
 3   Number of convenience stores         414 non-null    int64  
 4   Latitude                             414 non-null    float64
 5   Longitude                            414 non-null    float64
 6   House price of unit area             414 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 22.8+ KB


In [7]:
# List the column labels.
data.columns

Index(['Transaction date', 'House age', 'Distance to the nearest MRT station',
       'Number of convenience stores', 'Latitude', 'Longitude',
       'House price of unit area'],
      dtype='object')

### Conversion of date to datetime

In [8]:
# Parse the transaction timestamps (loaded as strings) into pandas datetime values.
data['Transaction date'] = pd.to_datetime(data['Transaction date'])

In [9]:
# Confirm the column now has a datetime dtype.
data['Transaction date'].dtype

dtype('<M8[ns]')

In [10]:
# Re-check the first two rows after the conversion.
data.head(2)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725


In [11]:
# Features / independent variables: every column except the target.
X = data.drop(columns=['House price of unit area'])

In [12]:
# (rows, columns) of the feature frame.
X.shape

(414, 6)

In [13]:
# (rows, columns) of the full frame, for comparison with X.
data.shape

(414, 7)

In [14]:
# Target series: the column the models are trained to predict.
y = data['House price of unit area'] # Dependent variable
y

0       6.488673
1      24.970725
2      26.694267
3      38.091638
4      21.654710
         ...    
409    29.096310
410    33.871347
411    25.255105
412    25.285620
413    37.580554
Name: House price of unit area, Length: 414, dtype: float64

In [15]:
# Rebuild the feature frame from `data` rather than mutating X in place, so this
# cell can be re-run without a KeyError on the already-removed column.
# Both the target and the raw transaction timestamp are excluded from the features.
X = data.drop(columns=['House price of unit area', 'Transaction date'])
X

Unnamed: 0,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude
0,13.3,4082.01500,8,25.007059,121.561694
1,35.5,274.01440,2,25.012148,121.546990
2,1.1,1978.67100,10,25.003850,121.528336
3,22.2,1055.06700,5,24.962887,121.482178
4,8.5,967.40000,6,25.011037,121.479946
...,...,...,...,...,...
409,18.3,170.12890,6,24.981186,121.486798
410,11.9,323.69120,2,24.950070,121.483918
411,0.0,451.64190,8,24.963901,121.543387
412,35.9,292.99780,5,24.997863,121.558286


In [16]:
# Split the data into an 80% training set and a 20% testing set.
# A fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)

In [17]:
# Report the dimensions of each split.
for label, split in (
    ("X_training shape", X_train),
    ("X_testing shape", X_test),
    ("Y_training", y_train),
    ("Y_testing", y_test),
):
    print(f"{label} : {split.shape}")

X_training shape : (331, 5)
X_testing shape : (83, 5)
Y_training : (331,)
Y_testing : (83,)


In [18]:
# Create an (unfitted) instance of the LinearRegression model.
lin_model = LinearRegression()

In [19]:
# Fit the baseline model on the unscaled training data:
# - takes the training features (X_train) and the corresponding targets (y_train)
# - finds the best-fit hyperplane by minimizing the sum of squared residuals
# - computes one coefficient (slope) per feature plus the intercept
# - stores these parameters on the lin_model object
lin_model.fit(X_train,y_train)

In [20]:
# Training data evaluation

y_train_pred = lin_model.predict(X_train)  # predictions of the fitted model on the training features
base_train_mse = mean_squared_error(y_train,y_train_pred)  # mean of the squared differences between actual and predicted values
base_train_mse

# Compare this with the test-set MSE: a training MSE far below the test MSE suggests overfitting.

117.36441878463525

In [21]:
# Test data evaluation: same metric, computed on rows the model did not see during fitting.
y_test_pred = lin_model.predict(X_test)
base_test_mse = mean_squared_error(y_test,y_test_pred)
base_test_mse

# If the test MSE is much higher than the training MSE, it suggests overfitting.

124.60907371015156

In [22]:
# One-row comparison table for the baseline model; further models are added as extra rows.
base_metrics = {
    'Train MSE': [base_train_mse],
    'Test MSE': [base_test_mse],
    'Intercept': [lin_model.intercept_],
}
metrics_df = pd.DataFrame(base_metrics, index=['Base Model'])
metrics_df


Unnamed: 0,Train MSE,Test MSE,Intercept
Base Model,117.364419,124.609074,2816.995306


In [23]:
# Retrieve the intercept of the fitted baseline model (its prediction when every feature is zero).
lin_model.intercept_

2816.995306454353

In [24]:
# Coefficients of the baseline model, one per feature column.

coefficients = lin_model.coef_
coefficients

array([ 1.25074957e-03, -8.35130068e-03,  1.81682766e+00,  6.43423914e+01,
       -3.61575541e+01])

In [25]:
# Pair each feature name with its baseline coefficient.
coeff_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Base_coefficient': coefficients,
    }
)

In [26]:
# Display the feature/coefficient table.
coeff_df

Unnamed: 0,Feature,Base_coefficient
0,House age,0.001251
1,Distance to the nearest MRT station,-0.008351
2,Number of convenience stores,1.816828
3,Latitude,64.342391
4,Longitude,-36.157554


In [27]:
# Fit a base linear regression model
# Explore the training and test metrics
# Explore the coefficients and intercept
# Standardize the features with a StandardScaler, refit the model, and repeat the steps above, comparing the metrics.

### Scaled Model- Standardization

In [28]:
# The scaler is fitted on the training split only; the test split is transformed
# with those same fitted statistics rather than being fitted separately.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [29]:

# Instantiate a second linear model, to be trained on the standardized features.
lin_std_model = LinearRegression()
lin_std_model

In [30]:
# Fit the model on the standardized training features.
lin_std_model.fit(X_train_scaled, y_train)

In [31]:
# Evaluate the training metrics of the standardized model.
y_train_std_pred = lin_std_model.predict(X_train_scaled)
std_train_mse = mean_squared_error(y_train, y_train_std_pred)
std_train_mse

117.36441878463549

In [32]:
# Evaluate the testing metrics of the standardized model.
# NOTE: despite its `_train` suffix, `y_test_std_train` holds predictions for the test split.
y_test_std_train = lin_std_model.predict(X_test_scaled)
std_test_mse = mean_squared_error(y_test, y_test_std_train)
std_test_mse


124.6090737101526

In [33]:
# Intercept of the model fitted on standardized features.
lin_std_model.intercept_

28.75831387077685

In [34]:
# Coefficients of the model fitted on standardized features, one per feature column.
std_coeff = lin_std_model.coef_
std_coeff

array([  0.01484447, -10.08750227,   5.06902708,   1.54554389,
        -0.95199227])

In [35]:
# Add the standardized-model coefficients as a new column next to the baseline ones.
coeff_df['std_coeff'] = std_coeff
coeff_df

Unnamed: 0,Feature,Base_coefficient,std_coeff
0,House age,0.001251,0.014844
1,Distance to the nearest MRT station,-0.008351,-10.087502
2,Number of convenience stores,1.816828,5.069027
3,Latitude,64.342391,1.545544
4,Longitude,-36.157554,-0.951992


In [36]:
# Add the standardized model as a second row; the values follow the existing
# column order (Train MSE, Test MSE, Intercept).
metrics_df.loc['Std Mse'] = [std_train_mse, std_test_mse, lin_std_model.intercept_]
metrics_df


Unnamed: 0,Train MSE,Test MSE,Intercept
Base Model,117.364419,124.609074,2816.995306
Std Mse,117.364419,124.609074,28.758314


## Polynomial Regression

### Challenge of the day:
Fit a polynomial regression model

Employ grid search to figure out the best value of the degree/order


In [37]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

### Alternative 1

In [38]:
degrees = [1, 2, 3]
models = []  # fitted estimators, in the same order as `degrees`

for degree in degrees:
    # Expand the features into polynomial terms up to `degree`; the expansion is
    # fitted on the training split and then applied unchanged to the test split.
    polynomial_features = PolynomialFeatures(degree)
    X_train_poly = polynomial_features.fit_transform(X_train)
    X_test_poly = polynomial_features.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    # training metric
    y_train_pred = model.predict(X_train_poly)
    train_mse = mean_squared_error(y_train, y_train_pred)

    # testing metric
    y_test_pred = model.predict(X_test_poly)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Store the fitted estimator itself so it can be inspected after the loop.
    models.append(model)

    # print
    print(f"Degree {degree}:")
    print(f"Training MSE: {train_mse}")
    print(f"Testing MSE: {test_mse}")
    print(model)
    
    

Degree 1:
Training MSE: 117.36441878463528
Testing MSE: 124.6090737101455
LinearRegression()
Degree 2:
Training MSE: 110.46907028234752
Testing MSE: 135.27910292419926
LinearRegression()
Degree 3:
Training MSE: 212.25798167934366
Testing MSE: 293.95164277882685
LinearRegression()


### Alternative 2: using make_pipeline

In [39]:
def polynomial_regression(X,y,degree):
    """Fit a polynomial regression and return the fitted pipeline.

    Parameters:
        X: feature matrix passed to the pipeline's ``fit``.
        y: target values passed to the pipeline's ``fit``.
        degree: degree handed to ``PolynomialFeatures``.

    Returns:
        The fitted pipeline (``PolynomialFeatures`` followed by ``LinearRegression``).
    """
    expansion = PolynomialFeatures(degree)
    regressor = LinearRegression()
    # Pipeline.fit returns the pipeline itself, so the fitted object is returned directly.
    return make_pipeline(expansion, regressor).fit(X, y)


degrees = [1, 2, 3, 4]
models = []  # fitted pipelines, filled by grid_search() in the same order as `degrees`

def grid_search():
    """Fit one polynomial pipeline per entry in `degrees` and print its train/test MSE.

    Reads the module-level `degrees`, `X_train`, `X_test`, `y_train` and `y_test`,
    and appends every fitted pipeline to the module-level `models` list.
    Returns nothing; the results are reported via print.
    """
    for degree in degrees:
        model = polynomial_regression(X_train, y_train, degree)

        # training metric
        y_train_pred = model.predict(X_train)
        train_mse = mean_squared_error(y_train, y_train_pred)

        # testing metric
        y_test_pred = model.predict(X_test)
        test_mse = mean_squared_error(y_test, y_test_pred)

        # Store the fitted pipeline itself so it can be inspected afterwards.
        models.append(model)

        # print
        print(f"Degree {degree}:")
        print(f"Training MSE: {train_mse}")
        print(f"Testing MSE: {test_mse}")
        print(model)

grid_search()
    

Degree 1:
Training MSE: 117.36441878463528
Testing MSE: 124.6090737101455
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=1)),
                ('linearregression', LinearRegression())])
Degree 2:
Training MSE: 110.46907028234752
Testing MSE: 135.27910292419926
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])
Degree 3:
Training MSE: 212.25798167934366
Testing MSE: 293.95164277882685
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('linearregression', LinearRegression())])
Degree 4:
Training MSE: 401.6518589038904
Testing MSE: 1060.1496949285238
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=4)),
                ('linearregression', LinearRegression())])


## Regularization

In [40]:
from sklearn.linear_model import Lasso, Ridge, RidgeCV, LassoCV, ElasticNetCV

In [41]:
# Ridge regression
# Instantiate with a hand-picked regularization strength (alpha).
ridge_model = Ridge(alpha=0.55)

In [42]:
# Fit on the unscaled training features.
ridge_model.fit(X_train, y_train)

In [43]:
# Training metrics (note: rebinds y_train_pred to the Ridge predictions).
y_train_pred = ridge_model.predict(X_train)
mean_squared_error (y_train, y_train_pred)

119.12193191390735

In [44]:
# Testing metrics (note: rebinds y_test_pred to the Ridge predictions).
y_test_pred = ridge_model.predict(X_test)
mean_squared_error(y_test, y_test_pred)

123.3627224145378

Alpha = 0.1
   Training = 117.73125
   Test = 123.54609517563605

Alpha = 0.5
   Training = 119.03231363704236 
   Test = 123.34463043262092

Alpha = 0.6
   Training = 119.20227819203953
   Test = 123.38074964302508

Alpha = 0.7
   Training = 119.34019907707622
   Test = 123.415342272663

Alpha = 0.55
   Training =  119.12193191390735
   Test = 123.3627224145378


 

In [45]:
# Lasso regression
# Instantiate with a hand-picked regularization strength (alpha).
lasso_model = Lasso(alpha=0.01)

In [46]:
# Fit on the unscaled training features.
lasso_model.fit(X_train, y_train)

In [47]:
# Training metrics (note: rebinds y_train_pred to the Lasso predictions).
y_train_pred = lasso_model.predict(X_train)
mean_squared_error (y_train, y_train_pred)

117.68361262939811

In [48]:
# Testing metrics (note: rebinds y_test_pred to the Lasso predictions).
y_test_pred = lasso_model.predict(X_test)
mean_squared_error(y_test, y_test_pred)

123.77347611802162

Alpha = 0.1
   Ridge: Training = 117.73125, Test = 123.54609517563605
   Lasso: Training = 120.627230999682, Test = 123.80404100058776
 
Alpha = 0.5 (Ridge)
   Training = 119.03231363704236
   Test = 123.34463043262092

Alpha = 0.05 (Lasso)
   Training = 120.62619449383067
   Test = 123.85622878388571

Alpha = 0.01 (Lasso)
   Training = 117.68361262939811
   Test = 123.77347611802162

Alpha = 0.6
   Training = 119.20227819203953
   Test = 123.38074964302508

Alpha = 0.7
   Training = 119.34019907707622
   Test = 123.415342272663

Alpha = 0.55
   Training =  119.12193191390735
   Test = 123.3627224145378


 

In [51]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
# Each *CV estimator picks its alpha from the candidate grid by cross-validation
# on the training split; `fit` returns the estimator, so construction and fitting are chained.
# NOTE(review): the models are fitted on the unscaled X_train, so the penalty acts on
# coefficients whose size depends on each feature's units; consider X_train_scaled.

# Fit RidgeCV
ridge_cv = RidgeCV(alphas=(0.1, 1, 10)).fit(X_train, y_train)

# Fit LassoCV
lasso_cv = LassoCV(alphas=(0.1, 1, 10)).fit(X_train, y_train)

# Fit ElasticNetCV
elastic_cv = ElasticNetCV(alphas=(0.1, 1, 10)).fit(X_train, y_train)

def calculate_metrics(model, X_train, X_test, y_train, y_test):
    """Return the (train MSE, test MSE) pair of an already-fitted model.

    Parameters:
        model: object exposing ``predict``.
        X_train, X_test: feature sets passed to ``model.predict``.
        y_train, y_test: true targets matching the respective feature sets.

    Returns:
        Tuple ``(train_mse, test_mse)``.
    """
    splits = ((X_train, y_train), (X_test, y_test))
    train_mse, test_mse = (
        mean_squared_error(targets, model.predict(features))
        for features, targets in splits
    )
    return train_mse, test_mse

# Train/test MSE for each cross-validated model.
cv_splits = (X_train, X_test, y_train, y_test)
ridge_train_mse, ridge_test_mse = calculate_metrics(ridge_cv, *cv_splits)
lasso_train_mse, lasso_test_mse = calculate_metrics(lasso_cv, *cv_splits)
elastic_train_mse, elastic_test_mse = calculate_metrics(elastic_cv, *cv_splits)

# Comparison table, one row per model.
# NOTE: this rebinds `metrics_df`, replacing the base/standardized table built earlier.
cv_metric_columns = {
    'Model': ['RidgeCV', 'LassoCV', 'ElasticNetCV'],
    'Train MSE': [ridge_train_mse, lasso_train_mse, elastic_train_mse],
    'Test MSE': [ridge_test_mse, lasso_test_mse, elastic_test_mse],
    'Best Alpha': [fitted.alpha_ for fitted in (ridge_cv, lasso_cv, elastic_cv)],
}
metrics_df = pd.DataFrame(cv_metric_columns)

print("Metrics Comparison:")
print(metrics_df)

# Coefficient table: one row per feature, one column per cross-validated model.
coef_columns = {'Feature': X.columns}
for model_name, fitted in (
    ('RidgeCV', ridge_cv),
    ('LassoCV', lasso_cv),
    ('ElasticNetCV', elastic_cv),
):
    coef_columns[model_name] = fitted.coef_
coef_df = pd.DataFrame(coef_columns)

print("\nCoefficients Comparison:")
print(coef_df)

Metrics Comparison:
          Model   Train MSE    Test MSE  Best Alpha
0       RidgeCV  117.731253  123.546163         0.1
1       LassoCV  120.764046  123.033084         1.0
2  ElasticNetCV  120.628508  123.781415         0.1

Coefficients Comparison:
                               Feature    RidgeCV   LassoCV  ElasticNetCV
0                            House age   0.006565  0.008400      0.015611
1  Distance to the nearest MRT station  -0.008366 -0.008370     -0.008398
2         Number of convenience stores   1.833220  1.734923      1.846911
3                             Latitude  41.955755  0.000000      0.000000
4                            Longitude -25.278743 -0.000000     -0.000000
