# Problem Set: Scaling & Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Part 1: Data Preprocessing

<font color='blue'>
1. Load the Excel dataset into a pandas DataFrame. Handle any missing values and perform data
exploration to understand the characteristics of the dataset</font>.

In [2]:
real_estate_data = pd.read_csv("Real estate.csv")

In [3]:
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

In [4]:
real_estate_data

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


In [5]:
real_estate = pd.read_csv("Real estate.csv", usecols=list(range(1, 8)))
real_estate.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [6]:
real_estate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   X1 transaction date                     414 non-null    float64
 1   X2 house age                            414 non-null    float64
 2   X3 distance to the nearest MRT station  414 non-null    float64
 3   X4 number of convenience stores         414 non-null    int64  
 4   X5 latitude                             414 non-null    float64
 5   X6 longitude                            414 non-null    float64
 6   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 22.8 KB


In [7]:
real_estate.describe()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,2013.148971,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,0.281967,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,2012.667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,2012.917,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,2013.167,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,2013.417,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,2013.583,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [8]:
real_estate.isna().any()

X1 transaction date                       False
X2 house age                              False
X3 distance to the nearest MRT station    False
X4 number of convenience stores           False
X5 latitude                               False
X6 longitude                              False
Y house price of unit area                False
dtype: bool

In [None]:
sns.pairplot(real_estate)

---------------------------------------------------------------------------------------------------------------------------------------------------------

<font color='purple'>2. Extract the feature columns (X1 to X6) and the target column (Y). Split the dataset into features (X) and
target (y). (70% training & 30% testing ratio)</font>.

In [None]:
# Correlation analysis:
# pairwise correlation between every column (features and target),
# drawn as an annotated heatmap on an explicitly created axes.

correlation_matrix = real_estate.corr()
heatmap_fig, heatmap_ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
plt.show()

In [None]:
# Discrete variables:
# None of the features is truly categorical. The only discrete one is the
# integer count of nearby convenience stores (int64, ranging 0-10), so that
# is the column a count plot suits. A count plot of the continuous
# 'X2 house age' column would draw one bar per distinct age value.

sns.countplot(x='X4 number of convenience stores', data=real_estate)
plt.show()

In [None]:
# Crosstab of location.
# Latitude and longitude are continuous, so each is first cut into five
# equal-width bins; cross-tabulating the raw values would produce one
# row/column per distinct coordinate and an almost entirely empty table.

latitude_bins = pd.cut(real_estate['X5 latitude'], bins=5)
longitude_bins = pd.cut(real_estate['X6 longitude'], bins=5)
pd.crosstab(latitude_bins, longitude_bins)

In [None]:
# Feature matrix: every column except the target column.
X = real_estate.drop(columns='Y house price of unit area')
X.head()

In [None]:
# Target vector: the 'Y house price of unit area' column on its own.
y = real_estate.loc[:, 'Y house price of unit area']
y.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

--------------------------------------------------------------------------------------------------------------------------------------------------------

<font color='purple'><strong>3. Standardize the feature columns (X1 to X6) using Z-score scaling. Implement the scaling process manually (without using libraries like `scikit-learn`).</strong></font>.

In [None]:
def standardizer(column, mean=None, std=None):
    """Z-score scale a numeric column: ``(column - mean) / std``.

    Parameters
    ----------
    column : pandas.Series
        Values to scale.
    mean, std : scalar, optional
        Statistics to scale with. When omitted they are computed from
        ``column`` itself. Pass the training-set statistics here to scale
        test data consistently with the training data.

    Returns
    -------
    pandas.Series
        The scaled values. If the (scalar) standard deviation is zero or
        NaN - e.g. a constant or single-element column - the centred values
        are multiplied by zero instead of being divided, so the result is
        0.0 rather than NaN/inf (missing inputs stay NaN).
    """
    if mean is None:
        mean = column.mean()
    if std is None:
        std = column.std()

    centered = column - mean
    # Guard only the scalar case; non-scalar statistics fall through to the
    # plain division exactly as before.
    if np.ndim(std) == 0 and (pd.isna(std) or std == 0):
        return centered * 0.0
    return centered / std

In [None]:
standardizer(real_estate['X2 house age'])

In [None]:
X_train_1 = X_train.copy()
X_test_1 = X_test.copy()

In [None]:
# Mean and standard deviation of every feature, measured on the training
# copy only and keyed by column name.
scales = {
    feature: dict(mean=values.mean(), std=values.std())
    for feature, values in X_train_1.items()
}

scales

In [None]:
# Z-score each training feature with that feature's own mean and std.
X_train_1 = X_train_1.apply(standardizer)

X_train_1.head()

In [None]:
scales['X1 transaction date']['std']

In [None]:
X_test_1.head()

In [None]:
# Scale the test copy with the statistics stored in `scales` (not with the
# test data's own mean/std). The subtraction and division align on column name.
train_stats = pd.DataFrame(scales)  # rows: 'mean', 'std'; columns: features
X_test_1 = (X_test_1 - train_stats.loc['mean']) / train_stats.loc['std']

In [None]:
X_test_1.head()

----------------------------------------------------------------------------------------------------------------------

# Part 2: Linear Regression

<font color='purple'><strong>4. Implement a simple linear regression model using the scaled features (X1 to X6) to predict the house price (Y). You can use gradient descent for optimization. Calculate the regression coefficients (intercept and slope) and write down the regression equation.</strong></font> 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [None]:
lr.fit(X_train_1, y_train)

In [None]:
# Predict for one standardized sample. `lr` was fitted on a DataFrame, so the
# sample is wrapped in a DataFrame carrying the same column names; passing a
# bare nested list makes scikit-learn (1.0+) warn that X has no valid
# feature names.
first_sample = pd.DataFrame(
    [[0.010300, -1.433801, -0.698739, 0.610202, -0.309479, 0.488590]],
    columns=X_train_1.columns,
)
lr.predict(first_sample)

In [None]:
y_test.head()

In [None]:
y_test.tail()

<font style="font-size: 18px;"> -0.873542, -0.416515, -0.717526, -1.062932, 0.329289, -0.242888 </font>

In [None]:
# Predict for one standardized sample. `lr` was fitted on a DataFrame, so the
# sample is wrapped in a DataFrame carrying the same column names; passing a
# bare nested list makes scikit-learn (1.0+) warn that X has no valid
# feature names.
second_sample = pd.DataFrame(
    [[-0.873542, -0.416515, -0.717526, -1.062932, 0.329289, -0.242888]],
    columns=X_train_1.columns,
)
lr.predict(second_sample)

<font> -1.757384, 0.241212, 0.896451, -0.393679, -0.508295, -1.399352 </font>

In [None]:
# Predict for one standardized sample. `lr` was fitted on a DataFrame, so the
# sample is wrapped in a DataFrame carrying the same column names; passing a
# bare nested list makes scikit-learn (1.0+) warn that X has no valid
# feature names.
third_sample = pd.DataFrame(
    [[-1.757384, 0.241212, 0.896451, -0.393679, -0.508295, -1.399352]],
    columns=X_train_1.columns,
)
lr.predict(third_sample)

In [None]:
lr.coef_

In [None]:
lr.intercept_

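<font style="font-size: 18px;"> The linear regression equation for the six scaled features: $y = c + m_1 x_1 + m_2 x_2 + \dots + m_6 x_6$, where $c$ is the intercept (`lr.intercept_`) and $m_1, \dots, m_6$ are the coefficients (`lr.coef_`). </font>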

In [None]:
# Reproduce the model's prediction by hand: multiply each coefficient by the
# matching scaled feature value, add the products up, then add the intercept.
feature_values = [-0.873542,-0.416515,-0.717526,-1.062932,0.329289,-0.242888]
weighted_terms = lr.coef_ * feature_values
sum(weighted_terms) + lr.intercept_

In [None]:
# Reproduce the model's prediction by hand: multiply each coefficient by the
# matching scaled feature value, add the products up, then add the intercept.
feature_values = [-1.757384, 0.241212, 0.896451, -0.393679, -0.508295, -1.399352]
weighted_terms = lr.coef_ * feature_values
sum(weighted_terms) + lr.intercept_

--------------------------------------------------------------------------------------------------------------------------------------------------------

<font color='purple'><strong>5. Evaluate the model's performance using metrics such as Mean Squared Error (MSE) and R-squared. Interpret the R-squared value in the context of this regression.</strong></font>

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
y_pred = lr.predict(X_test_1)
y_pred

In [None]:
MSE = mean_squared_error(y_test, y_pred)

In [None]:
MSE

In [None]:
from sklearn.metrics import r2_score
r_squared = r2_score(y_test, y_pred)
r_squared

----------------------------------------------------------------------------------------------------------------------

# Part 3: Feature Scaling Comparison

<font color='purple'><strong>6. Train another linear regression model without scaling the features. Compare the performance of this model with the scaled model from Part 2 in terms of convergence speed and prediction accuracy.</strong></font> 

In [None]:
X_train_without_scaling = X_train.copy()
X_train_without_scaling

In [None]:
X_test_without_scaling = X_test.copy()
X_test_without_scaling

In [None]:
ln_reg = LinearRegression()

In [None]:
ln_reg.fit(X_train_without_scaling, y_train)

In [None]:
y_pred_without_scaling = ln_reg.predict(X_test_without_scaling)
y_pred_without_scaling

In [None]:
MSE_without_scaling = mean_squared_error(y_test, y_pred_without_scaling)
MSE_without_scaling

In [None]:
print(f"Mean Squared Error without Scaling is {MSE_without_scaling} and MSE after scaling {MSE}")

In [None]:
r2_score_without_scaling = r2_score(y_test, y_pred_without_scaling)
r2_score_without_scaling

----------------------------------------------------------------------------------------------------------------------

# Part 4: Visualization and Analysis

<font color='purple'><strong>7. Create scatter plots to visualize the relationship between each feature (X1 to X6) and the target variable (Y). Discuss the patterns you observe in the plots.</strong></font> 

In [None]:
real_estate.head()

In [None]:
# One scatter panel per feature against the target, each with a degree-1
# least-squares line (np.polyfit) overlaid in red. The panels fill the 3x2
# grid row by row, in the order the features are listed below.
target_column = 'Y house price of unit area'
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]

fig, axes = plt.subplots(3, 2, figsize=(10, 8))
fig.suptitle("Relationship between each feature (X1 to X6) and the target variable (Y)", fontsize=22)

# axes.flat walks the grid in row-major order: (0,0), (0,1), (1,0), ...
for ax, feature in zip(axes.flat, feature_columns):
    ax.scatter(real_estate[feature], real_estate[target_column])
    m, c = np.polyfit(real_estate[feature], real_estate[target_column], 1)
    ax.plot(real_estate[feature], real_estate[feature] * m + c, color='red')
    ax.set_xlabel(feature)
    ax.set_ylabel('Y')

# Run the layout pass only after every label exists, so the spacing accounts
# for them; calling it on the empty axes cannot.
fig.tight_layout(pad=2.5)

Discussion on the patterns of the scatterplot shown above:

* The <font color='green'>X1 transaction date</font> column shows little to no relationship with the <font color='green'>Y house price of unit area</font> column.

* The <font color='green'>X2 house age</font> column has a negative relationship with the <font color='green'>Y house price of unit area</font> column, which is intuitive: as a house gets older, its price decreases.

* The <font color='green'>X3 distance to the nearest MRT station</font> column has a negative relationship with the <font color='green'>Y house price of unit area</font> column. The nearer a house is to an MRT station, the higher its price.

* The <font color='green'>X4 number of convenience stores</font> column has a positive relationship with the <font color='green'>Y house price of unit area</font> column. As the number of convenience stores increases, the house price also increases.

* The <font color='green'>X5 latitude</font> column shows a positive relationship with the <font color='green'>Y house price of unit area</font> column: the higher the latitude, the higher the price of the house.

* The <font color='green'>X6 longitude</font> column shows a positive relationship with the <font color='green'>Y house price of unit area</font> column: the higher the longitude, the higher the price of the house.

<font color='purple'><strong>8. Visualize the predicted house prices from the scaled model against the actual house prices. Interpret the results and discuss any discrepancies.</strong></font>

In [None]:
# Predicted vs. actual prices for the model trained on standardized features.
pred_fig, pred_ax = plt.subplots()
pred_ax.scatter(y_test, y_pred)

# Reference line y = x: a point on it is predicted exactly, a point above it
# is over-predicted and a point below it is under-predicted.
axis_low = min(y_test.min(), y_pred.min())
axis_high = max(y_test.max(), y_pred.max())
pred_ax.plot([axis_low, axis_high], [axis_low, axis_high],
             color='red', linestyle='--', label='Perfect prediction')

pred_ax.set_xlabel("Actual Value")
pred_ax.set_ylabel("Predicted Value")
pred_ax.set_title("Actual Value vs Predicted Value", fontsize=20)
pred_ax.legend()
plt.show()

----------------------------------------------------------------------------------------------------------------------

# Part 5: Advanced Problem (Optional)

<font color='#BC7AF9'><strong>9. Implement Min-Max scaling for the feature columns (X1 to X6) manually (without using libraries). Train a linear regression model using the Min-Max scaled features and compare its performance with the Z-score scaled and unscaled models.</strong></font> 

In [None]:
X_train_min_max = X_train.copy()
X_test_min_max = X_test.copy()

In [None]:
X_train_min_max.head()

In [None]:
X_train_min_max["X2 house age"]

In [None]:
# Minimum and maximum of every feature, measured on the training copy only
# and keyed by column name.
scales_minmax = {
    feature: dict(minimum=values.min(), maximum=values.max())
    for feature, values in X_train_min_max.items()
}
scales_minmax

In [None]:
def min_max(column_data, minimum=None, maximum=None):
    """Min-max scale a numeric column: ``(x - minimum) / (maximum - minimum)``.

    Parameters
    ----------
    column_data : pandas.Series
        Values to scale.
    minimum, maximum : scalar, optional
        Range to scale with. When omitted they are taken from
        ``column_data`` itself (so the result spans [0, 1]). Pass the
        training-set range here to scale test data consistently.

    Returns
    -------
    pandas.Series
        The scaled values. If the (scalar) range is zero or NaN - e.g. a
        constant column - the shifted values are multiplied by zero instead
        of being divided, so the result is 0.0 rather than NaN/inf (missing
        inputs stay NaN).
    """
    if minimum is None:
        minimum = column_data.min()
    if maximum is None:
        maximum = column_data.max()

    shifted = column_data - minimum
    span = maximum - minimum
    # Guard only the scalar case; a non-scalar range falls through to the
    # plain division exactly as before.
    if np.ndim(span) == 0 and (pd.isna(span) or span == 0):
        return shifted * 0.0
    return shifted / span

In [None]:
X_train_min_max['X2 house age'].dtype

#### Transform training data

In [None]:
# Min-max scale every column whose dtype is not `object`, each with its own
# minimum and maximum; object columns are left untouched.
numeric_columns = [
    name for name in X_train_min_max.columns
    if X_train_min_max[name].dtype != object
]
for name in numeric_columns:
    X_train_min_max[name] = min_max(X_train_min_max[name])

X_train_min_max.head()

#### Transforming test data

In [None]:
X_test_min_max.head()

In [None]:
scales_minmax["X2 house age"]["maximum"]

In [None]:
# Scale the test copy with the minimum / maximum stored in `scales_minmax`
# (not with the test data's own range). The arithmetic aligns on column name.
train_ranges = pd.DataFrame(scales_minmax)  # rows: 'minimum', 'maximum'
train_minimum = train_ranges.loc["minimum"]
train_span = train_ranges.loc["maximum"] - train_minimum
X_test_min_max = (X_test_min_max - train_minimum) / train_span

X_test_min_max.head()

In [None]:
lr_minmax = LinearRegression()
lr_minmax

In [None]:
lr_minmax.fit(X_train_min_max, y_train)

In [None]:
y_pred_minmax = lr_minmax.predict(X_test_min_max)
y_pred_minmax

In [None]:
mse_minmax = mean_squared_error(y_test, y_pred_minmax)
mse_minmax

In [None]:
r2_score_minmax = r2_score(y_test, y_pred_minmax)
r2_score_minmax

In [None]:
# Predicted vs. actual prices for the model trained on min-max scaled
# features, labelled so the figure can be read on its own.
plt.scatter(y_test, y_pred_minmax)
plt.xlabel("Actual Value")
plt.ylabel("Predicted Value")
plt.title("Actual Value vs Predicted Value (Min-Max scaled features)")
plt.show()

In [None]:
MSE

In [None]:
MSE_without_scaling

In [None]:
mse_minmax

In [None]:
r_squared

In [None]:
r2_score_minmax

In [None]:
r2_score_without_scaling

In [None]:
# Metrics of the three models side by side: one column per scaling strategy,
# one row per metric ('mse', 'r2_score').
performance = dict(
    standardization=dict(mse=MSE, r2_score=r_squared),
    normalization=dict(mse=mse_minmax, r2_score=r2_score_minmax),
    no_feature_transformation=dict(
        mse=MSE_without_scaling,
        r2_score=r2_score_without_scaling,
    ),
)

performance_df = pd.DataFrame(performance)
performance_df

In [None]:
# One point per model: x is row 0 of performance_df (mse), y is row 1
# (r2_score). Models are taken in column order and paired with a colour and
# a legend label.
series_styles = [
    ("#D80032", 'standardization'),
    ("#E5D283", "normalization"),
    ("#9400FF", 'no_transformation'),
]
for position, (colour, legend_label) in enumerate(series_styles):
    mse_value = performance_df.iloc[0, position]
    r2_value = performance_df.iloc[1, position]
    plt.scatter(mse_value, r2_value, color=colour, label=legend_label)
plt.legend()
plt.xlabel("mse")
plt.ylabel("r2_score")