## Import liabraries

In [10]:
## Import required frameworks
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Import 'housing.csv' data set

In [11]:
# Load the 'housing.csv' file to the Data Frame
csv_file = 'housing.csv'
df_original = pd.read_csv(csv_file, delimiter=',')

# Show the columns names
columns = list(df_original.columns)
# print(columns)                            #   Uncomment to get the columns' names

# Create a list of the allowed 'ocean_proximity' values
allowed_values = ["<1H OCEAN", "INLAND"]

# Create new filtered DataFrame with respect to houses located near bay or 1 hour away from it
df_filted = df_original[df_original['ocean_proximity'].isin(allowed_values)]

# Drop the 'ocean_proximity' column from the filtered 'df_filted' Data Frame
df = df_filted.drop(columns=['ocean_proximity'])

# Make an overview of the data from the filtered DataFrame 'df_original'
print(df.head())

     longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
701    -121.97     37.64                32.0       1283.0           194.0   
830    -121.99     37.61                 9.0       3666.0           711.0   
859    -121.97     37.57                21.0       4342.0           783.0   
860    -121.96     37.58                15.0       3575.0           597.0   
861    -121.98     37.58                20.0       4126.0          1031.0   

     population  households  median_income  median_house_value  
701       485.0       171.0         6.0574            431000.0  
830      2341.0       703.0         4.6458            217000.0  
859      2172.0       789.0         4.6146            247600.0  
860      1777.0       559.0         5.7192            283500.0  
861      2079.0       975.0         3.6832            216900.0  


## Question 1
Check the missing values among each of the Data Frame's ('df') columns

In [12]:
# Check missing data in Housing DataFrame's columns
print(df.isnull().sum())

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64


## Answer for Question 1: The column called 'total_bedrooms' has 157 missing values [15,530 exists over 15,687]

## Question 2
Compute the median (50% percentile) for variable 'population' 

In [13]:
population_median = df.population.median()
print(population_median)

1195.0


In [14]:
# Split the data into train/val/test sets (60%/20%/20% distribution)
n = len(df)

# Compute the proportions for 'training' / 'validation' / 'test'
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Create 'training' / 'validation' / 'test' data sets
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

# Set seed for reproducibility
idx = np.arange(n)
seed_number = 42

# Shuffle the DataFrame
np.random.seed(seed_number)
np.random.shuffle(idx)

# Shuffle indexes for 'training' / 'validation' / 'test' data sets
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

# Reset indexes for 'training' / 'validation' / 'test' data sets by shuffling them
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Apply the log transformation to the 'median_house_value' variable and set it to "y" varaible
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

# Delete 'median_house_value' as a target value in the 'training' / 'validation' / 'test' data sets 
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

## Answer for Question 2: The median for column called 'population' equal to 1195

## Question 3
Check the best techniques (fill all with 0 / fill with mean) to fill the missing values for 'total_bedrooms'

In [52]:
# Define a formula to calculate the 'X' matrix
base = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

# Fill missing data in 'total_bedrooms' with the '0' number
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [53]:
# Define a formula for computing coefficients for the Linear Regression Model
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [54]:
# Define the formula for the Root Mean Squere Error (RMSE)
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

### Option 1. Construct the Linear Regression Model and compute the Root Mean Squere Error (RMSE) by filling the "0" to the missing values for the 'total_bedrooms' column

In [55]:
## Option 1. Fill the 0 to the missing data for 'total_bedrooms'

# 1. Compute the 'X train' matrix to train the model
X_train = prepare_X(df_train)

# 2. Compute the coeffficients for the linear regression model
w_0, w = train_linear_regression(X_train, y_train)

# 3. Compute the 'X validation' matrix to evaluate the model
X_val = prepare_X(df_val)

# 4. Predict the values for the median_house_value using thr validation data set
y_pred = w_0 + X_val.dot(w)

# 5. Compute the Root Mean Squared Error (RMSE)
print(round(rmse(y_val, y_pred), 2))

0.34


### Option 2. Construct the Linear Regression Model and compute the Root Mean Squere Error (RMSE) by filling the "MEAN" to the missing values for the 'total_bedrooms' column

In [37]:
# Define a formula to calculate the 'X' matrix
base = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

# Fill missing data in 'total_bedrooms' with the mean
mean_median_house_value = y_train.mean()

def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(mean_median_house_value)
    X = df_num.values
    return X

In [38]:
# Define a formula for computing coefficients for the Linear Regression Model
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [39]:
# Define the formula for the Root Mean Squere Error (RMSE)
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [40]:
## Option 2. Fill the median to the missing data for 'total_bedrooms'

# 1. Compute the 'X train' matrix to train the model
X_train = prepare_X(df_train)

# 2. Compute the coeffficients for the linear regression model
w_0, w = train_linear_regression(X_train, y_train)

# 3. Compute the 'X validation' matrix to evaluate the model
X_val = prepare_X(df_val)

# 4. Predict the values for the median_house_value using thr validation data set
y_pred = w_0 + X_val.dot(w)

# 5. Compute the Root Mean Squared Error (RMSE)
print(round(rmse(y_val, y_pred), 2))

0.34


## Answer for Question 3: Using both "0" and "MEAN" to fill the missing value for the 'total_bedrooms' are equally good to use

## Question 4

Find the minimum 'regularization parameter' for the Linear Regression Model

In [56]:
# Define a formula to calculate the 'X' matrix
base = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

# Fill missing data in 'total_bedrooms' with the '0' number
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [57]:
# Define a formula for computing coefficients for the Linear Regression Model with regularization term
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [58]:
# Define the formula for the Root Mean Squere Error (RMSE)
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

###  Construct the Linear Regression Model with regularization term and compute the Root Mean Squere Error (RMSE) by filling the "0" to the missing values for the 'total_bedrooms' column

In [59]:
# Find the minimum RMSE using regularization parameter (r = 0.0 / 0.000001 / 0.0001 / 0.001 / 0.01 / 0.1 / 1 / 5 / 10) to the Linear Regresssion Model 

for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    X_train = prepare_X(df_train)
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w_0, score)

0.0 -9.763249477709273 0.3408479034133711
1e-05 -9.763043013880937 0.3408479310606026
0.0001 -9.761185235537457 0.3408481800530103
0.001 -9.742646249218538 0.340850692187126
0.1 -8.058889769445528 0.34128620420007866
1 -3.1331542783183215 0.34489583276493896
10 -0.4381172315471025 0.34831498335199945


## Answer for Question 4: The best 'regularization parameter' for the Linear Regression Model, resulting in the lowest RMSE, is '0'

## Question 5

Find the most stable Linear Regression Model based on standard deviation of RMSEs' scores (use different seed values)

In [61]:
# Define a formula to calculate the 'X' matrix
base = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

# Fill missing data in 'total_bedrooms' with the '0' number
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X


# Define a formula for computing coefficients for the Linear Regression Model
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]


# Define the formula for the Root Mean Squere Error (RMSE)
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [66]:
# Lists to store RMSE values for each seed
rmse_values = []

# Seed values to loop through
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Loop through each seed value
for seed_number in seed_values:
    # Split the data into train/val/test sets (60%/20%/20% distribution)
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    idx = np.arange(n)

    # Shuffle the DataFrame
    np.random.seed(seed_number)
    np.random.shuffle(idx)

    # Shuffle indexes for 'training' / 'validation' / 'test' data sets
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]]
    df_test = df.iloc[idx[n_train + n_val:]]

    # Reset indexes for 'training' / 'validation' / 'test' data sets by shuffling them
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # Apply the log transformation to the 'median_house_value' variable and set it to "y" variable
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    # Delete 'median_house_value' as a target value in the 'training' / 'validation' / 'test' data sets
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    # 1. Compute the 'X train' matrix to train the model
    X_train = prepare_X(df_train)

    # 2. Compute the coefficients for the linear regression model
    w_0, w = train_linear_regression(X_train, y_train)

    # 3. Compute the 'X validation' matrix to evaluate the model
    X_val = prepare_X(df_val)

    # 4. Predict the values for the median_house_value using the validation data set
    y_pred = w_0 + X_val.dot(w)

    # 5. Compute the Root Mean Squared Error (RMSE) for this seed
    rmse_score = rmse(y_val, y_pred)

    # 6. Append the RMSE value to the list
    rmse_values.append(rmse_score)


# Calculate the standard deviation of RMSE values
std_deviation_rmse = np.std(rmse_values)    


# Print the RMSE values and standard deviation
for i, seed_number in enumerate(seed_values):
    print(f"Seed {seed_number}: RMSE = {round(rmse_values[i], 2)}")

print(f"Standard Deviation of RMSE: {round(std_deviation_rmse, 3)}")

Seed 0: RMSE = 0.34
Seed 1: RMSE = 0.34
Seed 2: RMSE = 0.34
Seed 3: RMSE = 0.33
Seed 4: RMSE = 0.34
Seed 5: RMSE = 0.34
Seed 6: RMSE = 0.34
Seed 7: RMSE = 0.35
Seed 8: RMSE = 0.35
Seed 9: RMSE = 0.33
Standard Deviation of RMSE: 0.005


## Answer for Question 5: The standard deviation between the RMSE scores for the Linear Regression Model is '0.005'

## Question 6

Find the RMSE score from the joined training and validation data sets for Linear Regression Model

In [68]:
# Split the data into train/val/test sets (60%/20%/20% distribution)
n = len(df)

# Compute the proportions for 'training' / 'validation' / 'test'
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Create 'training' / 'validation' / 'test' data sets
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

# Set seed for reproducibility
idx = np.arange(n)
seed_number = 9

# Shuffle the DataFrame
np.random.seed(seed_number)
np.random.shuffle(idx)

# Shuffle indexes for 'training' / 'validation' / 'test' data sets
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

# Reset indexes for 'training' / 'validation' / 'test' data sets by shuffling them
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Apply the log transformation to the 'median_house_value' variable and set it to "y" varaible
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

# Delete 'median_house_value' as a target value in the 'training' / 'validation' / 'test' data sets 
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [69]:
# Combine the 'df_train' and 'df_val' DataFrames
df_train_val = pd.concat([df_train, df_val], axis=0).reset_index(drop=True)

# Combine the 'y_train' and 'y_val' arrays
y_train_val = np.concatenate([y_train, y_val])

# Define a formula to calculate the 'X' matrix
base = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

# Fill missing data in 'total_bedrooms' with the '0' number
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

# Define a formula for computing coefficients for the Linear Regression Model with regularization term
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

# Combine the training and validation data
X_train_val = prepare_X(df_train_val)

# Compute the coefficients for the linear regression model with regularization
r = 0.001
w_0, w = train_linear_regression_reg(X_train_val, y_train_val, r=r)

# Prepare the validation data
X_val = prepare_X(df_val)

# Predict the values for the median_house_value using the validation data set
y_pred = w_0 + X_val.dot(w)

# Compute the Root Mean Squared Error (RMSE)
score = rmse(y_val, y_pred)

print(f"RMSE for r={r}: {round(score, 2)}")

RMSE for r=0.001: 0.33


## Answer for Question 6: The RMSE score for the Linear Regression Model from joined training and validation data sets is '0.33'