In [2]:
# Importing libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
 # load data set
usecols = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
california_housing = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv", usecols=usecols)
california_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


QUESTION 1: Find a feature with missing values. How many missing values does it have?


In [4]:
california_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [5]:
incomplete = california_housing.isnull().sum()
print(incomplete[incomplete > 0])

total_bedrooms    207
dtype: int64


total_bedrooms is the missing feature with 207 values

Question 2: What's the median (50% percentile) for variable 'population'?

In [6]:
california_housing.population.median()

1166.0

1166 is the median

In [7]:
list(california_housing.columns)

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [8]:
# Shuffle the data
np.random.seed(42)
n = len(california_housing)
n

20640

In [9]:
idx = np.arange(n)
idx

array([    0,     1,     2, ..., 20637, 20638, 20639])

In [10]:
np.random.shuffle(idx)

In [11]:
# add idx to data
california_housing["idx"] = idx

california_housing = california_housing.sort_values(by="idx", ascending=True)
california_housing = california_housing.drop(columns='idx')
california_housing = california_housing.reset_index(drop=True)


In [12]:
# calculating the number of records in each part:

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

print(f"total number of records in dataset = {n}")
print(f"split dataset: \n\t train = {n_train} \n\t val = {n_val} \n\t test = {n_test}")

total number of records in dataset = 20640
split dataset: 
	 train = 12384 
	 val = 4128 
	 test = 4128


In [13]:
# splitting the data into training, validation and test sets:

california_housing_train = california_housing.iloc[:n_train]
california_housing_val = california_housing.iloc[n_train : n_train + n_val]
california_housing_test = california_housing.iloc[n_train + n_val :]

print(f"length of training set = {len(california_housing_train)}")
print(f"length of validation set = {len(california_housing_val)}")
print(f"length of test set = {len(california_housing_test)}")

length of training set = 12384
length of validation set = 4128
length of test set = 4128


In [14]:
# log1p transformation to [y] and separating features from the target variable :

y_train = np.log1p(np.array(california_housing_train.median_house_value.values))
y_val = np.log1p(np.array(california_housing_val.median_house_value.values))
y_test = np.log1p(np.array(california_housing_test.median_house_value.values))

california_housing_train_f = california_housing_train.drop(columns=["median_house_value"])
california_housing_val_f = california_housing_val.drop(columns=["median_house_value"])
california_housing_testf_f = california_housing_test.drop(columns=["median_house_value"])

In [15]:
def prepare_X_null(california_housing):
    #  function to prepare dataset with 0
    california_housing = california_housing.copy()
    california_housing = california_housing.fillna(0)
    X = california_housing.values
    return X


def prepare_X_mean(california_housing):
    # function to prepare dataset with mean
    california_housing = california_housing.copy()
    mean = california_housing.total_bedrooms.mean()
    california_housing.total_bedrooms = california_housing.total_bedrooms.fillna(mean)
    X = california_housing.values
    return X


In [18]:
def train_linear_regression(x_train, y_train):
    # function to train the Linear Regression model
    ones = np.ones(x_train.shape[0])
    x_train = np.column_stack([ones, x_train])

    XTX = x_train.T.dot(x_train)
    XTX_inv = np.linalg.inv(XTX)

    w_full = XTX_inv.dot(x_train.T).dot(y_train)
    w0 = w_full[0]
    w = w_full[1:]

    return w0, w


def predict_linear_regression(x_val, w0, w_array):
    # function for marking predictions:
    y_pred = w0 + x_val.dot(w_array)
    return y_pred


def rmse(y_pred, y_val):
    # function to calculate RMSE:
    SE = (y_pred - y_val) ** 2
    MSE = SE.mean()
    RMSE = np.sqrt(MSE)
    return RMSE

x_train_0 = prepare_X_null(california_housing_train_f)
x_train_mean = prepare_X_mean(california_housing_train_f)


x_null_val = prepare_X_null(california_housing_val_f)
x_mean_val = prepare_X_mean(california_housing_val_f)



w0_null, w_null = train_linear_regression(x_train_0, y_train)
w0_mean, w_mean = train_linear_regression(x_train_mean, y_train)


y_null_pred = predict_linear_regression(x_null_val, w0_null, w_null)
y_mean_pred = predict_linear_regression(x_mean_val, w0_mean, w_mean)


rmse_option_null = rmse(y_null_pred, y_val)
rmse_option_mean = rmse(y_mean_pred, y_val)



print(f"RMSE score for option_null = {round(rmse_option_null, 2)}")
print(f"RMSE score for option_mean = {round(rmse_option_mean, 2)}")

print(f"\nAnswer to question 3:")
print(f"Both are equally good")

RMSE score for option_null = 0.34
RMSE score for option_mean = 0.34

Answer to question 3:
Both are equally good


In [19]:
def train_linear_regression_reg(x_train, y_train, r=0):
    # function for Linear Regression model training with Regularization
    ones = np.ones(x_train.shape[0])
    x_train = np.column_stack([ones, x_train])

    XTX = x_train.T.dot(x_train)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)

    w_full = XTX_inv.dot(x_train.T).dot(y_train)

    w0 = w_full[0]
    w = w_full[1:]

    return w0, w


r_list = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

x_train = prepare_X_null(california_housing_train_f)
x_val = prepare_X_null(california_housing_val_f)

for r in r_list:

    w0, w = train_linear_regression_reg(x_train, y_train, r=r)
    y_pred = predict_linear_regression(x_val, w0, w)

    rmse_score = rmse(y_val, y_pred)
    print(f"For r = {r:>7} RMSE_score = {round(rmse_score, 2)}")

print(f"\nAnswer to question 4:")
print(
    f"\tRegularization does not give evident results. \n\tThe smallest r with the same RMSE is 0"
)

For r =       0 RMSE_score = 0.34
For r =   1e-06 RMSE_score = 0.34
For r =  0.0001 RMSE_score = 0.34
For r =   0.001 RMSE_score = 0.34
For r =    0.01 RMSE_score = 0.34
For r =     0.1 RMSE_score = 0.34
For r =       1 RMSE_score = 0.34
For r =       5 RMSE_score = 0.35
For r =      10 RMSE_score = 0.35

Answer to question 4:
	Regularization does not give evident results. 
	The smallest r with the same RMSE is 0


In [25]:
def rmse_random_seed(california_housing, random_seed):
    california_housing = california_housing.copy()
    n = len(california_housing)
    idx = np.arange(n)
    np.random.seed(random_seed)
    np.random.shuffle(idx)

    california_housing["idx"] = idx
    california_housing = california_housing.sort_values(by="idx", ascending=True)
    california_housing = california_housing.drop(columns=["idx"])
    california_housing = california_housing.reset_index(drop=True)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)
    california_housing_train = california_housing.iloc[:n_train]
    california_housing_val = california_housing.iloc[n_train : n_train + n_val]
    california_housing_test = california_housing.iloc[n_train + n_val :]

    y_train = np.log1p(np.array(california_housing_train.median_house_value.values))
    y_val = np.log1p(np.array(california_housing_val.median_house_value.values))

    california_housing_train_f = california_housing_train.drop(columns=["median_house_value"])
    california_housing_val_f = california_housing_val.drop(columns=["median_house_value"])
    
    x_train = prepare_X_null(california_housing_train_f)
    w0, w = train_linear_regression(x_train, y_train)
    
    x_val = prepare_X_null(california_housing_val_f)
    y_pred = predict_linear_regression(x_val, w0, w)
    
    rmse_score = rmse(y_pred, y_val)
    return rmse_score


rmse_scores_list = []
s = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

for seed in s:
    rmse_score = rmse_random_seed(california_housing, seed)
    rmse_scores_list.append(rmse_score)

    print(f"For Seed = {seed}, RMSE = {rmse_score}")

For Seed = 0, RMSE = 0.3398944153262685
For Seed = 1, RMSE = 0.3424516674111378
For Seed = 2, RMSE = 0.3504073622301258
For Seed = 3, RMSE = 0.34191333918962197
For Seed = 4, RMSE = 0.3332498184122541
For Seed = 5, RMSE = 0.3457116072329894
For Seed = 6, RMSE = 0.33694402533528683
For Seed = 7, RMSE = 0.3435425039621229
For Seed = 8, RMSE = 0.3502354582745941
For Seed = 9, RMSE = 0.34504168627548826


In [26]:
# calculating the standard deviation of rmse results for different data splits:

standard_deviation = round(np.std(rmse_scores_list), 3)

print(f"\nAnswer to question 5:")
print(f"Result of standard deviation of all rmse score = {standard_deviation}")


Answer to question 5:
Result of standard deviation of all rmse score = 0.005


In [29]:
n = len(california_housing)
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

california_housing["idx"] = idx
california_housing = california_housing.sort_values(by="idx", ascending=True)
california_housing = california_housing.drop(columns=["idx"])
california_housing = california_housing.reset_index(drop=True)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)
california_housing_train = california_housing.iloc[:n_train]
california_housing_val = california_housing.iloc[n_train : n_train + n_val]
california_housing_test = california_housing.iloc[n_train + n_val :]


y_train = np.log1p(np.array(california_housing_train.median_house_value.values))
y_val = np.log1p(np.array(california_housing_val.median_house_value.values))
y_test = np.log1p(np.array(california_housing_test.median_house_value.values))

california_housing_train_f = california_housing_train.drop(columns=["median_house_value"])
california_housing_val_f = california_housing_val.drop(columns=["median_house_value"])
california_housing_test_f = california_housing_test.drop(columns=["median_house_value"])


california_housing_train_val = pd.concat([california_housing_train_f, california_housing_val_f])
y_train_val = np.concatenate([y_train, y_val])

x_train_val = prepare_X_null(california_housing_train_val)
w0, w = train_linear_regression_reg(x_train_val, y_train_val, r=0.001)

x_test = prepare_X_null(california_housing_test_f)
y_pred = predict_linear_regression(x_test, w0, w)

rmse_score = rmse(y_pred, y_test)

print(f"\nAnswer to question 6:")
print(f"RMSE on combine dataset = {round(rmse_score, 2)}")

n = len(california_housing)
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

california_housing["idx"] = idx
california_housing = california_housing.sort_values(by="idx", ascending=True)
california_housing = california_housing.drop(columns=["idx"])
california_housing = california_housing.reset_index(drop=True)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)
california_housing_train = california_housing.iloc[:n_train]
california_housing_val = california_housing.iloc[n_train : n_train + n_val]
california_housing_test = california_housing.iloc[n_train + n_val :]

y_train = np.log1p(np.array(california_housing_train.median_house_value.values))
y_val = np.log1p(np.array(california_housing_val.median_house_value.values))
y_test = np.log1p(np.array(california_housing_test.median_house_value.values))

california_housing_train_f = california_housing_train.drop(columns=["median_house_value"])
california_housing_val_f = california_housing_val.drop(columns=["median_house_value"])
california_housing_test_f = california_housing_test.drop(columns=["median_house_value"])


california_housing_train_val = pd.concat([california_housing_train_f, california_housing_val_f])
y_train_val = np.concatenate([y_train, y_val])

x_train_val = prepare_X_null(california_housing_train_val)
w0, w = train_linear_regression_reg(x_train_val, y_train_val, r=0.001)

x_test = prepare_X_null(california_housing_test_f)
y_pred = predict_linear_regression(x_test, w0, w)

rmse_score = rmse(y_pred, y_test)

print(f"\nAnswer to question 6:")
print(f"RMSE on combine dataset = {round(rmse_score, 2)}")


Answer to question 6:
RMSE on combine dataset = 0.33

Answer to question 6:
RMSE on combine dataset = 0.33
