# `Machine Learning ZoomCamp`
## `HW1_Chapter 2: Regression`

### Aileah Gotladera
-----------------------------------

In [85]:
# Check environment
# import sys
# print(sys.executable)


import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge

The goal of this homework is to create a regression model for predicting housing prices (column `median_house_value`)

### **Loading the dataset and EDA**
*For this homework, we only want to use a subset of data.*

First, keep only the records where ocean_proximity is either `<1H OCEAN` or `INLAND`

Only use the following columns:
* `latitude`,
* `longitude`,
* `housing_median_age`,
* `total_rooms`,
* `total_bedrooms`,
* `population`,
* `households`,
* `median_income`,
* `median_house_value`

In [28]:
# Load the housing dataset from the local CSV file and preview the first rows
DATA_PATH = 'ch2_Regression_hw1data.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [29]:
# Keep only the records whose ocean_proximity is '<1H OCEAN' or 'INLAND',
# then drop that categorical column: only the numeric columns are used below.
# NOTE: this cell rebinds `data`; once the column is dropped, re-running this
# cell without re-running the loading cell raises a KeyError.
KEPT_PROXIMITY = ['<1H OCEAN', 'INLAND']
data = (
    data[data['ocean_proximity'].isin(KEPT_PROXIMITY)]
    .drop(columns='ocean_proximity')
)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0


## **`Q1`**
* There's 1 feature with missing values. What is it?

In [30]:
# Number of missing values in each column of the filtered dataset
data.isnull().sum(axis=0)

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [31]:
# Derive the answer from the data instead of hard-coding the column name:
# list every column that has at least one missing value.
missing_counts = data.isnull().sum()
columns_with_missing = missing_counts[missing_counts > 0].index.tolist()
print(f"Q1 Answer: {', '.join(columns_with_missing)}")

Q1 Answer: total_bedrooms


## **`Q2`**
* What's the median (50th percentile) for variable `population`?

In [82]:
# Median (50th percentile) of the `population` column in the filtered dataset
median_population = data.loc[:, 'population'].median()
print(f'Q2 Answer: {median_population}')

Q2 Answer: 1195.0


### **Prepare and split the dataset**

* Shuffle the dataset (the filtered one you created above), use seed `42`.
* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Apply the log transformation to the `median_house_value` variable using the `np.log1p()` function.

In [33]:
# numpy and train_test_split are already imported in the first cell, so the
# imports are not repeated here.

# Shuffle the dataset with seed 42
data_shuffled = data.sample(frac=1, random_state=42)

# Split the data into train/val/test sets (60%/20%/20%)
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2

# First carve off the training part, then split the remainder between
# validation and test according to their relative shares.
train_data, temp_data = train_test_split(
    data_shuffled, test_size=1 - train_ratio, random_state=42
)
val_data, test_data = train_test_split(
    temp_data, test_size=test_ratio / (test_ratio + val_ratio), random_state=42
)

# Apply the log1p transformation to 'median_house_value'.
# `assign` builds a new frame rather than writing into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_data = train_data.assign(
    median_house_value=np.log1p(train_data['median_house_value'])
)
val_data = val_data.assign(
    median_house_value=np.log1p(val_data['median_house_value'])
)
test_data = test_data.assign(
    median_house_value=np.log1p(test_data['median_house_value'])
)

## **`Q3`**
* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using round(score, 2)
* Which option gives better RMSE?

In [52]:
# Option 1: fill the missing `total_bedrooms` values with 0.
# The column is reassigned instead of using a chained `fillna(inplace=True)`,
# which does not modify the frame when pandas Copy-on-Write is active.
train_zero = train_data.copy()
train_zero['total_bedrooms'] = train_zero['total_bedrooms'].fillna(0)

val_zero = val_data.copy()
val_zero['total_bedrooms'] = val_zero['total_bedrooms'].fillna(0)

# Option 2: fill with the mean, computed on the training split only so that
# no information from the validation split is used.
mean_value = train_data['total_bedrooms'].mean()
train_mean = train_data.copy()
train_mean['total_bedrooms'] = train_mean['total_bedrooms'].fillna(mean_value)

val_mean = val_data.copy()
val_mean['total_bedrooms'] = val_mean['total_bedrooms'].fillna(mean_value)

## Zero: train on the zero-filled training set, evaluate on validation
Xtrain_zero = train_zero.drop('median_house_value', axis=1)
ytrain_zero = train_zero['median_house_value']

Xval_zero = val_zero.drop('median_house_value', axis=1)
yval_zero = val_zero['median_house_value']

model_zero = LinearRegression()
model_zero.fit(Xtrain_zero, ytrain_zero)
ypred_zero = model_zero.predict(Xval_zero)
rmse_zero = np.sqrt(mean_squared_error(yval_zero, ypred_zero))

## Mean: train on the mean-filled training set, evaluate on validation
Xtrain_mean = train_mean.drop('median_house_value', axis=1)
ytrain_mean = train_mean['median_house_value']

Xval_mean = val_mean.drop('median_house_value', axis=1)
yval_mean = val_mean['median_house_value']

model_mean = LinearRegression()
model_mean.fit(Xtrain_mean, ytrain_mean)
# Predict with the model trained on the mean-filled data (the previous code
# used model_zero here, so the "mean" score never evaluated model_mean).
ypred_mean = model_mean.predict(Xval_mean)
rmse_mean = np.sqrt(mean_squared_error(yval_mean, ypred_mean))

print('Quesiton 3 Answer')
print(f'RMSE with Zero: {round(rmse_zero, 2)}')
print(f'RMSE with MEAN: {round(rmse_mean, 2)}')

Quesiton 3 Answer
RMSE with Zero: 0.35
RMSE with MEAN: 0.35


## **`Q4`**

* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0.
* Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which r gives the best RMSE?

*If there are multiple options, select the smallest r.*

Options:

- 0
- 0.000001
- 0.001
- 0.0001

In [86]:
# Candidate regularization strengths, in ascending order
alphas = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]
best_rmse = float('inf')
best_r = None

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(Xtrain_zero, ytrain_zero)
    y_pred = ridge.predict(Xval_zero)
    # Compare RMSE rounded to 2 decimals, as the task requires (the previous
    # code compared raw MSE values stored under the name `best_rmse`).
    rmse_alpha = round(np.sqrt(mean_squared_error(yval_zero, y_pred)), 2)
    # Strict '<' over the ascending list keeps the smallest r on ties.
    if rmse_alpha < best_rmse:
        best_rmse = rmse_alpha
        best_r = alpha

print('Question 4 Answer')
print("Best r value:", best_r)
     

Question 4 Answer
Best r value: 0


## **`Q5`**
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3)) <br>

What's the value of std?
- 0.5
- 0.05
- 0.005
- 0.0005

In [87]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

# NOTE: the loop rebinds train_data / val_data / test_data, so after this cell
# they hold the split of the last seed, not the seed-42 split created earlier.
for seed in seeds:
    # 60%/20%/20% split for this seed
    new_data = data.sample(frac=1, random_state=seed)
    train_data, temp_data = train_test_split(new_data, test_size=0.4, random_state=seed)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=seed)

    # Log-transform the target and fill the missing values with 0.
    # `assign` returns new frames, so nothing is written into the frames
    # produced by train_test_split (no chained in-place mutation).
    train_data = train_data.assign(
        median_house_value=np.log1p(train_data['median_house_value']),
        total_bedrooms=train_data['total_bedrooms'].fillna(0),
    )
    val_data = val_data.assign(
        median_house_value=np.log1p(val_data['median_house_value']),
        total_bedrooms=val_data['total_bedrooms'].fillna(0),
    )

    # Training the model (no regularization)
    X_train = train_data.drop('median_house_value', axis=1)
    y_train = train_data['median_house_value']
    X_val = val_data.drop('median_house_value', axis=1)
    y_val = val_data['median_house_value']

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # `rmse` is not defined anywhere in this notebook; compute the RMSE the
    # same way as in Q3 so the cell runs on a fresh kernel.
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred)))

std_rmse = round(np.std(rmse_scores), 3)

print('Question 5 Answer')
print("Standard deviation of RMSE scores:", std_rmse)

Question 5 Answer
Standard deviation of RMSE scores: 0.006


## **`Q6`**

In [88]:
# Split with seed 9 (60%/20%/20%)
new_data = data.sample(frac=1, random_state=9)
train_data, temp_data = train_test_split(new_data, test_size=0.4, random_state=9)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=9)

# Combine the train and validation parts into the final training set
train_combined = pd.concat([train_data, val_data])

# Log-transform the target and fill the missing values with 0.
# `assign` returns new frames instead of mutating through a chained
# `fillna(inplace=True)`, which has no effect under pandas Copy-on-Write.
train_combined = train_combined.assign(
    median_house_value=np.log1p(train_combined['median_house_value']),
    total_bedrooms=train_combined['total_bedrooms'].fillna(0),
)
test_data = test_data.assign(
    median_house_value=np.log1p(test_data['median_house_value']),
    total_bedrooms=test_data['total_bedrooms'].fillna(0),
)

X_train_combined = train_combined.drop('median_house_value', axis=1)
y_train_combined = train_combined['median_house_value']
X_test = test_data.drop('median_house_value', axis=1)
y_test = test_data['median_house_value']

model = Ridge(alpha=0.001)
model.fit(X_train_combined, y_train_combined)
y_pred_test = model.predict(X_test)
# `rmse` is not defined anywhere in this notebook; compute the RMSE directly
# so the cell runs on a fresh kernel.
rmse_test = round(np.sqrt(mean_squared_error(y_test, y_pred_test)), 2)

print('Question 6 Answer')
print("RMSE on the test dataset:", rmse_test)

Question 6 Answer
RMSE on the test dataset: 0.33


---------------