In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
df = pd.read_csv(r'housing.csv')
df

In [None]:
# Checking if the median_housing_value has a long tail
# Plot a histogram of the 'median_house_value' variable
plt.figure(figsize=(10, 6))
plt.hist(df['median_house_value'], bins=30, edgecolor='k', alpha=0.7)
plt.title('Histogram of Median House Value')
plt.xlabel('Median House Value')
plt.ylabel('Frequency')
plt.grid(True)

plt.show()

In [None]:
# Looking for outliers, as indicated in https://ww2.amstat.org/publications/jse/v19n3/decock.pdf
plt.scatter(df.GrLivArea, df.SalePrice, c = "blue", marker = "s")
plt.title("Looking for outliers")
plt.xlabel("GrLivArea")
plt.ylabel("SalePrice")
plt.show()

df = df[df.GrLivArea < 4000]

In [None]:
filtered_df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

# Select only the specified columns
selected_columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value'
]

filtered_df = filtered_df[selected_columns]

# Print the first few rows of the filtered and selected DataFrame
print(filtered_df.head())

In [15]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

# Question 1 - Find the columns with the missing value

In [10]:
# Check for missing values and sum them by column
missing_values = filtered_df.isnull().sum()

# Filter columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]

# Print the columns with missing values
print(columns_with_missing_values)

# The answer is total bedrooms

total_bedrooms    157
dtype: int64


# Question 2 -  median for the 'population' 

In [16]:
# Calculate the median for the 'population' column
median_population = filtered_df['population'].median()
median_population

1195.0

# Question 3 

In [45]:
# Set a random seed for reproducibility
np.random.seed(42)

df_encoded = pd.get_dummies(df, columns=['ocean_proximity'], prefix='ocean')
# Shuffle the dataset
shuffled_df = df_encoded.sample(frac=1, random_state=42)

# Split the data into train (60%), validation (20%), and test (20%) sets
train_df, temp_df = train_test_split(shuffled_df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Apply log transformation to 'median_house_value' using np.log1p()
train_df['median_house_value'] = np.log1p(train_df['median_house_value'])
val_df['median_house_value'] = np.log1p(val_df['median_house_value'])
test_df['median_house_value'] = np.log1p(test_df['median_house_value'])

# Print the shapes of the train, validation, and test sets
print("Train set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)


Train set shape: (12384, 14)
Validation set shape: (4128, 14)
Test set shape: (4128, 14)


In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [46]:
X_train = train_df.drop(columns=['median_house_value'])
y_train = train_df['median_house_value']
X_val = val_df.drop(columns=['median_house_value'])
y_val = val_df['median_house_value']

In [48]:
# Option 1: Fill missing values with 0
X_train_option1 = X_train.copy()
X_train_option1['total_bedrooms'].fillna(0, inplace=True)
X_val_option1 = X_val.copy()
X_val_option1['total_bedrooms'].fillna(0, inplace=True)

X_train_option1.shape
X_val_option1.shape

(4128, 13)

In [49]:
# Option 1: Fill missing values with the mean
mean_total_bedrooms = X_train['total_bedrooms'].mean()
X_train_option2 =X_train.copy()
X_train_option2.fillna(mean_total_bedrooms, inplace=True)
X_val_option2 = X_val.copy()
X_val_option2['total_bedrooms'].fillna(mean_total_bedrooms, inplace=True)
X_train_option2.shape


(12384, 13)

In [50]:
# Train linear regression models for both options
model_option1 = LinearRegression()
model_option1.fit(X_train_option1, y_train)

model_option2 = LinearRegression()
model_option2.fit(X_train_option2, y_train)

In [51]:
# Make predictions on the validation set for both options
y_pred_option1 = model_option1.predict(X_val_option1)
y_pred_option2 = model_option2.predict(X_val_option2)

In [52]:
# Calculate RMSE for both options
rmse_option1 = np.sqrt(mean_squared_error(y_val, y_pred_option1))
rmse_option2 = np.sqrt(mean_squared_error(y_val, y_pred_option2))

# Print RMSE scores rounded to 2 decimal digits
print("RMSE (Option 1 - Fill with 0):", round(rmse_option1, 2))
print("RMSE (Option 2 - Fill with Mean):", round(rmse_option2, 2))

# Compare which option has a better RMSE
if rmse_option1 < rmse_option2:
    print("Option 1 (Fill with 0) has a better RMSE.")
else:
    print("Option 2 (Fill with Mean) has a better RMSE.")

RMSE (Option 1 - Fill with 0): 0.34
RMSE (Option 2 - Fill with Mean): 0.34
Option 1 (Fill with 0) has a better RMSE.


In [56]:
from sklearn.linear_model import Ridge

values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# Dictionary to store RMSE scores for each r value
rmse_scores = {}

# Train Ridge regression models with different r values and calculate RMSE
for r in values:
    model = Ridge(alpha = r)
    model.fit(X_train_option1, y_train)
    y_pred = model.predict(X_val_option1)
    rmse = mean_squared_error(y_val, y_pred)
    rmse_scores[r] = round(rmse, 2)
    
# Find the r value that gives the best RMSE
best_r = min(rmse_scores, key=rmse_scores.get)
best_rmse = rmse_scores[best_r]

# Print RMSE scores and the best r value
for r, rmse in rmse_scores.items():
    print(f"RMSE (r={r}): {rmse}")
print(f"Best RMSE (best_r={best_r}): {best_rmse}")

RMSE (r=0): 0.11
RMSE (r=1e-06): 0.11
RMSE (r=0.0001): 0.11
RMSE (r=0.001): 0.11
RMSE (r=0.01): 0.11
RMSE (r=0.1): 0.11
RMSE (r=1): 0.11
RMSE (r=5): 0.11
RMSE (r=10): 0.11
Best RMSE (best_r=0): 0.11


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
