In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
housing = pd.read_csv(r'housing.csv')
housing.head(10)

In [63]:
housing.isnull().sum()

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
median_house_value              0
rooms_per_household             0
bedrooms_per_room             207
population_per_household        0
ocean_proximity_<1H OCEAN       0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64

In [None]:
filtered_df = housing[housing['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

# Select only the specified columns
selected_columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value'
]

filtered_df = filtered_df[selected_columns]

# Print the first few rows of the filtered and selected DataFrame
print(filtered_df.head())

In [51]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# Log transform the target for official scoring
filtered_df["median_house_value"] = np.log1p(filtered_df["median_house_value"])

def fill_with_zero(data):
    data['total_bedrooms'].fillna(0, inplace=True)
    return data 

def fill_with_mean(data):
    mean_value = housing["total_bedrooms"].mean()
    data['total_bedrooms'].fillna(mean_value, inplace=True)
    return data

def shuffle_dataset(data):
    np.random.seed(42)
    shuffle_df = data.sample(frac=1, random_state=42)
    return shuffle_df

def split_dataset(data, state):
    test_df,temp_df = train_test_split(data, test_size=0.4, random_state = state)
    train_df, val_df = train_test_split(temp_df, test_size=0.5, random_state= state)
    return test_df, train_df, val_df

  corr_matrix = housing.corr()


In [54]:
# Differentiate numerical features (minus the target) and categorical features
categorical_features = housing.select_dtypes(include = ["object"]).columns
numerical_features = housing.select_dtypes(exclude = ["object"]).columns
numerical_features = numerical_features.drop("median_house_value")
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = housing[numerical_features]
train_cat = housing[categorical_features]

Numerical features : 11
Categorical features : 1


In [70]:
model = LinearRegression()
fill_0 = fill_with_zero(filtered_df)
fill_mean = fill_with_mean(filtered_df)
def get_values(data,seed):
    train, test, val = split_dataset(data,seed)
    X_train = train.drop("median_house_value", axis=1)
    y_train = train['median_house_value']
    
    X_test = test.drop("median_house_value", axis=1)
    y_test = test['median_house_value']

    X_val = val.drop("median_house_value", axis=1)
    y_val = val['median_house_value']
    return X_train, y_train, X_val, y_val

In [71]:
X_train_1, y_train_1, X_val_1, y_val_1 = get_values(fill_0,42)
X_train_2, y_train_2, X_val_2, y_val_2 = get_values(fill_mean,42)
model_1 = model.fit(X_train_1, y_train_1)
model_2 = model.fit(X_train_2, y_train_2)



latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

In [28]:
# Make predictions on the validation set for both options
y_pred_option1 = model_1.predict(X_val_1)
y_pred_option2 = model_2.predict(X_val_2)
# Calculate RMSE for both options
rmse_option1 = np.sqrt(mean_squared_error(y_val_1, y_pred_option1))
rmse_option2 = np.sqrt(mean_squared_error(y_val_2, y_pred_option2))

# Print RMSE scores rounded to 2 decimal digits
print("RMSE (Option 1 - Fill with 0):", round(rmse_option1, 2))
print("RMSE (Option 2 - Fill with Mean):", round(rmse_option2, 2))

# Compare which option has a better RMSE
if rmse_option1 < rmse_option2:
    print("Option 1 (Fill with 0) has a better RMSE.")
else:
    print("Option 2 (Fill with Mean) has a better RMSE.")

RMSE (Option 1 - Fill with 0): 0.03
RMSE (Option 2 - Fill with Mean): 0.03
Option 2 (Fill with Mean) has a better RMSE.


In [39]:

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to your training data and transform it
X_train_scaled = scaler.fit_transform(X_train_1)

# Transform the validation and test data using the same scaler
X_val_scaled = scaler.transform(X_val_1)

In [45]:
# Dictionary to store RMSE scores for each r value
rmse_scores = {}

values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# Train Ridge regression models with different r values and calculate RMSE
model = Ridge(alpha = 5)
model.fit(X_train_scaled, y_train_1)
y_pred = model.predict(X_val_scaled)
rmse = mean_squared_error(y_val_1, y_pred)
rmse_scores = round(rmse, 2)
rmse_scores


0.0

In [35]:
ridge = RidgeCV(alphas = [ 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10])
ridge.fit(X_train_1, y_train_1)
alpha = ridge.alpha_
print("Best alpha :", alpha)

Best alpha : 1.0


In [57]:
housing = shuffle_dataset(housing)
housing = pd.get_dummies(housing)
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
9337,-122.59,37.97,46.0,4036.0,856.0,1872.0,833.0,4.5625,275200.0,4.845138,0.212091,2.247299,0,0,0,0,1
20253,-119.19,34.23,17.0,3889.0,748.0,2415.0,739.0,4.5000,234300.0,5.262517,0.192337,3.267930,0,0,0,0,1
12117,-117.21,33.95,5.0,8403.0,1240.0,3962.0,1150.0,5.2174,155500.0,7.306957,0.147566,3.445217,0,1,0,0,0
3304,-122.63,38.96,20.0,2507.0,577.0,1072.0,457.0,2.3083,60200.0,5.485777,0.230156,2.345733,0,1,0,0,0
13685,-117.24,34.15,26.0,2041.0,293.0,936.0,375.0,6.0000,140200.0,5.442667,0.143557,2.496000,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2463,-119.87,36.54,34.0,1370.0,287.0,818.0,269.0,2.4044,72500.0,5.092937,0.209489,3.040892,0,1,0,0,0
6895,-118.11,34.04,28.0,3913.0,696.0,2264.0,697.0,5.2446,258000.0,5.614060,0.177869,3.248207,1,0,0,0,0
12725,-121.35,38.61,27.0,3900.0,776.0,1549.0,761.0,2.7788,115700.0,5.124836,0.198974,2.035480,0,1,0,0,0
18498,-121.51,37.02,19.0,2372.0,394.0,1142.0,365.0,4.0238,374600.0,6.498630,0.166105,3.128767,0,1,0,0,0


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Define the seed values to try
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Initialize an empty list to store RMSE scores
rmse_scores = []

for seed in seed_values:
    housing = housing.drop(columns = ['bedrooms_per_room'], axis =True)
    fill_with_zero(housing)
    # Split the data into train/validation/test sets with the current seed
    X_train, y_train, X_val, y_val= get_values(housing, seed)
    # Fill missing values with 0
    X_train_imputed = fill_with_zero(X_train)
    X_val_imputed = fill_with_zero(X_val)
    
#     # Train a linear regression model without regularization
#     model = LinearRegression()
#     model.fit(X_train_imputed, y_train)
    
#     # Make predictions on the validation data
#     y_pred_val = model.predict(X_val_imputed)
    
#     # Calculate RMSE on the validation data
#     rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    
#     # Append the RMSE score to the list
#     rmse_scores.append(rmse)

# # Compute the standard deviation of all the RMSE scores
# std_deviation = np.std(rmse_scores)

# # Round the result to 3 decimal digits
# std_deviation = round(std_deviation, 3)

# # Print the standard deviation of RMSE scores
# print("Standard Deviation of RMSE Scores:", std_deviation)

X_train_imputed.isnull().sum()

    