In [85]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [61]:
# Read the California housing training CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/sample_data/california_housing_train.csv',
)

In [62]:
# Preview the first five rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [63]:
# List the column names; median_house_value is used as the regression target later on.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [64]:
# Count the missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [65]:
# No NULLs in this dataset.
# If there were any NULLs, we could fill them with the column means:
# df.fillna(df.mean(), inplace=True)

In [66]:
# Absolute z-score of every cell, standardised column by column (axis=0 is
# scipy's default). Every column of df takes part, which at this point still
# includes the target median_house_value.
z_scores = np.absolute(zscore(df, axis=0))

In [67]:
# Cut-off, in standard deviations, above which an absolute z-score is treated as an outlier.
threshold = 3

In [68]:
# Flag a row as an outlier when at least one of its columns has an absolute
# z-score above the threshold.
exceeds_threshold = z_scores > threshold
outliers = exceeds_threshold.any(axis=1)

In [69]:
# Sanity check: the mask should have exactly one entry per row of df.
print(len(df), len(outliers), sep="\n")

17000
17000


In [70]:
# Keep only the rows that were not flagged. The mask was built for the
# unfiltered frame, so this cell is not safe to re-run on its own once df
# has been overwritten.
df = df.loc[~outliers]

In [71]:
# Report how many rows the mask flagged (each True counts as 1 in the sum).
print(f"Removed {outliers.sum()} outliers")

Removed 743 outliers


In [72]:
# Separate the regression target from the feature columns.
Y = df['median_house_value']
X = df.drop(columns='median_house_value')

In [73]:
# Univariate feature selection scored with the F-test for regression.
# X has eight candidate columns, so k = 8 keeps all of them.
k = 8
selector = SelectKBest(f_regression, k=k)

In [74]:
# Score the features and keep the selected columns. The selector is fitted on
# every remaining row, i.e. before the train/test split made further down.
X_Selected = selector.fit(X, Y).transform(X)

In [75]:
# Map the selector's boolean support mask back to the column names it kept.
support_mask = selector.get_support()
selected_features = X.columns[support_mask]
print(selected_features)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')


In [76]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_Selected,
    Y,
    test_size=0.2,
    random_state=42,
)

In [77]:
# Feature standardiser; it is fitted on the training split only in the cell that follows.
scaler = StandardScaler()

In [78]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set statistics are used.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [79]:
# Baseline model: linear regression with default settings.
model = LinearRegression()

In [80]:
# Fit the baseline on the standardised training features.
model.fit(X_train_scaled, Y_train)

In [81]:
# Predict the target for the held-out (standardised) test rows.
y_pred = model.predict(X_test_scaled)

In [82]:
# Evaluate the baseline on the test split: compute both metrics, then report them.
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 4694399998.047193
R² Score: 0.6261966948798126


In [83]:
#################### Grid Search for Linear Regression #########################

In [93]:
# 5-fold cross-validated search over LinearRegression's fit_intercept option,
# run on the already-standardised training features.
param_grid = {'fit_intercept': [True, False]}

grid_search = GridSearchCV(
    LinearRegression(),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train_scaled, Y_train)

# The scorer is the negated MSE, so flip the sign to report a plain MSE.
best_params, best_score = grid_search.best_params_, -grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score (MSE): {best_score}")

Best Parameters: {'fit_intercept': True}
Best Cross-Validation Score (MSE): 4457281033.786706


In [94]:
################## Ridge Regression ###########################

In [95]:
from sklearn.linear_model import Ridge

In [97]:
# Regularisation strengths to try for Ridge, one decade apart.
param_grid_ridge = {'alpha': [0.1, 1, 10, 100]}

In [98]:
# 5-fold grid search over the Ridge alpha values, scored by negated MSE.
grid_search_ridge = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [100]:
# Run the cross-validated search over alpha on the standardised training data.
grid_search_ridge.fit(X_train_scaled, Y_train)

# best_estimator_ is the Ridge model refitted on the whole training split with
# the winning alpha (GridSearchCV's default refit behaviour).
best_model_ridge = grid_search_ridge.best_estimator_
best_params_ridge = grid_search_ridge.best_params_
# The scorer is the negated MSE, so flip the sign to get a plain MSE that can
# be compared directly with the score reported by the LinearRegression search.
best_score_ridge = -grid_search_ridge.best_score_
print("Best Hyperparameters (Ridge):", best_params_ridge)
print(f"Best Cross-Validation Score (MSE, Ridge): {best_score_ridge}")


Best Hyperparameters (Ridge): {'alpha': 1}


In [101]:
# Predict the target for the held-out test rows with the tuned Ridge model.
y_pred_ridge = best_model_ridge.predict(X_test_scaled)

In [102]:
# Evaluate the tuned Ridge model on the test split. Note that mse and r2 are
# reused here, replacing the values computed for the LinearRegression baseline.
mse = mean_squared_error(Y_test, y_pred_ridge)
r2 = r2_score(Y_test, y_pred_ridge)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 4694422030.007876
R² Score: 0.6261949405300073
