In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
housing_data = pd.read_csv('housing_with_ocean_proximity.csv')

In [8]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [12]:
housing_data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [15]:
# Separate features and target variable
# Assuming 'median_house_value' is the target variable in your dataset
X = housing_data.drop('median_house_value', axis=1)
y = housing_data['median_house_value']

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=2024)

In [19]:
# Print info to check the structure of the training data
print(X_train.info())
print(X_train.isna().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 14448 entries, 4722 to 7816
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           14448 non-null  float64
 1   latitude            14448 non-null  float64
 2   housing_median_age  14448 non-null  float64
 3   total_rooms         14448 non-null  float64
 4   total_bedrooms      14286 non-null  float64
 5   population          14448 non-null  float64
 6   households          14448 non-null  float64
 7   median_income       14448 non-null  float64
 8   ocean_proximity     14448 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.1+ MB
None
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        162
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64


In [21]:
# import StandardScaler for standardization and OneHotEncoder for creating dummy variables
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# import SimpleImputer for missing value treatment
from sklearn.impute import SimpleImputer
# importing pipeline class. The Pipeline class is used to create a sequence of data processing steps.
from sklearn.pipeline import Pipeline
# importing ColumnTransformer class to apply different preprocessing steps to different subsets of features in your dataset.
from sklearn.compose import ColumnTransformer

In [23]:
# Set up preprocessing steps for numeric and categorical data
housing_cat = X_train.select_dtypes(include='object').columns
housing_num = X_train.select_dtypes(exclude='object').columns

In [25]:
housing_num

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

In [27]:
# Numeric variables pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

In [29]:
num_pipeline

In [35]:
# Unified preprocessing for both numeric and categorical data
preprocessing = ColumnTransformer([
    ('num', num_pipeline, housing_num),
    ('cat', OneHotEncoder(handle_unknown='ignore'), housing_cat)
])

In [37]:
preprocessing

In [39]:
# applying preprocessing pipeline to train data
check_train = preprocessing.fit_transform(X_train)

In [43]:
# converting array to dataframe to have a better look at it
check_train_df = pd.DataFrame(check_train)
check_train_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.604388,-0.741523,1.536729,-0.620001,-0.716387,-0.79088,-0.721514,0.072853,1.0,0.0,0.0,0.0,0.0
1,0.798753,-0.862998,0.501005,-0.11989,-0.131442,0.218924,-0.043175,0.129574,1.0,0.0,0.0,0.0,0.0
2,-0.845868,1.445014,-1.809456,0.736722,0.333637,0.298266,0.353837,0.825943,0.0,1.0,0.0,0.0,0.0


In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [49]:
model=Ridge()

In [51]:
final_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('model_ridge', model)
])

In [53]:
final_pipeline

In [55]:
# Define the grid of hyperparameters to search
# Note: You can set parameters for any step by using its name followed by a double underscore(__) and the parameter name.for pipelines
grid = dict()
grid['model_ridge__alpha'] = np.arange(0.1,2.1,0.1)

In [57]:
search = GridSearchCV(estimator = final_pipeline, param_grid = grid, scoring = 'neg_mean_absolute_error',cv = 5, n_jobs= -1)
# Fit the GridSearchCV object to the training data
# Fit GridSearchCV to the training data
results = search.fit(X_train, y_train)
print('MAE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)

MAE: 49789.360
Config: {'model_ridge__alpha': 0.1}


In [59]:
# Predict on the test set using the trained model
y_pred = search.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 50091.263223724694
