In [27]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [3]:
# Load the housing dataset.
# The CSV location can be overridden with the HOUSING_CSV environment variable so the
# notebook can run on other machines; when it is unset (or empty) the original local
# path is used, so existing behavior is unchanged.
HOUSING_CSV_PATH = os.environ.get("HOUSING_CSV") or "C:\\Users\\asaha\\Downloads\\housing.csv"
df = pd.read_csv(HOUSING_CSV_PATH)

In [5]:
# Basic dataset information: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes its report itself and returns None, so it is called bare;
# wrapping it in print() appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None


In [6]:
# Basic summary statistics for the dataset.
# Left as the cell's last expression so Jupyter renders the result as a table
# instead of the line-wrapped plain text that print() produces for wide frames.
df.describe()

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.499900   
25%        296.00000

In [7]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


## DATA PREPROCESSING 

In [8]:
# Separate the prediction target from the explanatory columns
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Group the feature columns by dtype so each group can be given its own preprocessing
numerical_features = list(X_train.select_dtypes(include='number').columns)
categorical_features = list(X_train.select_dtypes(include='object').columns)

In [17]:
# Numeric columns: fill missing values with the column median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [18]:
# Categorical columns: fill missing values with the most frequent one, then one-hot encode.
# handle_unknown='ignore' keeps transform from raising on categories not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ],
)

In [20]:
# Route each column group through its matching pipeline.
# NOTE: `preprocessor` is rebuilt further down, once the engineered ratio columns
# have been registered in numerical_features; that later object is the one fitted.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

## FEATURE ENGINEERING

In [21]:
# Adding new feature: rooms_per_household (total_rooms divided by households).
# assign() returns a new frame instead of writing a column into the split output
# in place, which can trigger pandas' SettingWithCopyWarning on older versions.
X_train = X_train.assign(rooms_per_household=lambda frame: frame['total_rooms'] / frame['households'])
X_test = X_test.assign(rooms_per_household=lambda frame: frame['total_rooms'] / frame['households'])

In [22]:
# Adding new feature: bedrooms_per_room (total_bedrooms divided by total_rooms).
# Rows where total_bedrooms is missing yield NaN for this ratio as well.
# assign() returns a new frame instead of writing a column into the split output
# in place, which can trigger pandas' SettingWithCopyWarning on older versions.
X_train = X_train.assign(bedrooms_per_room=lambda frame: frame['total_bedrooms'] / frame['total_rooms'])
X_test = X_test.assign(bedrooms_per_room=lambda frame: frame['total_bedrooms'] / frame['total_rooms'])

In [23]:
# Adding new feature: population_per_household (population divided by households).
# assign() returns a new frame instead of writing a column into the split output
# in place, which can trigger pandas' SettingWithCopyWarning on older versions.
X_train = X_train.assign(population_per_household=lambda frame: frame['population'] / frame['households'])
X_test = X_test.assign(population_per_household=lambda frame: frame['population'] / frame['households'])

In [24]:
# Register the engineered columns as numerical features.
# Only names that are not already listed get appended, so re-running this cell
# (or rebuilding numerical_features after the columns exist) cannot create
# duplicate entries in the column list handed to the ColumnTransformer.
engineered_features = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
numerical_features.extend(
    [name for name in engineered_features if name not in numerical_features]
)

In [25]:
# Rebuild the preprocessor so the numeric branch also covers the engineered columns
num_branch = ('num', numerical_transformer, numerical_features)
cat_branch = ('cat', categorical_transformer, categorical_features)
preprocessor = ColumnTransformer(transformers=[num_branch, cat_branch])

In [26]:
# Fit the preprocessor on the training split and transform it in one step,
# then apply the already-fitted preprocessor to the test split (transform only, no refit).
X_train_preprocessed = preprocessor.fit_transform(X=X_train)
X_test_preprocessed = preprocessor.transform(X=X_test)

## FEATURE SELECTION

In [28]:
# Keep the k features with the highest f_regression score.
# The selector is fitted on the training split only and then applied to both splits.
k = 10
feature_selector = SelectKBest(score_func=f_regression, k=k).fit(X_train_preprocessed, y_train)
X_train_selected = feature_selector.transform(X_train_preprocessed)
X_test_selected = feature_selector.transform(X_test_preprocessed)

In [29]:
# Show the f_regression score computed for every preprocessed feature
# (scores_ covers all candidate features, not only the k that were kept)
feature_scores = feature_selector.scores_
print(f"Feature scores:\n {feature_scores}")

Feature scores:
 [3.55434501e+01 3.44579030e+02 1.79495552e+02 3.01823325e+02
 3.80956840e+01 1.11961694e+01 6.72961323e+01 1.50574391e+04
 4.25373449e+02 1.17166588e+03 8.01650853e+00 1.18266512e+03
 5.01979052e+03 7.27256638e+00 4.45720380e+02 3.03080897e+02]


In [30]:
# Report the shape of the selected training and testing feature matrices,
# which are the inputs intended for model training
print(f"Training features shape: {X_train_selected.shape}")
print(f"Testing features shape: {X_test_selected.shape}")

Training features shape: (16512, 10)
Testing features shape: (4128, 10)
