In [319]:
# Absolute path to the ingested training CSV (raw string, so the Windows backslashes are not escapes).
# NOTE(review): machine-specific path containing a timestamped run folder — this only resolves where that folder exists.
train_file_path = r"E:\machine_learning_project_\housing\artifact\data_ingestion\2025-01-27-14-50-46\ingested_data\train\housing.csv"

In [320]:
import pandas as pd 

In [321]:
# Load the training CSV into a DataFrame.
df= pd.read_csv(train_file_path)

In [322]:
# (rows, columns) of the loaded training frame.
df.shape

(16512, 10)

In [323]:
# Column labels of the loaded training frame.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [324]:
# Separate the predictors (x) from the prediction target (y).
# y is selected with a list so it stays a one-column DataFrame.
target_column = "median_house_value"
x = df.drop(columns=[target_column])
y = df[[target_column]]

In [325]:
# Count missing values in each feature column.
x.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [326]:
from sklearn.impute import SimpleImputer

In [327]:
# Every feature except the single text column; this is the frame the median imputer is fitted on.
numerical_column = x.drop(columns="ocean_proximity")

In [328]:
# Imputer configured to replace missing values with the per-column median.
simple_imputer= SimpleImputer(strategy="median")


In [329]:
# fit_transform returns a new array and leaves numerical_column / x untouched.
# Store it under its own name: rebinding `df` here would replace the loaded
# DataFrame with an unlabeled array under the same name.
numerical_imputed = simple_imputer.fit_transform(numerical_column)

In [330]:
# Re-check x: the imputer call returned its result separately, so x itself still holds the missing values.
x.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [331]:
# Column names the imputer recorded during fit.
simple_imputer.feature_names_in_

array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype=object)

In [332]:
# Per-column fill values learned during fit, in the same order as feature_names_in_.
simple_imputer.statistics_

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

In [333]:
# Cross-check: the latitude median computed by pandas, to compare with the imputer's statistics_ entry.
x.latitude.median()

34.26

In [334]:
# Number of rows per ocean_proximity category.
x.ocean_proximity.value_counts()

<1H OCEAN     7277
INLAND        5262
NEAR OCEAN    2124
NEAR BAY      1847
ISLAND           2
Name: ocean_proximity, dtype: int64

In [335]:
# Rebinds simple_imputer to a new, unfitted imputer that fills with the most frequent value.
# NOTE(review): this replaces the fitted median imputer inspected in the cells above.
simple_imputer=SimpleImputer(strategy="most_frequent")

In [336]:
from sklearn.base import BaseEstimator, TransformerMixin

In [337]:
class Test(BaseEstimator,TransformerMixin):
    """Minimal DataFrame imputer that fills missing values column by column.

    Parameters
    ----------
    strategy : {"median", "mean", "most_frequent"}, default="median"
        Statistic computed for each column during ``fit`` and used as the
        fill value during ``transform``.

    Attributes
    ----------
    features_ : pandas.Index
        Column labels seen during ``fit``.
    statistics_ : list
        One fill value per entry of ``features_``, in the same order.
    """

    def __init__(self, strategy="median"):
        self.strategy = strategy

    def _column_statistic(self, series):
        """Return the fill value for one column according to ``self.strategy``."""
        if self.strategy == "median":
            return series.median()
        if self.strategy == "mean":
            return series.mean()
        if self.strategy == "most_frequent":
            modes = series.mode()
            # mode() is empty when the column holds only missing values.
            return modes.iloc[0] if len(modes) else float("nan")
        raise ValueError(
            f"Unsupported strategy {self.strategy!r}; "
            "expected 'median', 'mean' or 'most_frequent'."
        )

    def fit(self, X, y=None):
        """Learn one fill value per column of X.

        ``y`` is accepted and ignored so the transformer can be used inside a
        scikit-learn Pipeline, which passes it positionally.
        """
        self.features_ = X.columns
        self.statistics_ = [self._column_statistic(X[column]) for column in X.columns]
        return self

    def transform(self, X: pd.DataFrame):
        """Return a copy of X with missing values replaced by the fitted statistics.

        Only the columns seen during ``fit`` are filled; X itself is not modified.
        """
        # Pair each fitted column with its own statistic so the fill values
        # cannot drift out of step with the columns (or pick up columns from
        # an unrelated global frame).
        fill_values = dict(zip(self.features_, self.statistics_))
        # fillna returns a new frame; that result is what gets handed back.
        return X.fillna(value=fill_values)
            

In [338]:
# Fit the hand-written Test transformer (default strategy) on numerical_column; fit returns the instance itself.
response = Test().fit(numerical_column)

In [339]:
# Per-column statistics stored by Test.fit, for comparison with SimpleImputer's statistics_.
response.statistics_

[-118.51, 34.26, 29.0, 2119.0, 433.0, 1164.0, 408.0, 3.54155]

In [340]:
# List each column of x next to its position.
# x still contains ocean_proximity, so it has one more column than numerical_column.
for position, label in enumerate(x.columns):
    print(position, label)

0 longitude
1 latitude
2 housing_median_age
3 total_rooms
4 total_bedrooms
5 population
6 households
7 median_income
8 ocean_proximity


In [341]:
#simple_imputer.fit_transform(df)

Custom transformer: a scikit-learn-compatible class that appends per-household and per-room ratio features.

In [344]:
from sklearn.base  import BaseEstimator, TransformerMixin

In [345]:
# Feature column labels (the target column was dropped when x was built).
x.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [346]:
import numpy as np

In [347]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Column labels that FeatureGenerator looks up to find its input columns
# when it is constructed with an explicit `columns` sequence.
COLUMN_TOTAL_ROOMS = "total_rooms"
COLUMN_POPULATION = "population"
COLUMN_HOUSEHOLDS = "households"
COLUMN_TOTAL_BEDROOMS = "total_bedrooms"


class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D numeric feature matrix.

    The transformer adds, to the right of the input columns:

    * rooms per household       (total_rooms / households)
    * population per household  (population / households)
    * bedrooms per room         (total_bedrooms / total_rooms), optional

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the bedrooms-per-room column.
    total_rooms_ix, population_ix, households_ix, total_bedrooms_ix : int
        Positional indices of the source columns. Used as given only when
        ``columns`` is None.
    columns : sequence of str, optional
        Column labels in the order they appear in X. When provided, the four
        indices are looked up by label and the positional arguments are
        ignored.

    Raises
    ------
    ValueError
        If ``columns`` is provided but lacks one of the required labels.
    """

    def __init__(self, add_bedrooms_per_room=True,
                 total_rooms_ix=3,
                 population_ix=5,
                 households_ix=6,
                 total_bedrooms_ix=4,
                 columns=None):
        # Keep the object exactly as passed so get_params()/clone() see the
        # same value that was handed to the constructor.
        self.columns = columns
        if columns is not None:
            # list(...) makes label lookup work for any sequence (tuple,
            # pandas Index, ndarray), not only objects with an .index method.
            labels = list(columns)
            required = (COLUMN_TOTAL_ROOMS, COLUMN_POPULATION,
                        COLUMN_HOUSEHOLDS, COLUMN_TOTAL_BEDROOMS)
            missing = [label for label in required if label not in labels]
            if missing:
                raise ValueError(
                    f"columns is missing required entries: {missing}"
                )
            total_rooms_ix = labels.index(COLUMN_TOTAL_ROOMS)
            population_ix = labels.index(COLUMN_POPULATION)
            households_ix = labels.index(COLUMN_HOUSEHOLDS)
            total_bedrooms_ix = labels.index(COLUMN_TOTAL_BEDROOMS)

        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.total_rooms_ix = total_rooms_ix
        self.population_ix = population_ix
        self.households_ix = households_ix
        self.total_bedrooms_ix = total_bedrooms_ix

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self unchanged."""
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right.

        X may be a 2-D array or a DataFrame of numeric columns; the result is
        a NumPy array. A zero in the households or total_rooms column is not
        guarded against here; the division follows NumPy's usual behavior.
        """
        # Positional slicing below needs an ndarray; asarray converts a
        # DataFrame and passes an existing ndarray through without copying.
        X = np.asarray(X)

        households = X[:, self.households_ix]
        total_rooms = X[:, self.total_rooms_ix]

        room_per_household = total_rooms / households
        population_per_household = X[:, self.population_ix] / households

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, self.total_bedrooms_ix] / total_rooms
            return np.c_[X, room_per_household, population_per_household, bedrooms_per_room]

        return np.c_[X, room_per_household, population_per_household]


In [348]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


In [377]:
# Numeric branch: fill gaps with the median, append the ratio features, then standardize.
# FeatureGenerator() is used with its default positional indices, so it relies on
# the column order of the data routed into this pipeline.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("feature_generator", FeatureGenerator()),
    ("scaling", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [378]:
from sklearn.preprocessing import OneHotEncoder

In [379]:
# Categorical branch: fill gaps with the most frequent value, one-hot encode, then scale.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category that was absent during fit is encoded
    # as all zeros at transform time instead of raising, so the saved pipeline
    # keeps working on data containing a rare or new category.
    ('oneHotEncoder', OneHotEncoder(handle_unknown="ignore")),
    # with_mean=False scales without centering; centering is not supported on
    # the sparse matrix the encoder produces by default.
    ('scaling', StandardScaler(with_mean=False))
])

In [380]:
# Read the training CSV into df; this frame is what the preprocessing object is fitted on below.
df= pd.read_csv(train_file_path)

In [381]:
# Column labels available for routing into the numeric and categorical pipelines.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [382]:
# Columns routed to the numeric pipeline. The order matters: FeatureGenerator's
# default indices (3, 4, 5, 6) address total_rooms, total_bedrooms, population
# and households by position in this list.
num_column_name = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Columns routed to the categorical pipeline.
cat_column_name = ['ocean_proximity']

In [383]:
# Route each column group through its own pipeline and concatenate the results.
# No `remainder` is given, so columns listed in neither group are left to the
# ColumnTransformer default.
column_routes = [
    ("num_pipeline", num_pipeline, num_column_name),
    ("cat_pipeline", cat_pipeline, cat_column_name),
]
preprocessing = ColumnTransformer(transformers=column_routes)

In [384]:
# Fit the preprocessing object on the training frame and display the transformed feature matrix.
preprocessing.fit_transform(df)

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  2.9869105 ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

In [385]:
import dill

In [386]:
# Serialize the fitted preprocessing object to disk with dill.
with open('preprocessing.pkl', mode="wb") as pickle_sink:
    dill.dump(preprocessing, pickle_sink)

In [387]:
# Read the serialized preprocessing object back from disk.
with open('preprocessing.pkl', mode="rb") as pickle_source:
    preprocessing_loaded_obj = dill.load(pickle_source)

In [392]:
# Absolute path to the ingested test CSV (raw string, so the Windows backslashes are not escapes).
# NOTE(review): machine-specific path containing a timestamped run folder — this only resolves where that folder exists.
test_file_path = r"E:\machine_learning_project_\housing\artifact\data_ingestion\2025-01-27-14-50-46\ingested_data\test\housing.csv"

In [393]:
# Load the test CSV into a DataFrame.
test_df =pd.read_csv(test_file_path)

In [394]:
# Apply the reloaded preprocessing object to the test frame (transform only; nothing is refitted here).
preprocessing_loaded_obj.transform(test_df)

array([[ 0.59229422, -0.71065803,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [-0.42180959, -0.35049119, -0.37006852, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.56232071, -0.64985064,  0.5842485 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.07211862, -0.56097831,  1.14093342, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.83208232, -0.93985512,  0.10708999, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.50736927, -0.67791559,  0.5842485 , ...,  0.        ,
         0.        ,  0.        ]])