In [30]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Read the training CSV into a DataFrame and display it as the cell output
# for a first look at the columns and their missing values.
df = pd.read_csv(filepath_or_buffer='train_missing.csv')
df

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,33.0,3126.0,,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
1,49.0,3382.0,,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
2,,1897.0,,915.0,336.0,4.1563,172600.0,NEAR OCEAN
3,36.0,1421.0,,1418.0,,1.9425,93400.0,NEAR OCEAN
4,43.0,2382.0,,874.0,380.0,3.5542,96500.0,INLAND
...,...,...,...,...,...,...,...,...
16507,35.0,1330.0,,658.0,217.0,6.3700,229200.0,<1H OCEAN
16508,33.0,3084.0,,,449.0,,97800.0,INLAND
16509,36.0,2101.0,569.0,1756.0,527.0,2.9344,222100.0,<1H OCEAN
16510,15.0,3575.0,,1777.0,559.0,5.7192,283500.0,<1H OCEAN


In [31]:
# Read the train and test splits from their CSV files.
train_df = pd.read_csv(filepath_or_buffer='train_missing.csv')
test = pd.read_csv(filepath_or_buffer='test_missing.csv')

# Target: the 'median_house_value' column of each split.
y_train, y_test = train_df['median_house_value'], test['median_house_value']

# Features: every column except the target and 'total_bedrooms'.
# NOTE(review): 'total_bedrooms' is presumably dropped because it is mostly
# missing in the preview of the training data -- confirm.
X_train = train_df.drop(columns=['median_house_value', 'total_bedrooms'])
X_test = test.drop(columns=['median_house_value', 'total_bedrooms'])

In [32]:
# Split the feature columns by type: every float64/int64 column of X_train is
# treated as numerical; the single categorical column is listed by name.
numeric_frame = X_train.select_dtypes(include=['float64', 'int64'])
numerical_cols = list(numeric_frame.columns)
categorical_cols = ['ocean_proximity']

In [33]:
# Numeric pipeline: fill missing values with the column mean, then rescale
# each column with min-max scaling.
numeric_preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaling', MinMaxScaler()),
]

numeric_pipeline = Pipeline(numeric_preprocessing_steps)

In [34]:
# Categorical pipeline: fill missing values (NaN) with the most frequent
# category, then one-hot encode.
#
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category that was not present when the encoder was fitted, instead of
# raising at transform time. The test split is read from a separate file, so
# it is not guaranteed to contain only categories seen in the training split.
# Output for categories seen during fit is unchanged.
categorical_pipeline = Pipeline([
    ('mode_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [35]:
# Route each group of columns through its own pipeline and concatenate the
# results: numerical columns first, then the one-hot encoded categorical ones.
# With ColumnTransformer's default settings, columns listed in neither group
# are dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

In [36]:
# Fit the preprocessor on the training features only, then apply the already
# fitted transformer to the test features (no refitting on test data).
X_transformed_train = preprocessor.fit_transform(X=X_train)
X_test_transform = preprocessor.transform(X=X_test)


In [37]:
# Before the pipeline: first five rows of the raw training features.
X_train.head(n=5)

Unnamed: 0,housing_median_age,total_rooms,population,households,median_income,ocean_proximity
0,33.0,3126.0,2300.0,623.0,3.2596,NEAR OCEAN
1,49.0,3382.0,1314.0,756.0,3.8125,NEAR OCEAN
2,,1897.0,915.0,336.0,4.1563,NEAR OCEAN
3,36.0,1421.0,1418.0,,1.9425,NEAR OCEAN
4,43.0,2382.0,874.0,380.0,3.5542,INLAND


In [38]:
# After the pipeline: first five rows of the transformed training features,
# labelled with the output feature names reported by the fitted preprocessor.
pd.DataFrame(
    data=X_transformed_train,
    columns=preprocessor.get_feature_names_out(),
).head(n=5)


Unnamed: 0,num__housing_median_age,num__total_rooms,num__population,num__households,num__median_income,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN
0,0.627451,0.079455,0.06438,0.102286,0.190322,0.0,0.0,0.0,0.0,1.0
1,0.941176,0.085966,0.036744,0.124157,0.228452,0.0,0.0,0.0,0.0,1.0
2,0.542265,0.048197,0.025561,0.05509,0.252162,0.0,0.0,0.0,0.0,1.0
3,0.686275,0.03609,0.039659,0.081888,0.099488,0.0,0.0,0.0,0.0,1.0
4,0.823529,0.060532,0.024412,0.062325,0.210638,0.0,1.0,0.0,0.0,0.0
