In [1]:
# standard library
import os

# import main (major) libraries
import numpy as np
import pandas as pd

# base classes for custom transformers
from sklearn.base import BaseEstimator, TransformerMixin

# impute null values
from sklearn.impute import SimpleImputer

# Model selection and hyperparameter tuning
from sklearn.model_selection import train_test_split

# sklearn pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

# Features engineering
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# DataFrame column selector
from sklearn_features.transformers import DataFrameSelector

In [2]:
# Resolve both dataset files relative to the current working directory
curr_dir = os.getcwd()
file_features_name = "cal_housing_names.csv"  # file holding the feature names
file_features_data = "cal_housing_data.csv"   # file holding the feature values
path1 = os.path.join(curr_dir, file_features_name)
path2 = os.path.join(curr_dir, file_features_data)

# The first column of the names file supplies the column labels
feature_names = pd.read_csv(path1).iloc[:, 0]

# Read the data file, labelling its columns with the names loaded above
calHousing_df = pd.read_csv(path2, names=feature_names)

In [3]:
# Derive ratio features from the existing columns.
# True division (`/`) is used on purpose: floor division (`//`) truncates every
# ratio to a whole number, and totalBedrooms // totalRooms floors to 0 whenever
# a row has fewer bedrooms than rooms, which throws the feature's signal away.
calHousing_df['roomsPerHousehold'] = calHousing_df['totalRooms'] / calHousing_df['households']
calHousing_df['bedRoomsPerroom'] = calHousing_df['totalBedrooms'] / calHousing_df['totalRooms']
calHousing_df['PopulationPerHousehold'] = calHousing_df['population'] / calHousing_df['households']

In [4]:
# Separate the target column from the predictor columns
y = calHousing_df['medianHouseValue']
X = calHousing_df.drop(columns='medianHouseValue')

# Hold out 20% of the shuffled rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Partition the training columns by dtype: the listed int/float dtypes are
# treated as numerical, every other dtype as categorical
num_cols, cat_cols = [], []
for col in X_train.columns:
    if X_train[col].dtype in ['int32', 'int64', 'float32', 'float64']:
        num_cols.append(col)
    else:
        cat_cols.append(col)

In [None]:
# make classification transformer (optional)
class HouseholdClassTransformer(BaseEstimator, TransformerMixin):
    """Add a 'householdClass' label ('A', 'B' or 'C') derived from median income.

    `fit` learns two cut points from the training data:

    * ``medianHousingAge`` -- max(housingMedianAge) // 3
    * ``medIncome``        -- max(medianIncome) / 3

    `transform` then labels each row:

    * 'A' when medianIncome >= 2 * medIncome
    * 'B' when medianIncome >= medIncome
    * 'C' otherwise (this includes rows whose income is NaN)

    NOTE(review): the previous rule text also tested housingMedianAge, but each
    age test was OR-ed with the bare income test, so age never influenced the
    label. The simplified rule above is behaviorally identical; confirm whether
    age was meant to participate before changing it.
    """

    # Column names used when the input is a DataFrame
    _AGE_COL = 'housingMedianAge'
    _INCOME_COL = 'medianIncome'
    _CLASS_COL = 'householdClass'

    def __init__(self):
        # No hyper-parameters. (Was misspelled `_init`, so it was never a constructor.)
        pass

    def fit(self, X, y=None):
        """Learn the age and income cut points from X.

        Args:
            X: DataFrame or 2D array. A DataFrame that contains both
               'housingMedianAge' and 'medianIncome' is read by column name, so
               it agrees with `transform`; anything else is read by position
               (column 0 = age, column 1 = income).
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        # positional indices used for array-like input
        self.housingMedianAge_idx = 0
        self.medianIncome_idx = 1

        has_named_cols = (
            isinstance(X, pd.DataFrame)
            and self._AGE_COL in X.columns
            and self._INCOME_COL in X.columns
        )
        if has_named_cols:
            age = X[self._AGE_COL].to_numpy()
            income = X[self._INCOME_COL].to_numpy()
        else:
            values = X.to_numpy() if isinstance(X, pd.DataFrame) else np.asarray(X)
            age = values[:, self.housingMedianAge_idx]
            income = values[:, self.medianIncome_idx]

        # slicing values calculation
        self.medianHousingAge = age.max() // 3
        self.medIncome = income.max() / 3
        return self

    def transform(self, X):
        """Return X with a 'householdClass' column added (or overwritten).

        Args:
            X: DataFrame containing 'medianIncome', or a 2D array with exactly
               two columns ordered as [housingMedianAge, medianIncome].

        Returns:
            A copy of the DataFrame with the label column when X is a DataFrame;
            otherwise the values of the labelled frame as a numpy array.
        """
        is_dataframe = isinstance(X, pd.DataFrame)
        if is_dataframe:
            X_copy = X.copy()  # copy data to avoid modifying the caller's frame
        else:
            X_copy = pd.DataFrame(X, columns=[self._AGE_COL, self._INCOME_COL])

        # Vectorized labelling; the first matching condition wins and rows
        # matching neither condition (including NaN incomes) fall to 'C'.
        income = X_copy[self._INCOME_COL].to_numpy()
        labels = np.select(
            [income >= self.medIncome * 2, income >= self.medIncome],
            ['A', 'B'],
            default='C',
        )

        # Plain assignment both creates the column and overwrites an existing one
        X_copy[self._CLASS_COL] = labels

        # return dataframe or numpy, mirroring the input type
        return X_copy if is_dataframe else X_copy.values

In [5]:
# Preprocessing pipelines

# Numerical branch: pick the numerical columns, fill missing values with the
# column median, then standardize
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_cols)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: pick the categorical columns, then ordinal-encode them.
# Optional steps (enable them to classify new instances):
#   ('selector1', DataFrameSelector(['housingMedianAge', 'medianIncome', 'householdClass'])),
#   ('householdClassifier', HouseholdClassTransformer()),
cat_pipeline = Pipeline([
    ('selector2', DataFrameSelector(cat_cols)),
    ('encoder', OrdinalEncoder()),
])

# Run both branches and concatenate their outputs
total_pipeline = FeatureUnion([
    ('num', num_pipeline),
    ('cat', cat_pipeline),
])

# Fit on the training features and transform them in one pass
X_train_transformed = total_pipeline.fit_transform(X_train)

In [6]:
def preprocessing_new(X_new):
    ''' Apply the already-fitted preprocessing pipeline to new data so it is
    ready to pass to the model.

    The pipeline is only applied here (transform); it is NOT re-fitted, so the
    imputation, scaling and encoding learned from the training data are reused.

    Args:
    *****
    (X_new: DataFrame or 2D array) ---> New samples with the same feature columns
        as X_train: the raw features without the target medianHouseValue, plus the
        derived columns roomsPerHousehold, bedRoomsPerroom and PopulationPerHousehold.
        A 2D array must follow the column order of X_train.columns; it is wrapped
        in a DataFrame because the pipeline's selectors pick columns by name.

    Returns:
    ********
    Array of processed data which are ready to make inference by the model
    '''
    if not isinstance(X_new, pd.DataFrame):
        # Label array input with the training column names
        X_new = pd.DataFrame(X_new, columns=X_train.columns)

    return total_pipeline.transform(X_new)