<a href="https://colab.research.google.com/github/AzadMehedi/Data-Pipeline/blob/main/Scikit_Learn_Model_Pipeline_using_California_housing_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# load packages

In [7]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# load dataset (California Housing dataset)

In [8]:
df = pd.read_csv('/content/housing.csv')
df.shape

(20640, 10)

In [9]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# divide the dataset into train dataset & test dataset using train test split

In [5]:
# from sklearn.model_selection import train_test_split

# df = df.sample(frac=1).reset_index(drop=True)
# train_df, test_df = train_test_split(df, test_size = 0.15, random_state=42)

In [6]:
train_df.shape, test_df.shape

NameError: ignored

# spliting the train dataset and test dataset into X_train, y_train, X_test, y_test

In [None]:
# X_train, y_train = train_df.to_numpy()[:, :-1], train_df.to_numpy()[:, :-1]
# X_test, y_test = test_df.to_numpy()[:, :-1], test_df.to_numpy()[:, :-1]

# X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Another easy way to spliting the dataset into X_train, y_train, X_test, y_test

In [11]:
from sklearn.model_selection import train_test_split

# shuffle the dataset using the sample method
df = df.sample(frac=1).reset_index(drop=True)

# split the dataset into features (X) and target variable (y)
X = df.iloc[:, :-1].values  # convert pandas dataframe to numpy array to the futher use of Scaling techniuqe
y = df.iloc[:, -1].values   # convert pandas dataframe to numpy array to the futher use of Scaling techniuqe

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# checking the shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17544, 9), (3096, 9), (17544,), (3096,))

In [12]:
from sklearn.preprocessing import OneHotEncoder

# create an instance of OneHotEncoder and fit it on the training data
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(X_train)

# transform both the training and testing data using the fitted encoder
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# fit and print the model using the encoded data
# fit_and_print(p1, X_train=X_train_encoded, X_test=X_test_encoded)

# Applying Preprocessing technique (StandardScaler and MinMaxScaler) 

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
# FunctionTransformer is utility class that allows you to apply a custom function to transform your data.
from copy import deepcopy
# in ML, deepcopy used to create a copy of a dataset or a model object that is completely independent from the original object. 

### Now we will apply StandardScaler to first 2 columns
### and apply MinMaxScaler all the ccolumns except the first 2

In [14]:
std_scaler = StandardScaler().fit(X_train[:, :2])
min_max_scaler = MinMaxScaler().fit(X_train[:, 2:])

In [24]:
# The FunctionTransformer class in Scikit-learn is a utility class 
# that allows you to apply a custom function to transform your data

def preprocessor(X):
  A = np.copy(X)  # copying the main data
  A[:, :2] = std_scaler.transform(X[:, :2])
  A[:, 2:] = min_max_scaler.transform(X[:, 2:])
  return A

In [16]:
preprocessor(X_test)  # modified the copied version of X_test as we wanted

array([[-0.00755002, -0.00755002,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00755002, -0.00755002,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00755002, -0.00755002,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.00755002, -0.00755002,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00755002, -0.00755002,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00755002, -0.00755002,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [17]:
# checking X_test is being modified or not : Not modified as we wanted
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
preprocess_transformer = FunctionTransformer(preprocessor)
preprocess_transformer

# import pipeline and linear model

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer   # to handle missing values. this is specificly used for numerical variable.
# from sklearn.preprocessing import OneHotEncoder

In [20]:
# for numerical processing pipeline : filling missing values because Linear Regression doesn't allow missing values
numeric_processor = Pipeline(
    steps = [('imputation_mean',SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scaler',StandardScaler())])
numeric_processor

numeric_processor

In [21]:
# for categorical processing pipeline
categorical_processor = Pipeline(
  steps = [('imputation_mean',SimpleImputer(missing_values='missing', strategy='contant')),
          ('scaler',StandardScaler())])
          # ('onehot',OneHotEncoder(handle_unknown='ignore'))])
categorical_processor

In [22]:
# creating pipeline
p1 = Pipeline([('Scaler',preprocess_transformer),
               ('numeric_processor',SimpleImputer()),
               ('categorical_processor',SimpleImputer()),
               ('Linear Regression', LinearRegression())])
p1

# Find out the error using mean_abslute_error

In [23]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
  p.fit(X_train, y_train)
  train_prediction = p.predict(X_train)
  test_prediction = p.predict(X_test)

  print('Training Error: ' + str(mean_absolute_error(train_prediction, y_train)))
  print('Testing Error: ' + str(mean_absolute_error(test_prediction, y_train)))

fit_and_print(p1)

ValueError: ignored