# Automating the pre-processing of raw data with PIPELINES

During this project we shall look at the possible ways of processing raw data with a function and a pipeline.

Below are the tasks that we shall automate either through a function or pipeline

* Checking for missing numerical values and imputing them based on a given strategy

* Checking for missing categorical values and imputing them based on a given strategy 

* Harmonizing the scale of the numerical values through standard scaling

* Handling the categorical values with OneHot and Ordinal encoders 

Note : we should confirm that all the features are of the right data types 


##### Importing the required libraries for the project

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder,OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer,ColumnTransformer
from sklearn import set_config
from sklearn.pipeline import Pipeline

In [2]:
# this will enable us to view the pipelines in a diagram view
set_config(display="diagram")

##### 

##### Read the csv file and droping the features that will not relevant; 
* those with high cardinality (categorical features unique values above 35)
* in this case we shall not need postalcode  and year built features

In [3]:
def wrangle_fun(filepath):
    
    df = pd.read_csv(filepath)
    
    col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index
    
    df.drop(columns=col_drop,inplace=True)
    
    df.drop(columns=["Postcode","YearBuilt"],inplace=True)
    
    return df

processed_data = wrangle_fun("melb_data.csv")

processed_data.head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2,h,1480000.0,S,2.5,2.0,1.0,1.0,202.0,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,h,1035000.0,S,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3,h,1465000.0,SP,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


#####

##### the function capability is increased by adding the requirement to check for missing numeric values and imputing them with the mean value

In [4]:
def wrangle_fun(filepath):
    
    df = pd.read_csv(filepath)
    
    col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index
    
    df.drop(columns=col_drop,inplace=True)
    
    df.drop(columns=["Postcode","YearBuilt"],inplace=True)
    
    ## checking for missing values for the numerical features    
    num_nan = df.select_dtypes([int,float]).isnull().sum()[df.select_dtypes([int,float]).isnull().sum()>0].index

    num_impute = SimpleImputer(missing_values=np.nan, strategy="mean")

    for col in num_nan:

        df[col] = num_impute.fit_transform(df[[col]])
        
    return df

processed_data = wrangle_fun("melb_data.csv")

processed_data.head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2,h,1480000.0,S,2.5,2.0,1.0,1.0,202.0,151.96765,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,h,1035000.0,S,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3,h,1465000.0,SP,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


#####

##### the function capability is increased by adding the requirement to check for missing categorical values and imputing them with a constant "others" 

In [5]:
def wrangle_fun(filepath):
    
    df = pd.read_csv(filepath)
    
    col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index
    
    df.drop(columns=col_drop,inplace=True)
    
    df.drop(columns=["Postcode","YearBuilt"],inplace=True)
    
    ## checking for missing values for the numerical features    
    num_nan = df.select_dtypes([int,float]).isnull().sum()[df.select_dtypes([int,float]).isnull().sum()>0].index

    num_impute = SimpleImputer(missing_values=np.nan, strategy="mean")

    for col in num_nan:

        df[col] = num_impute.fit_transform(df[[col]])
        
        
    ## ckecking in NaN vaues in the string features
    cat_nan = df.select_dtypes(object).isnull().sum()[df.select_dtypes(object).isnull().sum()>0].index

    cat_impute = SimpleImputer(fill_value="others",strategy="constant")

    for col in cat_nan:    

        df[col] = cat_impute.fit_transform(df[[col]])
        
    return df

processed_data = wrangle_fun("melb_data.csv")

processed_data.head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2,h,1480000.0,S,2.5,2.0,1.0,1.0,202.0,151.96765,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,h,1035000.0,S,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3,h,1465000.0,SP,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


#####

##### the function capability is increased by adding the requirement to scale the numerical features with Standard Scaler

In [6]:
def wrangle_fun(filepath):
    
    df = pd.read_csv(filepath)
    
    col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index
    
    df.drop(columns=col_drop,inplace=True)
    
    df.drop(columns=["Postcode","YearBuilt"],inplace=True)
    
    ## checking for missing values for the numerical features    
    num_nan = df.select_dtypes([int,float]).isnull().sum()[df.select_dtypes([int,float]).isnull().sum()>0].index

    num_impute = SimpleImputer(missing_values=np.nan, strategy="mean")

    for col in num_nan:

        df[col] = num_impute.fit_transform(df[[col]])
        
        
    ## ckecking in NaN vaues in the string features
    cat_nan = df.select_dtypes(object).isnull().sum()[df.select_dtypes(object).isnull().sum()>0].index

    cat_impute = SimpleImputer(fill_value="others",strategy="constant")

    for col in cat_nan:    

        df[col] = cat_impute.fit_transform(df[[col]])
        
    ## scaling the numerical features using the standard scaler
    scaler =  StandardScaler()

    num_cols = df.select_dtypes([int,float]).columns

    df[num_cols] = scaler.fit_transform(df[num_cols])
    
    return df

processed_data = wrangle_fun("melb_data.csv")

processed_data.head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,-0.981463,h,0.632448,S,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.0,Yarra,0.12116,0.03064,Northern Metropolitan,-0.784625
1,-0.981463,h,-0.06364,S,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,Yarra,0.016437,-0.017478,Northern Metropolitan,-0.784625
2,0.064876,h,0.608984,SP,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.00502,Yarra,-0.001227,-0.007855,Northern Metropolitan,-0.784625


#####

##### the function capability is increased by adding the requirement to encode the categorical features with OneHot and Ordinal Encoders

In [7]:
def wrangle_fun(filepath):
    
    df = pd.read_csv(filepath)
    
    col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index
    
    df.drop(columns=col_drop,inplace=True)
    
    df.drop(columns=["Postcode","YearBuilt"],inplace=True)
    
    ## checking for missing values for the numerical features    
    num_nan = df.select_dtypes([int,float]).isnull().sum()[df.select_dtypes([int,float]).isnull().sum()>0].index

    num_impute = SimpleImputer(missing_values=np.nan, strategy="mean")

    for col in num_nan:

        df[col] = num_impute.fit_transform(df[[col]])
        
        
    ## ckecking in NaN vaues in the string features
    cat_nan = df.select_dtypes(object).isnull().sum()[df.select_dtypes(object).isnull().sum()>0].index

    cat_impute = SimpleImputer(fill_value="others",strategy="constant")

    for col in cat_nan:    

        df[col] = cat_impute.fit_transform(df[[col]])
        
    ## scaling the numerical features using the standard scaler
    scaler =  StandardScaler()

    num_cols = df.select_dtypes([int,float]).columns

    df[num_cols] = scaler.fit_transform(df[num_cols])
    
    #Onehot and ordinal encoding
    cat_one = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>5].index

    onehot = OneHotEncoder()

    for col in cat_one:

        df = pd.concat([df,onehot.fit_transform(df[col])],axis=1)

    df.drop(columns= cat_one,inplace=True)
    
    ordinal = OrdinalEncoder()
    
    cat_ord = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()<=5].index
    
    for col_1 in cat_ord:

        df[col_1] = ordinal.fit_transform(df[[col_1]])

    return df

processed_data = wrangle_fun("melb_data.csv")

processed_data.head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,...,CouncilArea_33,CouncilArea_34,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8
0,-0.981463,1,0.632448,1,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.0,...,0,0,1,0,0,0,0,0,0,0
1,-0.981463,1,-0.06364,1,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,...,0,0,1,0,0,0,0,0,0,0
2,0.064876,1,0.608984,2,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.00502,...,0,0,1,0,0,0,0,0,0,0


### Steps to build a pre-processing pipeline

* reading the csv file and dropping the irrelevant features

In [8]:
def read_csv(filepath):
    
        df = pd.read_csv(filepath)
    
        col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index

        df.drop(columns=col_drop,inplace=True)

        df.drop(columns=["Postcode","YearBuilt"],inplace=True)
        
        return df
    
data = read_csv("melb_data.csv")

data.head()

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2,h,1480000.0,S,2.5,2.0,1.0,1.0,202.0,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,h,1035000.0,S,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3,h,1465000.0,SP,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,3,h,850000.0,PI,2.5,3.0,2.0,1.0,94.0,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,4,h,1600000.0,VB,2.5,3.0,1.0,2.0,120.0,142.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


#####  


defining the columns that will used for our column transformer (Numerical and categorical Imputer)

In [9]:
num_nan = data.select_dtypes([int,float]).columns

cat_nan = data.select_dtypes(object).columns

cols = num_nan.append(cat_nan)

####  
#### Numerical Imputer Transformer

In [10]:
class Numerical_imputer(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X):
        
        num_impute = SimpleImputer(missing_values=np.nan, strategy="mean")
        
        for col in num_nan:
            
            X[col] = num_impute.fit_transform(X[[col]])
            
        return X

In [11]:
Numerical_imputer().fit_transform(data).head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2.0,h,1480000.0,S,2.5,2.0,1.0,1.0,202.0,151.96765,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2.0,h,1035000.0,S,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3.0,h,1465000.0,SP,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


####  
#### Categorical Imputer Transformer

In [12]:
class Categorical_impter(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X):
        
        cat_impute = SimpleImputer(strategy="constant",fill_value= "others")
 
        for col_c in cat_nan:    
        
            X[col_c] = cat_impute.fit_transform(X[[col_c]])
            
        return X       
        

In [13]:
Categorical_impter().fit_transform(data).head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2.0,h,1480000.0,S,2.5,2.0,1.0,1.0,202.0,151.96765,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2.0,h,1035000.0,S,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3.0,h,1465000.0,SP,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


####  
#### Imputer Column Transformer (for both numerical and categorical features at once)

In [14]:
Imputing_values = ColumnTransformer([("Numerical_imputer", Numerical_imputer(),num_nan),("Caterigorical_imputer", Categorical_impter(),cat_nan)])

Imputing_values

In [15]:
Imputing_values.fit_transform(data)

array([[2.0, 1480000.0, 2.5, ..., 'S', 'Yarra', 'Northern Metropolitan'],
       [2.0, 1035000.0, 2.5, ..., 'S', 'Yarra', 'Northern Metropolitan'],
       [3.0, 1465000.0, 2.5, ..., 'SP', 'Yarra', 'Northern Metropolitan'],
       ...,
       [3.0, 1170000.0, 6.8, ..., 'S', 'others', 'Western Metropolitan'],
       [4.0, 2500000.0, 6.8, ..., 'PI', 'others', 'Western Metropolitan'],
       [4.0, 1285000.0, 6.3, ..., 'SP', 'others', 'Western Metropolitan']],
      dtype=object)

####
##### this will convert the generated array to a dataframe as a result

In [16]:
class Convert_to_dataframe (BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X):
        
        return pd.DataFrame(X,columns=cols)

In [17]:
Convert_to_dataframe().fit_transform(Imputing_values.fit_transform(data)).head(3)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,Propertycount,Type,Method,CouncilArea,Regionname
0,2.0,1480000.0,2.5,2.0,1.0,1.0,202.0,151.96765,-37.7996,144.9984,4019.0,h,S,Yarra,Northern Metropolitan
1,2.0,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,-37.8079,144.9934,4019.0,h,S,Yarra,Northern Metropolitan
2,3.0,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,-37.8093,144.9944,4019.0,h,SP,Yarra,Northern Metropolitan


####  
#### A pipeline that imputes missing values and converts the results to a dataframe

In [18]:
final_imputer = Pipeline(steps =[("Imputing_values",Imputing_values),("Convert_to_dataframe",Convert_to_dataframe())])
    
final_imputer

In [19]:
final_imputer.fit_transform(data).head(3)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,Propertycount,Type,Method,CouncilArea,Regionname
0,2.0,1480000.0,2.5,2.0,1.0,1.0,202.0,151.96765,-37.7996,144.9984,4019.0,h,S,Yarra,Northern Metropolitan
1,2.0,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,-37.8079,144.9934,4019.0,h,S,Yarra,Northern Metropolitan
2,3.0,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,-37.8093,144.9944,4019.0,h,SP,Yarra,Northern Metropolitan


####
#### Modified StandardScale Transformer

In [20]:
class standard_scaling (BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X):
        
        scaler =  StandardScaler()

        X[num_nan] = scaler.fit_transform(X[num_nan])
        
        return X
     

####  
#### A pipeline that imputes missing values, converts the results to a dataframe and scales the numerical features

In [21]:
numeric_scaled = Pipeline(steps =[("Imputing_values",Imputing_values),("Convert_to_dataframe",Convert_to_dataframe()),
                                
                                ("standard_scaling",standard_scaling())])
    
numeric_scaled

In [22]:
numeric_scaled.fit_transform(data).head(3)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,Propertycount,Type,Method,CouncilArea,Regionname
0,-0.981463,0.632448,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.0,0.12116,0.03064,-0.784625,h,S,Yarra,Northern Metropolitan
1,-0.981463,-0.06364,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,0.016437,-0.017478,-0.784625,h,S,Yarra,Northern Metropolitan
2,0.064876,0.608984,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.00502,-0.001227,-0.007855,-0.784625,h,SP,Yarra,Northern Metropolitan


####  
#### Modified OneHot encoder Transformer

In [23]:
class OneHot_encoding (BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X):
        
        onehot = OneHotEncoder()
        
        cat_one = data.select_dtypes(object).nunique()[data.select_dtypes(object).nunique()>5].index
        
        for col in cat_one:
                
            X = pd.concat([X,onehot.fit_transform(X[col])],axis=1)
            
        X.drop(columns=cat_one,inplace=True)
            
        return X          

In [24]:
OneHot_encoding().fit_transform(numeric_scaled.fit_transform(data)).head(3)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,...,CouncilArea_33,CouncilArea_34,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8
0,-0.981463,0.632448,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.0,0.12116,0.03064,...,0,0,1,0,0,0,0,0,0,0
1,-0.981463,-0.06364,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,0.016437,-0.017478,...,0,0,1,0,0,0,0,0,0,0
2,0.064876,0.608984,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.00502,-0.001227,-0.007855,...,0,0,1,0,0,0,0,0,0,0


####  
#### A pipeline that imputes missing values, converts the results to a dataframe,scales the numerical features and applied OneHot Encoder

In [25]:
one_hot_added = Pipeline(steps =[("Imputing_values",Imputing_values),("Convert_to_dataframe",Convert_to_dataframe()),
                                
                                ("standard_scaling",standard_scaling()),("OneHot_encoding",OneHot_encoding())])
    
one_hot_added

In [26]:
one_hot_added.fit_transform(data).head(3)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,...,CouncilArea_33,CouncilArea_34,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8
0,-0.981463,0.632448,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.0,0.12116,0.03064,...,0,0,1,0,0,0,0,0,0,0
1,-0.981463,-0.06364,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,0.016437,-0.017478,...,0,0,1,0,0,0,0,0,0,0
2,0.064876,0.608984,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.00502,-0.001227,-0.007855,...,0,0,1,0,0,0,0,0,0,0


####  
#### Modified Ordinal encoder Transformer

In [27]:
class Ordinal_encoding (BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X):
        
        ordinal = OrdinalEncoder()
        
        cat_ord = data.select_dtypes(object).nunique()[data.select_dtypes(object).nunique()<=5].index
        
        for col in cat_ord:
                
            X[col] = ordinal.fit_transform(X[[col]])        
          
        return X  

In [28]:
Ordinal_encoding().fit_transform(data).head(3)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,2.0,1,1480000.0,1,2.5,2.0,1.0,1.0,202.0,151.96765,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2.0,1,1035000.0,1,2.5,2.0,1.0,0.0,156.0,79.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3.0,1,1465000.0,2,2.5,3.0,2.0,0.0,134.0,150.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


####  
#### A pipeline that imputes missing values, converts the results to a dataframe,scales the numerical features,

#### applied OneHot Encoder and ordinal encoder

In [29]:
def read_csv(filepath):
    
        df = pd.read_csv(filepath)
    
        col_drop = df.select_dtypes(object).nunique()[df.select_dtypes(object).nunique()>35].index

        df.drop(columns=col_drop,inplace=True)

        df.drop(columns=["Postcode","YearBuilt"],inplace=True)
        
        return df
    
data = read_csv("melb_data.csv")

data.head()

Pre_processing_pipeline = Pipeline(steps =[("Imputing_values",Imputing_values),("Convert_to_dataframe",Convert_to_dataframe()),
                                
                                ("standard_scaling",standard_scaling()),("OneHot_encoding",OneHot_encoding()),
                                 
                                 ("Ordinal_encoding",Ordinal_encoding())])
    
Pre_processing_pipeline

In [30]:
Pre_processing_pipeline.fit_transform(data)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,...,CouncilArea_33,CouncilArea_34,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8
0,-0.981463,0.632448,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.000000,0.121160,0.030640,...,0,0,1,0,0,0,0,0,0,0
1,-0.981463,-0.063640,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,0.016437,-0.017478,...,0,0,1,0,0,0,0,0,0,0
2,0.064876,0.608984,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.005020,-0.001227,-0.007855,...,0,0,1,0,0,0,0,0,0,0
3,0.064876,-0.353025,-1.301485,0.088284,0.673367,-0.635232,-0.116380,0.000000,0.155226,0.016204,...,0,0,1,0,0,0,0,0,0,0
4,1.111216,0.820157,-1.301485,0.088284,-0.772376,0.406003,-0.109864,-0.025428,0.025269,-0.010742,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,1.111216,0.264851,1.118210,1.123604,0.673367,0.406003,0.023452,0.000000,-1.216516,1.659041,...,0,0,0,0,0,0,1,0,0,0
13576,0.064876,-0.069897,-0.568761,0.088284,0.673367,0.406003,-0.056488,-0.048388,-0.631709,-1.118028,...,0,0,0,1,0,0,0,0,0,0
13577,0.064876,0.147533,-0.568761,0.088284,0.673367,2.488473,-0.030677,0.000000,-0.549318,-1.037767,...,0,0,0,1,0,0,0,0,0,0
13578,1.111216,2.227975,-0.568761,1.123604,-0.772376,3.529708,0.077079,0.012838,-0.629311,-0.983779,...,0,0,0,1,0,0,0,0,0,0
