# Load data

In [11]:
import pandas as pd # to load and manipulate data
from IPython.display import display # better than print() :)
from sklearn.compose import ColumnTransformer # to apply changes in different columns
from sklearn.pipeline import Pipeline # to build a pipeline and bundle preprocessing and modeling
from sklearn.impute import SimpleImputer # for Imputing missing values
from sklearn.preprocessing import OneHotEncoder # for one-hot encoding the categorical columns
from sklearn.ensemble import RandomForestRegressor # the algorithm that we'll use
from sklearn.metrics import mean_absolute_error # the metric that we'll use



from sklearn.model_selection import train_test_split # to hold out a validation set

# Read the raw table; the path is relative to the notebook's working directory.
data = pd.read_csv(r'data/melb_data.csv')

# Separate the target (Price) from the predictors.
y = data.Price
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation; random_state pins the shuffle so
# re-running the cell reproduces the same split.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=.8, test_size=.2, random_state=0
)

# Column lists reused by the ColumnTransformer further down.
# Categorical: object-dtype columns with fewer than 10 distinct values in the
# training split (presumably to keep the one-hot encoding small).
# The dtype test comes first so nunique() is only computed for object columns.
cat_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == 'object' and X_train_full[cname].nunique() < 10
]
# Numerical: selected through the public select_dtypes API instead of the
# private DataFrame._get_numeric_data() helper, which may change without notice.
num_cols = list(X_train_full.select_dtypes(include='number').columns)
my_cols = cat_cols + num_cols

# Keep only the selected columns (categorical first, then numerical) in both
# splits. Object columns with 10 or more distinct values are dropped here.
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Set up transformers

In [12]:
# NUMERICAL columns: fill missing entries with the column mean.
# Wrapped in a one-step Pipeline so it has the same shape as the categorical transformer.
numerical_transformer_imputator = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
])

# CATEGORICAL columns need two steps, which is why a Pipeline is used:
#   1. imputer - replace missing entries with the most frequent value of the column
#   2. onehot  - one-hot encode; categories not seen during fit are ignored rather than raising
categorical_transformer_inputator_encoder = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


# Bundle and connect the transformers

In [13]:
# ColumnTransformer routes each group of columns to its own transformer:
# imputation for num_cols, imputation + one-hot encoding for cat_cols.
# NOTE(review): 'numumerical columns' looks like a typo for 'numerical columns';
# the name is a runtime key of the transformer, so it is left unchanged here.
preprocessor = ColumnTransformer([
    ('numumerical columns', numerical_transformer_imputator, num_cols),
    ('categorical columns', categorical_transformer_inputator_encoder, cat_cols),
])

## Preprocessing is set up; now define the model

In [14]:
# DEFINE: a random forest with 100 trees; random_state fixes the seed so results are repeatable
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)


# Build the pipeline

In [15]:
# End-to-end pipeline used for fit, predict and evaluation.
# It chains the preprocessor defined earlier with the model, so a single
# fit/predict call applies preprocessing and modeling together.
my_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("modeling", model),
])



# FIT

In [16]:
# FIT: learn the preprocessing statistics and train the model on the training split
my_pipeline.fit(X=X_train, y=y_train)

# PREDICT

In [7]:
# PREDICT: the validation rows go through the fitted preprocessing, then the model
preds = my_pipeline.predict(X=X_valid)

# EVALUATE

In [8]:
# EVALUATE: mean absolute error on the validation split, in the same units as Price
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
mae

160814.63047680413