#### These are the scripts for the Intermediate Machine Learning course from Kaggle.com
#### source code from https://www.kaggle.com/learn/intermediate-machine-learning
###### P.T.
###### Nov 2022

## 3. Pipelines

### 3.1 Load and prepare the data 

In [11]:
# Load Packages and get directory
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

# Remember the current working directory (the data file is read from it)
# and echo it as the cell output
directory_name = os.getcwd()
directory_name

'/Users/tangpeipei/Documents/Data Science Learning/Github/Machine_Learning_Tutorial/Kaggle_Learn/Intermedium Machine Learning'

In [12]:
# Read the Melbourne housing data from the working directory.
# os.path.join assembles the path with the platform's separator instead of
# hand-concatenating '/' onto the directory string.
DATA = pd.read_csv(os.path.join(directory_name, 'melb_data.csv'))

# Preview the first three rows
DATA.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


In [31]:
# Target is the sale price; predictors are every other column
y = DATA['Price']
X = DATA.drop(columns='Price')

# Keep 80% of the rows for training and the rest for validation
# (seed fixed so the split is repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    random_state=888,
)

In [38]:
# "Cardinality" means the number of unique values in a column
# Keep the object-dtype columns that have fewer than 10 distinct values
# (the threshold is convenient but arbitrary)
categorical_cols = [
    name
    for name, column in X_train.items()
    if column.dtype == 'object' and column.nunique() < 10
]
print('The data contains the following categorical variables: {}'.format(categorical_cols))

The data contains the following categorical variables: ['Type', 'Method', 'Regionname']


In [40]:
# Keep the columns stored as 64-bit integers or 64-bit floats
numerical_cols = [
    name
    for name, column in X_train.items()
    if column.dtype in ('int64', 'float64')
]
print('The data contains the following numeric variables: {}'.format(numerical_cols))

The data contains the following numeric variables: ['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']


In [41]:
# Restrict both splits to the selected columns: categorical first, then numerical
my_col = [*categorical_cols, *numerical_cols]
X_train = X_train.loc[:, my_col].copy()
X_valid = X_valid.loc[:, my_col].copy()

In [45]:
# Preview the first three rows of the training predictors after column selection
X_train.head(n=3)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
5818,u,S,Southern Metropolitan,2,6.1,3182.0,2.0,1.0,1.0,0.0,,,-37.8653,144.9824,13240.0
5239,h,S,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,2.0,602.0,162.0,1970.0,-37.7115,145.0009,21650.0
1356,u,S,Northern Metropolitan,2,5.2,3056.0,2.0,1.0,1.0,0.0,74.0,2004.0,-37.7703,144.9533,11918.0


### 3.2 Define the pipeline

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [73]:
# Numerical columns: replace missing entries with a constant.
# No fill_value is passed, so SimpleImputer uses its default constant
# (0 for numeric data according to the scikit-learn documentation).
numerical_transformer = SimpleImputer(
    strategy='constant',
)

In [74]:
# Categorical columns: fill gaps with each column's most frequent value,
# then one-hot encode (handle_unknown='ignore' so categories not seen
# during fitting do not raise at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [75]:
# Route each group of columns to its own transformer:
# 'num' handles numerical_cols, 'cat' handles categorical_cols
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

### 3.3 Modeling

In [76]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees; seed fixed so results are repeatable
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

In [77]:
# Chain the column preprocessing and the regressor into a single estimator
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [78]:
# One call fits the preprocessing steps on the training split and then trains the model
my_pipeline.fit(X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
 

In [79]:
# Run the validation split through the fitted pipeline to get price predictions
preds = my_pipeline.predict(X=X_valid)

In [80]:
# Evaluate the model
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the true validation prices and the predictions
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE:', score)

MAE: 159163.5040183393


## 4. Cross-Validation

In [86]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# NOTE: this cell rebinds X, y and my_pipeline, replacing the objects
# built in section 3.

# Predictors: a small hand-picked subset of columns
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = DATA.loc[:, cols_to_use]

# Target: the sale price
y = DATA['Price']

# Impute missing values with SimpleImputer's defaults, then fit a
# 50-tree random forest (seed fixed so results are repeatable)
my_pipeline = Pipeline([
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(random_state=0, n_estimators=50)),
])

In [87]:
from sklearn.model_selection import cross_val_score

# The 'neg_mean_absolute_error' scorer returns MAE with its sign flipped,
# so negate the result to get ordinary (positive) MAE, one value per fold
scores = -cross_val_score(
    my_pipeline, X, y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

print("MAE scores:\n", scores)

MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]
