In [None]:
# Setup
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('melb_data.csv')

# Select target
y = data.Price

# To keep things simple, we'll use only numerical predictors
melb_predictors = data.drop(['Price'], axis=1)
X = melb_predictors.select_dtypes(exclude=['object'])

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [None]:
# Function for comparing different approaches
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)



In [None]:
# Score from Approach 1 (Drop Columns with Missing Values)
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

In [None]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))


In [None]:
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))



In [None]:
# Use median as imputer
# Imputation
from pyexpat import model

final_imputer = SimpleImputer(strategy='median')
final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(final_imputer.transform(X_valid))

# Imputation removed column names; put them back
final_X_train.columns = X_train.columns
final_X_valid.columns = X_valid.columns

print("MAE from Approach 4 (Use median):")
print
(score_dataset(final_X_train, final_X_valid, y_train, y_valid))

# Preprocess test data
#final_X_test = pd.DataFrame(final_imputer.transform(X_test))

# Get test predictions
preds_test = model.predict(final_X_test)

In [None]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [None]:
#Categorical Variables
#Load Data and setup data for trainning and validation
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('melb_data.csv')

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()



In [5]:
#Take a look the data
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [4]:
#Get the list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print('Categorical variables:')
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


In [8]:
#Define function to measure quality of each approach
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

#Function for comparing different approaches
def score_dataset(X_train, X_valid,y_train,y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state = 0)
    model.fit(X_train,y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error (y_valid, preds)

In [9]:
#Approach 1 (Drop Categorical Variables)
drop_X_train = X_train.select_dtypes(exclude = ['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
print ('MAE from Approach 1 (Drop categorical variables):')
print(score_dataset(drop_X_train,drop_X_valid,y_train,y_valid))

MAE from Approach 1 (Drop categorical variables):
175707.61156991488


In [10]:
#Approcach 2 (Oridinal Encoding)
#SKlearn has OridinalEncoder that can be used to get oridnal encodings
from sklearn.preprocessing import OrdinalEncoder

#Make copy to avoid changing original data
label_X_train = X_train.copy()
lable_X_valid = X_valid.copy()

#Apply ordinal encoder to each column wih categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
lable_X_valid[object_cols] = ordinal_encoder.fit_transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train,lable_X_valid,y_train,y_valid))


MAE from Approach 2 (Ordinal Encoding):
165919.14549617787


In [None]:
#Approach 2 extension (Not in original guide)- Fitting an ordinal encoder to a column in the training data 
#creates a corresponding integer-valued label for each unique value that appears in the training data.
#To solove it we can to below

#Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtypes =='object']

#Columns that can be safely ordinal encoded
good_label_cols = [col for fol in object_cols if set (X_valid[col].issubset(set(X_train[col])))]

#Problematic columns that will be dropped from the dataset

bad_label_cols = list(set(object_cols)-set(good_label_cols))


In [17]:
#Approach 3 (One-Hot Encoding)
#Sklearn has OneHotEncoder to get one-hot encoding
from sklearn.preprocessing import OneHotEncoder

#Apply one-hot encoder to each column with categorical data
#We set handle_unknown='ignore' to avoid errors when the validation data contains classes that aren't represented in the training data, and
#setting sparse_output=False ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.fit_transform(X_valid[object_cols]))

#One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

#Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis = 1)
num_X_valid = X_valid.drop(object_cols, axis =1)

#Add one-hot encoded columns to numercial features
OH_X_train = pd.concat([num_X_train,OH_cols_train], axis = 1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis =1)

#Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print('MAE from Approach 3 (One-Hot Encoding):')
print(score_dataset(OH_X_train,OH_X_valid,y_train,y_valid))

MAE from Approach 3 (One-Hot Encoding):
166111.84115541063


In [None]:
#Approach 3 extension
#To find low cardinality colums for one-hot encoding
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

In [3]:
#Pipeline
#Load data and setup
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('melb_data.csv')

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [4]:
#Check the data
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [7]:
#Construct pipeline in 3 steps
#Step 1 Define Preprocessing steps
#imputes missing value in numerical data and imputes missing values and 
#applies a one-hot encoding to categorical data

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy = 'constant')

#preprocessing for categorical data
categorical_transformer = Pipeline(steps=['imputer',SimpleImputer(strategy='most_frequent'),('onehot',OneHotEncoder(handle_unknown = 'ignore'))])

#Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[('num',numerical_transformer, numerical_cols),('cat',categorical_transformer,categorical_cols)])

In [8]:
#Step 2 Define the model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor (n_estimators=100, random_state=0)

In [12]:
#Step3 Create and Evaluate the Pipline
#Bundle preprocessing and modeling code in a pipline
my_pipeline = Pipeline(steps=[('preprocessor',preprocessor),('model',model)])

#Preprocessing of traning data, fit model
my_pipeline.fit(X_train, y_train)

#Preprocessing of validiation dta, get prediction
preds = my_pipeline.predict(X_valid)

#Evalueate the model

score = mean_absolute_error(y_valid, preds)

print('MAE', score)

ValueError: too many values to unpack (expected 2)