In [2]:
import os
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import  OneHotEncoder
from sklearn.compose import  ColumnTransformer

In [3]:
os.chdir('C:\\Users\\Aminou Ahmad\\Aminu Ahmadh\\Datasets\\home-data-for-ml-course')
data = pd.read_csv('train.csv')

In [4]:
y = data.SalePrice
X = data.drop('SalePrice', axis=1)

In [5]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [6]:
cat_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 5]
num_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]
cols = cat_cols + num_cols

In [7]:
X_train = X_train_full[cols]
X_valid = X_valid_full[cols]

In [8]:
cat_trans = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_trans = SimpleImputer(strategy='median')

In [9]:
preprocess = ColumnTransformer(transformers=[('num', num_trans, num_cols), ('cat', cat_trans, cat_cols)])

In [10]:
model = RandomForestRegressor(n_estimators=200, min_samples_split=5, random_state=0, n_jobs=-1)

In [11]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocess), ('model', model)])

In [12]:
my_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='median'),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'LowQualFinSF',
                                                   'GrLivArea', 'BsmtFullBath',
                                                   'BsmtHalfBath', 'Ful...
                                                

In [13]:
preds = my_pipeline.predict(X_valid)

In [14]:
print('mae: ', mean_absolute_error(y_valid, preds))

mae:  16897.886966322152


In [15]:
preds[:5]

array([188033.10499928,  70837.2458631 , 215816.09911869, 138093.71929834,
       243708.91118994])

In [16]:
y_valid[:5]

1207    200000
636      60000
2       223500
24      154000
1451    287090
Name: SalePrice, dtype: int64