In [14]:
# Import required libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# Read the training data and drop every record that has no SalePrice, since
# rows without a target cannot be used for supervised training. The drop is
# chained rather than done in place so re-running this cell always starts
# from the file contents.
data = pd.read_csv('train.csv').dropna(subset=['SalePrice'], axis=0)

# Target variable
y = data.SalePrice

# Candidate predictors are every column except the target. Without this
# exclusion the dtype-based selection below would pick up SalePrice itself
# whenever it is stored as int64/float64, leaking the answer into X.
feature_cols = [col for col in data.columns if col != 'SalePrice']

# Select categorical columns: object dtype with fewer than 15 distinct values
# (keeps the one-hot encoded width manageable).
object_cols = [col for col in feature_cols
               if data[col].dtype == 'object' and data[col].nunique() < 15]

# Select numerical columns
numerical_cols = [col for col in feature_cols
                  if data[col].dtype in ['int64', 'float64']]

# Feature matrix: numerical columns first, then the categorical ones
X = data[numerical_cols + object_cols]

In [17]:
# Creating Pipeline
# Preprocessing for numerical data: fill missing values with each column's
# most frequent value
numerical_prep = SimpleImputer(strategy='most_frequent')

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode. handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
categorical_prep = Pipeline(steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle together the preprocessing steps, routing each column list to its
# matching preprocessor
preprocessor = ColumnTransformer(transformers = [
    ('numerical', numerical_prep, numerical_cols),
    ('categorical', categorical_prep, object_cols)
])

# Select Random Forest as the model. random_state is fixed so that repeated
# runs of the notebook build the same forest and give reproducible results.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Full pipeline: preprocessing followed by the model, so the imputers and the
# encoder are fitted only on the data passed to fit()
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# NOTE(review): X_train and y_train are not defined in this cell or in any
# cell above it; they presumably come from the train/test split cell, which
# must be run before this line. Under "Restart Kernel -> Run All" this call
# raises NameError unless the split is moved above this cell -- TODO reorder.
clf.fit(X_train, y_train)

In [None]:
# Splitting train and test data