In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data; the 'Id' column becomes the index of both frames.
X = pd.read_csv("train.csv", index_col='Id')
X_test = pd.read_csv("test.csv", index_col='Id')

# Remove rows with a missing target, then separate the target from the predictors.
# Each step rebinds X instead of using inplace=True, so every line is a plain
# expression whose result is visible in the assignment.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Drop every predictor column that contains at least one missing value.
# NOTE(review): X_test is not filtered here, so it is not aligned with X's
# columns after this step — select X.columns from it before predicting.
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X = X.drop(columns=cols_with_missing)

# Break off a validation set from the training data (80/20 split, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Args:
        X_train: Training predictors passed to ``model.fit``.
        X_valid: Validation predictors passed to ``model.predict``.
        y_train: Training target passed to ``model.fit``.
        y_valid: Validation target the predictions are compared against.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed handed to the forest. Defaults to 0.

    Returns:
        The mean absolute error between ``y_valid`` and the model's
        predictions for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [9]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# Categorical (object-dtype) columns in the training data.
object_col = [col for col in X_train.columns if X_train[col].dtype == 'object']

# A column is "good" when every value present in the validation split also
# appears in the training split; the encoder below is fitted on training data
# only and, with default settings, rejects categories it has not seen.
good_label_cols = [col for col in object_col
                   if set(X_valid[col]).issubset(set(X_train[col]))]

# Remaining categorical columns, kept in their original column order
# (a set difference would yield a hash-dependent order).
bad_label_cols = [col for col in object_col if col not in good_label_cols]

# Drop the categorical columns that will not be encoded.
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Fit the encoder on the training split, then reuse the fitted mapping for
# the validation split.
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

In [11]:
# Validation MAE for the ordinal-encoded feature set.
ordinal_mae = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print(ordinal_mae)

17098.01649543379
