# Cross-Validation

**Source (all credits to):** *Kaggle: Intermediate Machine Learning Course*

# 1. Import Libraries

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold # to use shuffle=True

# 2. Prepare Data

In [17]:
# Predictor columns fed to the model
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']

# Read the full dataset from the CSV file
data = pd.read_csv('../datasets/melb_data.csv')

# Feature matrix: only the chosen predictor columns
X = data.loc[:, cols_to_use]

# Target vector: the Price column
y = data['Price']

# 3. Create Pipeline

In [18]:
# Build each stage separately, then chain them: the imputer runs first as the
# 'preprocessor' step and its output feeds the random forest 'model' step.
imputer = SimpleImputer()
forest = RandomForestRegressor(n_estimators=50, random_state=0)

my_pipeline = Pipeline(steps=[('preprocessor', imputer), ('model', forest)])

# 4. Do Cross-Validation

In [19]:
# Shuffle the rows before splitting into 5 folds. A fixed random_state makes
# the fold assignment -- and therefore the MAE scores below -- reproducible
# on every Restart & Run All (it matches the seed used for the model).
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Multiply by -1 since sklearn calculates *negative* MAE for the
# 'neg_mean_absolute_error' scorer; this gives one positive MAE per fold.
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=cv,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

MAE scores:
 [251753.07757452 241448.77349907 252381.46916321 250943.03920666
 254093.52724504]


# 5. Get Average MAE

In [20]:
# Report the mean of the per-fold MAE scores, label and value on separate lines
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

Average MAE score (across experiments):
250123.97733770107
