In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the full dataset from a CSV in the working directory. No index column is
# requested, so rows get pandas' default 0..n-1 integer index.
df = pd.read_csv('ames.csv')

In [11]:
# Split the already-loaded frame into a training part and a held-out part.
# The split is positional: the first TRAIN_ROWS rows train the model and the
# remainder is held out. The boundary lives in one constant so the two slices
# cannot drift apart.
TRAIN_ROWS = 2001
TARGET = 'Sale_Price'

train = df.iloc[:TRAIN_ROWS]

# Held-out rows: X_test still carries the target column, X_test2 is the
# features-only view that is passed to predict().
X_test = df.iloc[TRAIN_ROWS:].copy()
X_test2 = X_test.drop(columns=[TARGET])

# Training target and features. drop() returns a new frame, so `df` itself is
# never mutated and the cell can be re-run safely.
y = train[TARGET].copy()
X = train.drop(columns=[TARGET])

(2001, 80)

In [5]:
# Partition the feature columns by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical. A column with any other
# dtype lands in neither list.
column_dtypes = X.dtypes
categorical_cols = [name for name, dtype in column_dtypes.items() if dtype == 'object']
numerical_cols = [name for name, dtype in column_dtypes.items() if dtype in ('int64', 'float64')]

In [6]:
# Numerical features: fill missing values using SimpleImputer's default strategy.
numerical_transformer = SimpleImputer()

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Gradient-boosted regressor with a fixed seed.
model = XGBRegressor(random_state=0)

# Full pipeline. The step names are referenced by the grid-search parameter
# keys (e.g. 'model__n_estimators'), so they must stay as they are.
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model),
])

# Hyper-parameters to search; the 'model__' prefix targets the pipeline's
# 'model' step.
param_grid = {
    'model__n_estimators': [2000, 3000],
    'model__learning_rate': [0.01, 0.05],
    'model__min_child_weight': [0, 1],
}

# Exhaustive search over the grid with 10-fold cross-validation, scored by
# negated mean absolute error (so larger is better).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
)

# Kept as the last expression so the fitted search object is displayed.
grid.fit(X, y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(),
                                                                         ['Lot_Frontage',
                                                                          'Lot_Area',
                                                                          'Year_Built',
                                                                          'Year_Remod_Add',
                                                                          'Mas_Vnr_Area',
                                                                          'BsmtFin_SF_1',
                                                                          'BsmtFin_SF_2',
                                                                          'Bsmt_Unf_SF',
                                         

In [8]:
# Report the winning parameter combination and its cross-validated score.
# The scorer is a negated error, so flip the sign to show a positive value.
best_params = grid.best_params_
best_score = -grid.best_score_
print(f"Best model parameters: {best_params}")
print(f"Best score: {best_score}")

Best model parameters: {'model__learning_rate': 0.01, 'model__min_child_weight': 0, 'model__n_estimators': 3000}
Best score: 15335.329289606656


In [19]:
# Predict sale prices for the held-out rows (features only) using the fitted
# grid-search object. Nothing is written to disk in this cell.
predictions = grid.predict(X_test2)

In [20]:
# Build the submission table: each held-out row's index label serves as its
# Id, paired with the predicted sale price.
submission_columns = {'Id': X_test2.index, 'Sale_Price': predictions}
output = pd.DataFrame(submission_columns)


In [21]:
# Display the submission frame as the cell's output for a quick visual check.
output

Unnamed: 0,Id,Sale_Price
0,2001,178582.484375
1,2002,112390.351562
2,2003,88727.718750
3,2004,124044.031250
4,2005,105044.132812
...,...,...
924,2925,143160.765625
925,2926,138230.953125
926,2927,127251.203125
927,2928,170367.531250


In [22]:
# Write the submission table to disk; index=False keeps the DataFrame's row
# index out of the file, leaving only the Id and Sale_Price columns.
output.to_csv(path_or_buf='submission.csv', index=False)
