In [1]:
# Set up code checking
import os

# Expose the course dataset under the short ../input/ paths used by the
# rest of the notebook. Each link is guarded on its own, so a partially
# completed earlier setup (one link present, the other missing) neither
# raises FileExistsError nor leaves test.csv unlinked.
for csv_name in ("train.csv", "test.csv"):
    link_path = "../input/" + csv_name
    if not os.path.exists(link_path):
        os.symlink("../input/home-data-for-ml-course/" + csv_name, link_path)

# Bind learntools to this notebook's globals and pull in the exercise 6 names.
from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex6 import *
print("Setup Complete")

Setup Complete


I kept the preprocessing from the exercise unchanged.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the competition data, indexed by house Id.
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Rows without a target cannot be used for training; afterwards separate
# the target column from the feature columns.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# 80/20 train/validation split with a fixed seed.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Feature selection: object columns with fewer than 10 distinct values in
# the training split, plus every int64/float64 column.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object"
    and X_train_full[cname].nunique() < 10
]
numeric_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]
my_cols = low_cardinality_cols + numeric_cols

# One-hot encode the selected columns of each split.
X_train = pd.get_dummies(X_train_full[my_cols])
X_valid = pd.get_dummies(X_valid_full[my_cols])
X_test = pd.get_dummies(X_test_full[my_cols])

# Align validation and test to the training columns (join='left' keeps
# exactly the columns of X_train, in X_train's order).
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)


In [3]:
# Sanity check: (rows, columns) of the one-hot encoded training features.
print(X_train.shape)

(1168, 227)


Exploratory first model. It is cool to see the difference between the defaults and the result after hyperparameter tuning.

In [4]:
from xgboost import XGBRegressor

# Baseline model: XGBRegressor with only the random seed set explicitly;
# every other hyperparameter is left at the library default.
my_model_1 = XGBRegressor(random_state = 0)

# Fit on the encoded training split. The call is the last expression of
# the cell, so the fitted estimator is displayed as the cell output.
my_model_1.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [5]:
from sklearn.metrics import mean_absolute_error

# Score the held-out validation split with the baseline XGBoost model.
predictions_1 = my_model_1.predict(X_valid)

In [6]:
# Calculate MAE of the baseline model on the validation split.
# scikit-learn's signature is mean_absolute_error(y_true, y_pred), so the
# ground truth goes first. MAE is symmetric, so the value is unchanged, but
# the conventional order stays correct if the metric is ever swapped for an
# asymmetric one.
mae_1 = mean_absolute_error(y_valid, predictions_1)


print("Mean Absolute Error:" , mae_1)

Mean Absolute Error: 17662.736729452055


Check for missing values

In [7]:
# Number of missing values in each column of the training features.
X_train.isnull().sum()

MSSubClass                 0
LotFrontage              212
LotArea                    0
OverallQual                0
OverallCond                0
                        ... 
SaleCondition_AdjLand      0
SaleCondition_Alloca       0
SaleCondition_Family       0
SaleCondition_Normal       0
SaleCondition_Partial      0
Length: 227, dtype: int64

In [8]:
# Number of missing values in each column of the validation features.
X_valid.isnull().sum()

MSSubClass                0
LotFrontage              47
LotArea                   0
OverallQual               0
OverallCond               0
                         ..
SaleCondition_AdjLand     0
SaleCondition_Alloca      0
SaleCondition_Family      0
SaleCondition_Normal      0
SaleCondition_Partial     0
Length: 227, dtype: int64

In [9]:
# Number of missing values in each column of the test features.
X_test.isnull().sum()

MSSubClass                 0
LotFrontage              227
LotArea                    0
OverallQual                0
OverallCond                0
                        ... 
SaleCondition_AdjLand      0
SaleCondition_Alloca       0
SaleCondition_Family       0
SaleCondition_Normal       0
SaleCondition_Partial      0
Length: 227, dtype: int64

In [10]:
# Preview the first rows of the encoded training features.
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,0,1,0,0,0,0,0,0,0,1
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,0,0,0,1,0,0,0,0,1,0
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,0,0,0,1,0,0,0,0,1,0
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,0,0,0,1,0,0,0,0,1,0


Impute values to get rid of the missing entries: each missing value is replaced by the mean of the values we do have in that column.

In [11]:
from sklearn.impute import SimpleImputer

# Mean imputation: the imputer is fitted on the training split only and the
# same fitted imputer is then applied to the validation and test splits.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_X_train = imputer.fit_transform(X_train)
imputed_X_valid, imputed_X_test = (
    imputer.transform(frame) for frame in (X_valid, X_test)
)

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardize the imputed features: the scaler is fitted on the training
# split only and reused for validation and test.
# NOTE: the imputed_X_* names are rebound to their scaled versions here, so
# re-running this cell on its own would scale already-scaled data; re-run
# the imputation cell first.
sc = StandardScaler()
imputed_X_train = sc.fit_transform(imputed_X_train)
imputed_X_valid, imputed_X_test = (
    sc.transform(matrix) for matrix in (imputed_X_valid, imputed_X_test)
)

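Now we tune the models. We tried LightGBM, but XGBoost worked better. It took a lot of iterating to get here. The cells below build and train a Keras neural network on the imputed, standardized features.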

In [13]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Take the input width from the prepared training matrix instead of
# hard-coding 227: the column count depends on which one-hot columns the
# preprocessing produced, and a mismatch would only surface as a shape
# error at fit time.
n_features = imputed_X_train.shape[1]

# Fully connected regressor: three 1024-unit ReLU layers, each followed by
# 40% dropout, and a single linear output unit for the sale price.
model = keras.Sequential([
    layers.Dense(1024, activation='relu', input_shape=[n_features]),
    layers.Dropout(rate=0.4),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(rate=0.4),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(rate=0.4),
    layers.Dense(1),
])

# Stop once the monitored quantity has failed to improve by at least 0.001
# for 30 consecutive epochs, then roll back to the best weights seen.
early_stopping = EarlyStopping(min_delta=0.001,
                               patience=30,
                               restore_best_weights=True)


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

In [14]:
# Adam optimizer, trained directly on mean absolute error.
model.compile(optimizer='adam', loss='mae')

In [15]:
# Train for up to 1000 epochs in batches of 256, evaluating on the
# validation split after every epoch; early stopping ends the run sooner.
# NOTE: despite its name, `prediction` receives the return value of
# model.fit(), which is a Keras History object (per-epoch losses), not
# model predictions.
prediction = model.fit(
    x=imputed_X_train,
    y=y_train,
    batch_size=256,
    epochs=1000,
    verbose=2,
    callbacks=[early_stopping],
    validation_data=(imputed_X_valid, y_valid),
)

2021-10-27 23:41:07.415039: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/1000
5/5 - 1s - loss: 180790.0781 - val_loss: 181282.0469
Epoch 2/1000
5/5 - 0s - loss: 180624.0781 - val_loss: 180901.7812
Epoch 3/1000
5/5 - 0s - loss: 180057.3750 - val_loss: 179888.4844
Epoch 4/1000
5/5 - 0s - loss: 178657.7812 - val_loss: 177653.4531
Epoch 5/1000
5/5 - 0s - loss: 175642.5625 - val_loss: 173183.4531
Epoch 6/1000
5/5 - 0s - loss: 169764.8750 - val_loss: 164852.5000
Epoch 7/1000
5/5 - 0s - loss: 158986.3594 - val_loss: 150163.5469
Epoch 8/1000
5/5 - 0s - loss: 140735.6094 - val_loss: 126070.6719
Epoch 9/1000
5/5 - 0s - loss: 113221.4375 - val_loss: 93944.0859
Epoch 10/1000
5/5 - 0s - loss: 78781.5156 - val_loss: 61580.3164
Epoch 11/1000
5/5 - 0s - loss: 50663.5742 - val_loss: 55046.4258
Epoch 12/1000
5/5 - 0s - loss: 49924.6172 - val_loss: 46786.4258
Epoch 13/1000
5/5 - 0s - loss: 37340.0547 - val_loss: 39629.6953
Epoch 14/1000
5/5 - 0s - loss: 32902.6562 - val_loss: 40960.8281
Epoch 15/1000
5/5 - 0s - loss: 30225.4727 - val_loss: 34502.5625
Epoch 16/1000
5/5

In [16]:
# `prediction` is the History object returned by model.fit(), not house
# prices, so writing it to the submission would fill the SalePrice column
# with a repeated Python object. Score the prepared test features with the
# trained network instead. predict() returns one column per output unit,
# so flatten the (n_rows, 1) result into a 1-D array for the DataFrame.
test_predictions = model.predict(imputed_X_test).flatten()

# Kaggle submission file: one row per test house with its predicted price.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': test_predictions})
output.to_csv('submission.csv', index=False)

It has been a huge learning experience for me; I have been at it for several days. I hope you like it and get something out of it.