In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import GradientBoostingRegressor
from itertools import product
from xgboost import XGBRegressor

sns.set(style="ticks", context="talk")
plt.style.use("dark_background")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
df_sample = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")

In [3]:
y = df_train["SalePrice"]
df_train = df_train.drop("SalePrice", axis=1)

In [4]:
def na_s(train, test):
    print(train.shape[0])
    for col in train.columns:
        n_nas = train[col].isna().sum()
        if n_nas > 0:
            print(f"{col}: {round(100 * n_nas / train.shape[0], 2)}%")
            if 100 * n_nas / train.shape[0] > 33.33:
                train = train.drop(col, axis = 1)
                test = test.drop(col, axis = 1)
                print("Dropped")
            print("______________________")
    return (train, test)

In [5]:
# remove col with more than 33.33% nan
(df_train, df_test) = na_s(df_train, df_test)

1460
LotFrontage: 17.74%
______________________
Alley: 93.77%
Dropped
______________________
MasVnrType: 59.73%
Dropped
______________________
MasVnrArea: 0.55%
______________________
BsmtQual: 2.53%
______________________
BsmtCond: 2.53%
______________________
BsmtExposure: 2.6%
______________________
BsmtFinType1: 2.53%
______________________
BsmtFinType2: 2.6%
______________________
Electrical: 0.07%
______________________
FireplaceQu: 47.26%
Dropped
______________________
GarageType: 5.55%
______________________
GarageYrBlt: 5.55%
______________________
GarageFinish: 5.55%
______________________
GarageQual: 5.55%
______________________
GarageCond: 5.55%
______________________
PoolQC: 99.52%
Dropped
______________________
Fence: 80.75%
Dropped
______________________
MiscFeature: 96.3%
Dropped
______________________


In [6]:
print("train", df_train.shape)
print("test", df_test.shape)

train (1460, 74)
test (1459, 74)


In [7]:
for col, typ in zip(df_train.columns, df_train.dtypes):
    print(col, ":", typ)

Id : int64
MSSubClass : int64
MSZoning : object
LotFrontage : float64
LotArea : int64
Street : object
LotShape : object
LandContour : object
Utilities : object
LotConfig : object
LandSlope : object
Neighborhood : object
Condition1 : object
Condition2 : object
BldgType : object
HouseStyle : object
OverallQual : int64
OverallCond : int64
YearBuilt : int64
YearRemodAdd : int64
RoofStyle : object
RoofMatl : object
Exterior1st : object
Exterior2nd : object
MasVnrArea : float64
ExterQual : object
ExterCond : object
Foundation : object
BsmtQual : object
BsmtCond : object
BsmtExposure : object
BsmtFinType1 : object
BsmtFinSF1 : int64
BsmtFinType2 : object
BsmtFinSF2 : int64
BsmtUnfSF : int64
TotalBsmtSF : int64
Heating : object
HeatingQC : object
CentralAir : object
Electrical : object
1stFlrSF : int64
2ndFlrSF : int64
LowQualFinSF : int64
GrLivArea : int64
BsmtFullBath : int64
BsmtHalfBath : int64
FullBath : int64
HalfBath : int64
BedroomAbvGr : int64
KitchenAbvGr : int64
KitchenQual : object

In [8]:
for col in df_train.columns:
    typ = df_train[col].dtypes
    if typ == "object":
        n_train = len(df_train[col].unique())
        n_test = len(df_test[col].unique())
        if n_train != n_test:
            print(col,", train: ",n_train,"; test:", n_test)
#         sns.boxplot(data=df_train, x=col, y="SalePrice")
#         plt.show()

MSZoning , train:  5 ; test: 6
Condition2 , train:  8 ; test: 5
HouseStyle , train:  8 ; test: 7
RoofMatl , train:  8 ; test: 4
Exterior1st , train:  15 ; test: 14
Heating , train:  6 ; test: 4
Electrical , train:  6 ; test: 4
KitchenQual , train:  4 ; test: 5
Functional , train:  7 ; test: 8
GarageQual , train:  6 ; test: 5
SaleType , train:  9 ; test: 10


In [9]:
df_train["data"] = "train"
df_test["data"] = "test"
df_tot = pd.concat([df_train, df_test])

X_tot = pd.get_dummies(df_tot)
X_tot.shape

(2919, 269)

In [10]:
X_tot.columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_Oth', 'SaleType_WD', 'SaleCondition_Abnorml',
       'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family',
       'SaleCondition_Normal', 'SaleCondition_Partial', 'data_test',
       'data_train'],
      dtype='object', length=269)

In [11]:
X_train = X_tot[X_tot["data_train"] == 1]
X_test = X_tot[X_tot["data_test"] == 1]
print(X_train.shape)
print(X_test.shape)

(1460, 269)
(1459, 269)


In [12]:
scaler = MinMaxScaler().fit(np.array(X_train))
X_train = scaler.transform(np.array(X_train))

imputer = KNNImputer(n_neighbors=5).fit(X_train)
X_train = imputer.transform(X_train)
print(X_train.shape)

(1460, 269)


In [13]:
X_test = scaler.transform(np.array(X_test))
X_test = imputer.transform(X_test)

In [14]:
model = GradientBoostingRegressor(learning_rate=0.1,
                                  n_estimators=100,
                                  subsample=1.0)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y, cv=5, scoring="r2")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.89 accuracy with a standard deviation of 0.02


In [15]:
# # n_estims = np.arange(60, 200, 20)
# n_estims = np.arange(5, 100, 5)
# # l_rates = np.array([0.5, .1, .075, .05, .025, .01, .0075, .005, .0025, .001])
# l_rates = np.arange(.05, .5, .05)

# combos = list(product(n_estims, l_rates))
# scorez = np.zeros(len(combos))

# for i in range(len(combos)):
    
#     l_rate = combos[i][1]
#     n_est = combos[i][0]
#     clf = GradientBoostingRegressor(learning_rate=l_rate, n_estimators=n_est)
#     scores = cross_val_score(clf, X_train, y, cv=10, scoring="r2")
#     scorez[i] = scores.mean()
#     print("%0.2f r2 with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#     print(f"l rate = {l_rate}; # estim = {n_est};")

# idx = np.where(scorez == max(scorez))[0].item()
# print(idx)
# f_combo = combos[idx]
# print(f_combo)

f_combo = (80, 0.15)
model = GradientBoostingRegressor(learning_rate=f_combo[1], n_estimators=f_combo[0])

In [16]:
model.fit(X_train, y)

In [19]:
my_model = XGBRegressor()
my_model.fit(X_train, y)

In [20]:
predictions = my_model.predict(X_test)
# predictions = model.predict(X_test)

predictions.shape

(1459,)

In [21]:
output = pd.DataFrame({'Id': df_test.Id, 'SalePrice': predictions})
output.to_csv('submission_prices.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
