# Importing libraries.

In [1]:
import pandas as pd, numpy as np, xgboost as xgb, seaborn as sns, os

from scipy.stats import iqr

# Acquiring the data.

In [2]:
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

df_sale_price = df_train["SalePrice"]

df_all = pd.concat([df_train, df_test])
df_all.drop("SalePrice", axis = 1, inplace = True)

# Exploratory data analysis (EDA).

In [3]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [4]:
df_all.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,2919.0,2919.0,2433.0,2919.0,2919.0,2919.0,2919.0,2919.0,2896.0,2918.0,...,2918.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,1460.0,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,...,472.874572,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737
std,842.787043,42.517628,23.344905,7886.996359,1.409947,1.113131,30.291442,20.894344,179.334253,455.610826,...,215.394815,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,730.5,20.0,59.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,...,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,1460.0,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.5,...,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2189.5,70.0,80.0,11570.0,7.0,6.0,2001.0,2004.0,164.0,733.0,...,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0


# Feature preprocessing.

## Handling NaN values.

### Identifying NaN values in the whole set.

In [5]:
nan_values = df_all.isna().sum()
nan_values[nan_values > 0].sort_values(ascending = False) / len(df_all) * 100

PoolQC          99.657417
MiscFeature     96.402878
Alley           93.216855
Fence           80.438506
FireplaceQu     48.646797
LotFrontage     16.649538
GarageFinish     5.447071
GarageQual       5.447071
GarageCond       5.447071
GarageYrBlt      5.447071
GarageType       5.378554
BsmtExposure     2.809181
BsmtCond         2.809181
BsmtQual         2.774923
BsmtFinType2     2.740665
BsmtFinType1     2.706406
MasVnrType       0.822199
MasVnrArea       0.787941
MSZoning         0.137033
BsmtFullBath     0.068517
BsmtHalfBath     0.068517
Functional       0.068517
Utilities        0.068517
GarageArea       0.034258
GarageCars       0.034258
Electrical       0.034258
KitchenQual      0.034258
TotalBsmtSF      0.034258
BsmtUnfSF        0.034258
BsmtFinSF2       0.034258
BsmtFinSF1       0.034258
Exterior2nd      0.034258
Exterior1st      0.034258
SaleType         0.034258
dtype: float64

In [6]:
df_all.drop(["Id", "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"], axis = 1, inplace = True)

### Filling Nan values in qualitative features.

In [7]:
qualitative_nan_features = (df_all.select_dtypes(include = "object").isna().sum() > 0).index

for feature in qualitative_nan_features:
    df_all[feature].fillna(method = "ffill", inplace = True)

### Filling Nan values in quantitative features.

In [8]:
quantitative_nan_features = (df_all.select_dtypes(exclude = "object").isna().sum() > 0).index

for feature in quantitative_nan_features:
    df_all[feature].fillna(method = "ffill", inplace = True)

## Handling outliers.

### Identifying outliers.

In [9]:
df_quantitative_features = df_all.select_dtypes(exclude = "object")
df_qualitative_features = df_all.select_dtypes(include = "object")

In [10]:
def setting_fences(feature):
    q1, q3 = feature.quantile([.25, .75])
    i_q_r = iqr(feature, nan_policy = "raise")
    lower_fence = q1 - (1.5 * i_q_r)
    upper_fence = q3 + (1.5 * i_q_r)
    return lower_fence, upper_fence

In [11]:
df_fences = df_quantitative_features.apply(lambda x: setting_fences(x))

df_fences

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,-55.0,27.5,1340.0,2.0,3.5,1882.25,1906.5,-248.25,-1099.5,0.0,...,-64.0,-252.0,-105.0,0.0,0.0,0.0,0.0,0.0,-2.0,2004.0
1,145.0,111.5,17708.0,10.0,7.5,2072.25,2062.5,413.75,1832.5,0.0,...,960.0,420.0,175.0,0.0,0.0,0.0,0.0,0.0,14.0,2012.0


## Normalizing quantitative values (MinMaxNormalizer).

In [12]:
df_quantitative_features = (df_quantitative_features - df_quantitative_features.min()) / (df_quantitative_features.max() - df_quantitative_features.min())

df_quantitative_features

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.235294,0.150685,0.033420,0.666667,0.500,0.949275,0.883333,0.12250,0.125089,0.0,...,0.368280,0.000000,0.082210,0.000000,0.0,0.0,0.0,0.000000,0.090909,0.50
1,0.000000,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.00000,0.173281,0.0,...,0.309140,0.209270,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.363636,0.25
2,0.235294,0.160959,0.046507,0.666667,0.500,0.934783,0.866667,0.10125,0.086109,0.0,...,0.408602,0.000000,0.056604,0.000000,0.0,0.0,0.0,0.000000,0.727273,0.50
3,0.294118,0.133562,0.038561,0.666667,0.500,0.311594,0.333333,0.00000,0.038271,0.0,...,0.431452,0.000000,0.047170,0.268775,0.0,0.0,0.0,0.000000,0.090909,0.00
4,0.235294,0.215753,0.060576,0.777778,0.500,0.927536,0.833333,0.21875,0.116052,0.0,...,0.561828,0.134831,0.113208,0.000000,0.0,0.0,0.0,0.000000,1.000000,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.000000,0.002973,0.333333,0.750,0.710145,0.333333,0.00000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.454545,0.00
1455,0.823529,0.000000,0.002776,0.333333,0.500,0.710145,0.333333,0.00000,0.044649,0.0,...,0.192204,0.000000,0.032345,0.000000,0.0,0.0,0.0,0.000000,0.272727,0.00
1456,0.000000,0.476027,0.087406,0.444444,0.750,0.637681,0.766667,0.00000,0.216867,0.0,...,0.387097,0.332865,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.727273,0.00
1457,0.382353,0.140411,0.042726,0.444444,0.500,0.869565,0.700000,0.00000,0.059709,0.0,...,0.000000,0.056180,0.043127,0.000000,0.0,0.0,0.0,0.041176,0.545455,0.00


## Encoding qualitative featues.

In [13]:
df_qualitative_features = pd.get_dummies(df_qualitative_features, drop_first = True)

df_qualitative_features

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1455,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1456,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1457,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


In [14]:
df_all = pd.concat([df_quantitative_features, df_qualitative_features], axis = 1)

df_all

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.235294,0.150685,0.033420,0.666667,0.500,0.949275,0.883333,0.12250,0.125089,0.0,...,0,0,0,0,1,0,0,0,1,0
1,0.000000,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.00000,0.173281,0.0,...,0,0,0,0,1,0,0,0,1,0
2,0.235294,0.160959,0.046507,0.666667,0.500,0.934783,0.866667,0.10125,0.086109,0.0,...,0,0,0,0,1,0,0,0,1,0
3,0.294118,0.133562,0.038561,0.666667,0.500,0.311594,0.333333,0.00000,0.038271,0.0,...,0,0,0,0,1,0,0,0,0,0
4,0.235294,0.215753,0.060576,0.777778,0.500,0.927536,0.833333,0.21875,0.116052,0.0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.000000,0.002973,0.333333,0.750,0.710145,0.333333,0.00000,0.000000,0.0,...,0,0,0,0,1,0,0,0,1,0
1455,0.823529,0.000000,0.002776,0.333333,0.500,0.710145,0.333333,0.00000,0.044649,0.0,...,0,0,0,0,1,0,0,0,0,0
1456,0.000000,0.476027,0.087406,0.444444,0.750,0.637681,0.766667,0.00000,0.216867,0.0,...,0,0,0,0,1,0,0,0,0,0
1457,0.382353,0.140411,0.042726,0.444444,0.500,0.869565,0.700000,0.00000,0.059709,0.0,...,0,0,0,0,1,0,0,0,1,0


# Building a model.

In [15]:
X_train = df_all.iloc[:1460, :]
y_train = df_sale_price

In [16]:
xgboost = xgb.XGBRegressor(learning_rate = 0.01,
                           n_estimators = 3460,
                           max_depth = 3, 
                           min_child_weight = 0,
                           gamma = 0, 
                           subsample = 0.7,
                           colsample_bytree = 0.7,
                           objective = 'reg:linear', 
                           nthread = -1,
                           scale_pos_weight = 1, 
                           seed = 27,
                           reg_alpha = 0.00006)

In [17]:
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(df_all.iloc[1460:, :])



# Making a submission.

In [18]:
df_submission = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

submission = pd.DataFrame({'Id': df_submission['Id'], 'SalePrice': y_pred})
submission.to_csv('submission.csv', index = False)