In [None]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import math
from statistics import mode
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Read the raw training and test tables.
df_actual = pd.read_csv('../input/train.csv')
tf_actual = pd.read_csv('../input/test.csv')

# The test Ids are kept aside for the submission file; the Id column itself is
# removed from both working frames.
zt = tf_actual['Id'].values
df = df_actual.drop(columns=['Id'])
tf = tf_actual.drop(columns=['Id'])

In [None]:
# Numeric columns of the training data.
num_cols = df.select_dtypes(include=['number']).columns
# MSSubClass, MoSold and YrSold are stored as numbers but are actually
# categorical, so they are excluded from the numeric frame.
ndf = df[num_cols].drop(['MSSubClass', 'MoSold', 'YrSold'], axis=1)
# Filling missing values with the median gives better results than filling them with 0.
ndf = ndf.fillna(ndf.median())

# Outlier damping: values beyond a per-column limit are replaced by the column
# mean (the mean is taken before replacement, i.e. with the outliers included).
# Each entry is (column, limit, inclusive); inclusive=True means ">= limit",
# inclusive=False means "> limit".
outlier_limits = [
    ("GarageArea", 1100, False),
    ("TotalBsmtSF", 3000, True),
    ("1stFlrSF", 2500, True),
    ("GrLivArea", 4000, True),
    ("WoodDeckSF", 600, True),
    ("LotFrontage", 160, True),
    ("LotArea", 27000, True),
]
for col, limit, inclusive in outlier_limits:
    # Cast first so the float mean can be stored in columns that were read as int.
    ndf[col] = ndf[col].astype(float)
    is_outlier = ndf[col] >= limit if inclusive else ndf[col] > limit
    # A single .loc assignment replaces the chained indexing ndf[col][mask] = ...,
    # which triggers SettingWithCopyWarning and is not applied under Copy-on-Write.
    ndf.loc[is_outlier, col] = ndf[col].mean()
len(ndf.columns)

In [None]:
# Display the column labels of the numeric training frame.
ndf.columns

In [None]:
# Non-linear transformation: log base 2 is applied to every column that contains
# no 0, because log2(0) = -infinity. Columns with at least one 0 are left unchanged.
zero_free_cols = [name for name in ndf.columns if not (ndf[name] == 0).any()]
for name in zero_free_cols:
    ndf[name] = np.log2(ndf[name])

In [None]:
# Every column of df that is not part of the numeric frame is treated as
# categorical; this includes MSSubClass, MoSold and YrSold, which were removed
# from ndf earlier.
numeric_names = set(ndf.columns)
cat_cols = [name for name in df.columns if name not in numeric_names]
cdf = df[cat_cols]
cdf.shape[1]

In [None]:
# Display the column labels of the categorical training frame.
cdf.columns

In [None]:
# Columns in which a single value covers more than 95% of the rows carry almost
# no information and are dropped.
# value_counts(dropna=False) counts missing values as a value of their own, so a
# column that is almost entirely NaN is detected as well (comparing against a NaN
# mode with == never matches, which hid such columns before).
lowlearn_cat = []
for col in cdf.columns:
    # Shares are sorted in descending order, so the first entry is the most frequent value.
    top_share = df[col].value_counts(normalize=True, dropna=False).iloc[0]
    if 1 - top_share < 0.05:
        lowlearn_cat.append(col)
cdf = cdf.drop(lowlearn_cat, axis=1)
lowlearn_cat

In [None]:
# Number of categorical columns left after dropping the near-constant ones.
len(cdf.columns)

In [None]:
# A missing value in a categorical column means the property lacks that amenity.
# That is a category in its own right, so it is recoded as the label 'None'.
actualmiss_cat = cdf.columns[cdf.isnull().any().values].tolist()
for name in actualmiss_cat:
    cdf.loc[cdf[name].isnull(), name] = 'None'
actualmiss_cat

In [None]:
# These three columns hold numeric codes; converting them to strings lets them be
# handled like the other categorical columns.
coded_cols = ['MSSubClass', 'MoSold', 'YrSold']
cdf[coded_cols] = cdf[coded_cols].astype(str)

In [None]:
# One-hot encode the categorical columns: every distinct category of every column
# becomes its own indicator column.
cdf = pd.get_dummies(cdf)

In [None]:
# Fill any remaining gaps with the column median; for 0/1 indicator columns the
# median equals the mode.
# NOTE(review): pd.get_dummies presumably produces no missing values here, in
# which case this line changes nothing — confirm.
cdf = cdf.fillna(cdf.median())

In [None]:
# One-hot encoding creates many columns for a small amount of data, so the
# dimensionality has to be reduced: indicator columns whose category applies to
# very few properties are removed later on. Below, the percentage of properties
# belonging to each category is shown.
# The percentage is derived from the actual row count instead of a hard-coded 14.6.
print((cdf == 1).sum() * 100 / len(cdf))

In [None]:
# Indicator columns whose category applies to fewer than 1% of the properties are
# collected in a list.
# The share is computed in one vectorised pass over the whole frame and uses the
# real row count, replacing the cell-by-cell loop over a fixed 1460 rows.
category_share = (cdf == 1).sum() / len(cdf)
lownum_cat = category_share[category_share < 0.01].index.tolist()
lownum_cat

In [None]:
# (rows, columns) of the numeric and of the one-hot encoded training frames.
ndf.shape,cdf.shape

In [None]:
# Place the numeric and the one-hot encoded columns side by side in one training frame.
dfc = pd.concat((ndf, cdf), axis=1)
dfc.shape[1]

In [None]:
# The feature engineering applied to the training data is repeated for the test data.
numt_cols = tf.select_dtypes(include=['number']).columns
ntf = tf[numt_cols].drop(['MSSubClass', 'MoSold', 'YrSold'], axis=1)
# Impute with the medians of the RAW training columns. ndf has already been
# log2-transformed at this point, so ndf.median() would put log-scale values into
# raw-scale columns, and those values would then be log-transformed again below.
ntf = ntf.fillna(df[ntf.columns].median())

# Outlier damping with the same limits as the training data; each entry is
# (column, limit, inclusive) where inclusive=True means ">= limit".
outlier_limits = [
    ("GarageArea", 1100, False),
    ("TotalBsmtSF", 3000, True),
    ("1stFlrSF", 2500, True),
    ("GrLivArea", 4000, True),
    ("WoodDeckSF", 600, True),
    ("LotFrontage", 160, True),
    ("LotArea", 27000, True),
]
for col, limit, inclusive in outlier_limits:
    # Cast first so the float mean can be stored in columns that were read as int.
    ntf[col] = ntf[col].astype(float)
    is_outlier = ntf[col] >= limit if inclusive else ntf[col] > limit
    # .loc assignment instead of chained indexing, which is not applied under
    # pandas Copy-on-Write and raises SettingWithCopyWarning before that.
    ntf.loc[is_outlier, col] = ntf[col].mean()

# log2 is applied to every column without a 0, because log2(0) = -infinity.
# NOTE(review): the columns are chosen from the test data's own zeros; if a column
# is zero-free in only one of train/test the two sets end up on different scales
# for that feature — confirm the selected columns match the training side.
log_cols = [name for name in ntf.columns if not (ntf[name] == 0).any()]
for name in log_cols:
    ntf[name] = np.log2(ntf[name])

# Categorical part: drop the near-constant columns found on the training data,
# recode missing values as 'None' in the columns that had missing values in the
# training data, convert the numeric codes to strings and one-hot encode.
catt_cols = [_ for _ in tf.columns if _ not in ntf.columns]
ctf = tf[catt_cols]
ctf = ctf.drop(lowlearn_cat,axis=1)
for _ in actualmiss_cat:
    null_ind = ctf[_].isnull()
    ctf.loc[null_ind, _] = 'None'
ctf[['MSSubClass', 'MoSold', 'YrSold']] = ctf[['MSSubClass', 'MoSold', 'YrSold']].astype(str)
ctf = pd.get_dummies(ctf)
ctf = ctf.fillna(ctf.median())
ntf.shape,ctf.shape

In [None]:
# Place the numeric and the one-hot encoded columns side by side in one test frame.
tfc = pd.concat((ntf, ctf), axis=1)
tfc.shape[1]

In [None]:
# Some indicator columns exist in the training data but not in the test data,
# because no test property belongs to those categories. They are added to the
# TEST frames filled with 0 so both sets have the same dimensionality.
# The columns are laid out by name: first all training indicator columns in
# training order, then any indicator columns that exist only in the test data.
# Positional insertion placed the new columns at the wrong offset in tfc (it
# ignored the numeric columns in front) and shifted them in ctf whenever a
# test-only column came earlier.
test_only_cols = [name for name in ctf.columns if name not in cdf.columns]
aligned_cat_cols = list(cdf.columns) + test_only_cols
ctf = ctf.reindex(columns=aligned_cat_cols, fill_value=0)
tfc = tfc.reindex(columns=list(ntf.columns) + aligned_cat_cols, fill_value=0)

In [None]:
# (rows, columns) of the one-hot encoded test frame.
ctf.shape

In [None]:
# Remove the rare-category indicator columns listed in lownum_cat from both the
# training and the test categorical frames.
cdf = cdf.drop(columns=lownum_cat)
ctf = ctf.drop(columns=lownum_cat)

In [None]:
# (rows, columns) of the training and test categorical frames, for comparison.
cdf.shape,ctf.shape

In [None]:
# The test data may contain indicator columns for categories that never occur in
# the training data. Nothing can be learned about such categories, so they are to
# be dropped from the test data; this cell only lists them.
test_only = [name for name in ctf.columns if name not in cdf.columns]
for name in test_only:
    print(name)

In [None]:
# Drop every indicator column that exists only in the test data. The columns are
# computed rather than hard-coded, so this keeps working if the set of test-only
# categories changes.
ctf = ctf.drop(columns=[name for name in ctf.columns if name not in cdf.columns])

In [None]:
# Remove the rare-category indicator columns from the combined training frame.
dfc = dfc.drop(lownum_cat, axis=1)
# Give the test frame exactly the training feature columns, in the same order.
# reindex selects columns by name: test-only columns (e.g. MSSubClass_150) and the
# rare-category columns are dropped, and a feature missing from the test data
# would be added filled with 0. The models are fit on dfc without SalePrice and
# predict on tfc, so the two column layouts have to match.
feature_cols = dfc.columns.drop('SalePrice')
tfc = tfc.reindex(columns=feature_cols, fill_value=0)

In [None]:
# (rows, columns) of the final training and test frames; dfc still contains SalePrice.
dfc.shape,tfc.shape

In [None]:
# Hyperparameter tuning with GridSearchCV, which tries every combination of the
# given parameter values and reports the best one.
# XGBRegressor is used; for details refer to the links given at the end.
params = {
    'max_depth': [3, 4],
    'n_estimators': [500, 800, 1000, 1200],
    'colsample_bytree': [0.2, 0.3, 0.4],
}

bxgb = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.05),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
bxgb.fit(dfc.drop(columns='SalePrice'), dfc['SalePrice'])
print(bxgb.best_estimator_)

In [None]:
# XGBoost model with fixed hyperparameters.
bxgb = XGBRegressor(
    colsample_bytree=0.2,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=1000,
)
# cv_score holds one score per fold of a 5-fold cross-validation.
cv_score = cross_val_score(
    bxgb,
    dfc.drop(columns='SalePrice'),
    dfc['SalePrice'],
    cv=5,
    n_jobs=-1,
)

In [None]:
# Average of the per-fold scores.
print('Mean CV XGB: {}'.format(np.mean(cv_score)))

In [None]:
# Refit on the complete training set, then predict the test set.
train_features = dfc.drop(columns='SalePrice')
yt_xgb = bxgb.fit(train_features, dfc['SalePrice']).predict(tfc)

In [None]:
# XGBoost predictions for the test set, as returned by the model.
yt_xgb

In [None]:
# Raise 2 to the predictions, i.e. the inverse of a log2 transformation.
yt2_xgb = 2 ** yt_xgb

In [None]:
# XGBoost predictions after applying 2**x.
yt2_xgb

In [None]:
# LassoCV is a Lasso model that selects its regularisation strength by
# cross-validation while it is being fitted.
lcv = LassoCV()
# The features are standardised before they reach the Lasso step.
slcv = Pipeline(steps=[
    ('std_sclr', StandardScaler()),
    ('lcv', lcv),
])

cv_score = cross_val_score(
    slcv,
    dfc.drop(columns=['SalePrice']),
    dfc['SalePrice'],
    cv=5,
    n_jobs=-1,
)

In [None]:
# Average of the per-fold scores of the Lasso pipeline.
print('Mean CV Lasso: {}'.format(np.mean(cv_score)))

In [None]:
# Refit the Lasso pipeline on the complete training set, then predict the test set.
train_features = dfc.drop(columns='SalePrice')
yt_lcv = slcv.fit(train_features, dfc['SalePrice']).predict(tfc)

In [None]:
# Coefficients of the fitted Lasso step, and how many of them are exactly 0.
w = slcv.named_steps['lcv'].coef_
count = int(np.sum(w == 0))
w

In [None]:
# Number of Lasso coefficients that are exactly 0.
count

In [None]:
# Lasso pipeline predictions for the test set, as returned by the model.
yt_lcv

In [None]:
# Raise 2 to the predictions, i.e. the inverse of a log2 transformation.
yt2_lcv = 2 ** yt_lcv

In [None]:
# Lasso predictions after applying 2**x.
yt2_lcv

In [None]:
# Simple ensemble: the final prediction is the element-wise average of the
# XGBoost and the Lasso predictions (both after the 2**x step).
# Combining several models follows the same idea as a random forest, where the
# results of many decision trees are averaged instead of relying on a single
# tree, which gives better results than one tree alone.
yt2 = (yt2_xgb + yt2_lcv)/2

In [None]:
# Ids of the test rows, taken from the test file's Id column.
zt

In [None]:
# Stack the test Ids (row 0) and the averaged predictions (row 1) into one
# two-row array.
# NOTE(review): presumably this promotes the integer Ids to float, which would be
# why they are cast back with int() when the file is written — confirm.
yt_pred = np.vstack((zt,yt2))
yt_pred

In [None]:
# Write the submission file: a header line followed by one "Id,SalePrice" line per
# test row. The with-block guarantees the file is flushed and closed, and the loop
# covers however many predictions there are instead of a fixed 1459.
with open('hp_output_lasxgb.csv', 'w') as f:
    f.write("Id,SalePrice\n")
    for house_id, price in zip(yt_pred[0], yt_pred[1]):
        f.write("{0},{1}\n".format(int(house_id), float(price)))

In [None]:
"""
Useful Links:

Pipeline: https://medium.com/@yanhann10/a-brief-view-of-machine-learning-pipeline-in-python-5f50b941fca8
gridsearchCV: https://stackoverflow.com/questions/30102973/how-to-get-best-estimator-on-gridsearchcv-random-forest-classifier-scikit
Lasso algo: https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/
Gradient boosting algorithm: https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
Hyper parameter tuning of XGB: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
"""