In [1]:
#Importing A lot of Stuff 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor,StackingRegressor,GradientBoostingRegressor
from sklearn.linear_model import LassoCV,RidgeCV,ElasticNetCV
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
import lightgbm as lbg

In [2]:
#Reading the data
train=pd.read_csv('data/train.csv',index_col='Id')
test=pd.read_csv('data/test.csv',index_col='Id')
y=train['SalePrice']
X=train.append(test)

In [3]:
#Create a Dictionary of the Missing Values and dropping columns with too many missing values
def missing(X):
    missing_cols={}
    for i in range(len(X.columns)):
        if X.iloc[:,i].isnull().sum()>0:
            missing_cols[X.iloc[:,i].name]=X.iloc[:,i].isnull().sum()        
    return missing_cols
X.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],axis=1,inplace=True)

In [4]:
#Manually Fill Important Missing Values
#These columns have NaN for features the houses dont have. So I am adding None to them.
null_cats=['BsmtQual', 'BsmtCond','BsmtExposure', 'BsmtFinType1','BsmtFinType2',
           'GarageType','GarageFinish','GarageQual', 'GarageCond']
for f in null_cats:
    X[f].fillna('None',inplace=True)
    
#These columns should be 0 for NaN values
null_nums=['MasVnrArea' ,'GarageYrBlt', 'GarageArea', 
           'GarageCars','BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
for f in null_nums:
    X[f].fillna(0,inplace=True)
    
X['LotFrontage']=X.groupby('Neighborhood')['LotFrontage'].transform(lambda x:x.fillna(x.median()))
missing(X)

{'MSZoning': 4,
 'Utilities': 2,
 'Exterior1st': 1,
 'Exterior2nd': 1,
 'MasVnrType': 24,
 'Electrical': 1,
 'KitchenQual': 1,
 'Functional': 2,
 'SaleType': 1,
 'SalePrice': 1459}

In [5]:
#Fill the Remaining missing values with the imputer, median and most_frequent are used so that outliers do not affect this.
num_cols = X.select_dtypes([np.int64,np.float64]).columns
cat_cols = X.select_dtypes([object]).columns


imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
X_num=X[num_cols]

X_numeric=pd.DataFrame(imp_mean.fit_transform(X_num))
X_numeric.columns=X_num.columns
X_numeric.index=X_num.index

imp_most = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_cat=X[cat_cols]

X_categorical=pd.DataFrame(imp_most.fit_transform(X_cat))
X_categorical.columns=X_cat.columns
X_categorical.index=X_cat.index

X=pd.concat([X_numeric,X_categorical],axis=1)
missing(X)

{}

In [6]:
#LabelEncoding All the Categories
label=LabelEncoder()
for feat in cat_cols:
    X[feat]=label.fit_transform(X[feat])

In [7]:
#Outlier Detection and Removal
train=X[:1460]
def outliers(x, y=y, top=5):
    lof = LocalOutlierFactor(n_neighbors=50, contamination=0.1)
    x_ =np.array(x).reshape(-1,1)
    preds = lof.fit_predict(x_)
    lof_scr = lof.negative_outlier_factor_
    out_idx = pd.Series(lof_scr).sort_values()[:top].index
    return out_idx
outs1 = outliers(train['OverallQual'], top=5)
outs2 = outliers(train['GrLivArea'],top=5)
outs=outs1.append(outs2)
X.drop(outs,inplace=True)
y.drop(outs,inplace=True)

In [8]:
#Applying Standard Scaler 
X_scaled=pd.DataFrame(StandardScaler().fit_transform(X))
X_scaled.columns=X.columns
X_scaled.index=X.index
X_scaled

#Applying log to SalePrice to normalise its distribution.
y=np.log(y)

In [9]:
#Splitting the data into train and test sets according to kaggle
X_scaled.drop('SalePrice',inplace=True,axis=1)
train=X_scaled[:1451]
test=X_scaled[1451:]

#Creating a train test split for creating a model
X_train,X_test,y_train,y_test=train_test_split(train, y, test_size=.2, random_state=42)

In [12]:
#this model uses both stacking and blending approach. Hyperparameter isnt done so that can still improve this models accuracy
def model(X_train,y_train,X_test):
    #Single Models for Blending Aprroach
    las=LassoCV(max_iter=1e7,)
    rid=RidgeCV()
    enr=ElasticNetCV(max_iter=1e7)
    gbr=GradientBoostingRegressor()
    svm=SVR(C= 20, epsilon= 0.008, gamma=0.0003)
    xgb1=XGBRegressor(objective='reg:squarederror',learning_rate=0.02,n_estimators=3000,subsample=0.7,reg_alpha=0.00006)
    lgbm=lbg.LGBMRegressor(boosting_type='gbdt',objective='regression',n_estimators=3000,learning_rate=0.01,max_bin=200,
                           bagging_fraction=0.75,bagging_freq=5,feature_fraction=0.2)
    
    estimators=[('ridge', rid),('lasso',las),('ENR',enr),('GBR',gbr),('SVR',svm),('XGB',xgb1),('LGBM',lgbm)]
    
    #The Stacked Model
    stack_model=StackingRegressor(estimators=estimators,final_estimator=xgb1)
    stack_model.fit(X_train,y_train)
    y_stack=stack_model.predict(X_test)
    
    #Linear Blending of all models including the stacked one
    las.fit(X_train,y_train)
    rid.fit(X_train,y_train)
    enr.fit(X_train,y_train)
    gbr.fit(X_train,y_train)
    svm.fit(X_train,y_train)
    xgb1.fit(X_train,y_train)
    lgbm.fit(X_train,y_train)
    y1=las.predict(X_test)
    y2=rid.predict(X_test)
    y3=enr.predict(X_test)
    y4=gbr.predict(X_test)
    y5=svm.predict(X_test)
    y6=xgb1.predict(X_test)
    y7=lgbm.predict(X_test)
    
    
    
    #Blending all the results
    y_pred=0.1*y1+0.1*y2+0.1*y3+0.1*y4+0.1*y5+0.1*y6+0.1*y7+0.3*y_stack
    
    return y_pred

In [13]:
y_pred=model(X_train,y_train,X_test) #Model's performance on test data
mean_absolute_error(y_pred,y_test)



0.08693963582477862

In [14]:
y_pred=model(train,y,train)  #Models's Performance on training data. 
mean_absolute_error(y_pred,y)#This helps in understanding if the model has any bias or variance



0.04929923062855886

In [15]:
result=model(train,y,test) #Making the final predictions



In [17]:
res=pd.read_csv('data/sample_submission.csv',index_col=None)  #Creating a submission file
res['SalePrice']=np.floor(np.exp(result))
res.to_csv('Submission.csv',index=False)