In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor


In [2]:
df_train = pd.read_csv("C:/Users/harish/Desktop/train.csv",index_col='Id')
Y_train=df_train['SalePrice']
df_train=df_train.iloc[:,:-1]#This will skip last column


df_test = pd.read_csv("C:/Users/harish/Desktop/test.csv",index_col='Id')

In [3]:
df= pd.concat([df_train, df_test])#We concatenate the trainng and test set so that we can apply all the preprocessing at 
                                  # once on both training and test set

In [4]:
#For most of the categorical variables, we fill NA by defining a separate category.
df['LotFrontage'].fillna(0,inplace=True)
df['Alley'].fillna('NAA',inplace=True)
df['MasVnrType'].fillna('None',inplace=True)
df.update(df[['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']].fillna('NB'))
df['MasVnrArea'].fillna(0,inplace=True)
df['Electrical'].fillna(df['Electrical'].mode()[0],inplace=True)
df['FireplaceQu'].fillna('NFP',inplace=True)
df.update(df[['GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond']].fillna('NG'))
df['PoolQC'].fillna('NP',inplace=True)
df['Fence'].fillna('NF',inplace=True)

df.fillna(method='ffill',inplace=True)#There are some variables in test set that have NA values when the same variable
                                      #does not have any NA in training set.To tackle these type variables, we have used ffill


In [5]:

df=pd.get_dummies(df,columns=['MSZoning','Neighborhood','Condition1','Condition2','BldgType',
                              'MasVnrType','Exterior2nd','Exterior1st','RoofStyle','Heating',
                              'CentralAir','RoofMatl'],drop_first=True)#one hot encoding used for categorical variables 
                                                                       #that do are not ordinal

#Label encoding used for categorical variables with ordinality
df['SaleType']=LabelEncoder().fit_transform(df['SaleType'])
df['SaleCondition']=LabelEncoder().fit_transform(df['SaleCondition'])


In [6]:
#We are manually replacing categorical(ordinal) variables with numerical entries to preserve their order of importance
replacements = {
   'Street': {'Pave': 1,'Grvl': 0},
    'Alley': {'NAA': 0,'Grvl': 1,'Pave':2},
    'LotShape': {'IR3': 0,'IR2': 1,'IR1':2,'Reg':3},
    'LandContour': {'Low': 0,'HLS': 1,'Bnk':2,'Lvl':3},
    'LotConfig': {'Inside': 0,'Corner': 1,'CulDSac':2,'FR2':3,'FR3':4},
    'LandSlope': {'Sev': 0,'Mod': 1,'Gtl':2},
    'HouseStyle': {'1Story': 1,'1.5Fin':1.5,'1.5Unf':1.25,'2Story':2,'2.5Fin':2.5,'2.5Unf':2.25,'SFoyer':3,'SLvl':3.5},
    'ExterQual': {'Ex': 4,'Gd':3,'TA':2,'Fa':1,'Po':0},
    'ExterCond': {'Ex': 4,'Gd':3,'TA':2,'Fa':1,'Po':0},
    'Foundation': {'PConc':5,'CBlock':4,'Stone':3,'Slab':2,'BrkTil':1,'Wood':0},
    'BsmtQual': {'Ex':105,'Gd':95,'TA':85,'Fa':75,'Po':42.5,'NB':0},
    'BsmtCond': {'Ex': 5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NB':0},
    'BsmtExposure':{'Gd': 4,'Av':3,'Mn':2,'No':1,'NB':0},
    'BsmtFinType1':{'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'NB':0},
    'BsmtFinType2':{'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'NB':0},
    'HeatingQC':{'Ex': 4,'Gd':3,'TA':2,'Fa':1,'Po':0},
    'Electrical':{'SBrkr': 4,'FuseA':3,'FuseF':2,'FuseP':1,'Mix':2.5},
    'KitchenQual':{'Ex': 5,'Gd':4,'TA':3,'Fa':2,'Po':1},
    'Functional':{'Typ': 7,'Min1':6,'Min2':5,'Mod':4,'Maj1':3,'Maj2':2,'Sev':1,'Sal':0},
    'FireplaceQu':{'Ex': 5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NFP':0},
    'GarageType':{'2Types': 6,'Attchd':5,'Basment':4,'BuiltIn':3,'CarPort':2,'Detchd':1,'NG':0},
    'GarageFinish':{'Fin':3,'RFn':2,'Unf':1,'NG':0}, 
    'GarageQual':{'Ex': 5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NG':0},
    'GarageCond':{'Ex': 5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NG':0},
    'PavedDrive':{'Y': 3,'P':2,'N':1},
    'PoolQC':{'Ex': 4,'Gd':3,'TA':2,'Fa':1,'NP':0},
    'Fence':{'GdPrv': 4,'MnPrv':3,'GdWo':2,'MnWw':1,'NF':0}   
}

df.replace(replacements, inplace=True)


df.loc[df['Utilities'] =='AllPub', 'Utilities'] = 1
df.loc[df['Utilities'] != 1, 'Utilities'] = 0


In [7]:
#FEATURE ENGINEERING
df['basementinfo']=(df['BsmtFinSF1']*df['BsmtFinType2'])+(df['BsmtFinSF1']*df['BsmtFinType2'])+df['BsmtUnfSF']
df['Totalbath']=df['BsmtFullBath']+df['BsmtHalfBath']+df['FullBath']+df['HalfBath']
df['Kitcheninfo']=df['KitchenQual']*df['KitchenAbvGr']
df['Fireplaceinfo']=df['FireplaceQu']*df['Fireplaces']
df['Garageinfo']=df['GarageType']*df['GarageFinish']*df['GarageQual']*df['GarageCond']*df['GarageCars']*df['GarageArea']
df['Porchtotal']=df['OpenPorchSF']+df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch']
df['Poolinfo']=df['PoolQC']*df['PoolArea']


In [8]:
df.drop(['BsmtFinSF1', 'BsmtFinSF2','BsmtFinType1','BsmtFinType2','BsmtUnfSF','TotalBsmtSF',
        '1stFlrSF', '2ndFlrSF','LowQualFinSF','KitchenQual', 'KitchenAbvGr','FireplaceQu', 
         'Fireplaces','GarageYrBlt','GarageType','GarageFinish','GarageQual','GarageCond',
         'GarageCars','GarageArea','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch'
        ,'PoolQC', 'PoolArea','MiscFeature','MoSold','BsmtFullBath', 'BsmtHalfBath','FullBath','HalfBath'], axis=1,inplace=True)

In [9]:
X_train=df.iloc[:(df_train.shape[0]),]
X_test =df.iloc[(df_train.shape[0]):,]

In [117]:
modelGB=GradientBoostingRegressor(n_estimators=400,random_state=6)
GB=modelGB.fit(X_train,Y_train)

In [120]:
y_pred=GB.predict(X_test)

In [121]:
s1=pd.Series(X_test.index.values, name='Id')
s2=pd.Series(y_pred, name='SalePrice')
sub= pd.concat([s1, s2], axis=1)
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,127347.726795
1,1462,150319.230065
2,1463,199236.215572
3,1464,195172.618018
4,1465,180640.005063


In [122]:
import os
os.chdir("C:/Users/ashwini/Desktop")
sub.to_csv('finaldataprocessing_GB.csv',index=False)