In [14]:
import import_ipynb
from Helpers import *
import pandas as pd
import numpy as np

In [15]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Independent copy of the training frame as loaded, taken before any later
# cell reassigns `train`.
data = train.copy()


In [16]:
# ---------------------------------------------------------------------------
# Preprocessing configuration: column groups consumed by the pipeline steps.
# ---------------------------------------------------------------------------

# Columns dropped by the pipeline; labelled "high NAN count" by the author.
high_nan_columns = ['Electrical', 'LotFrontage', 'GarageYrBlt']

# Columns dropped by the pipeline; labelled "no variety" where they are used.
# 'GarageFinish' used to appear twice in this list; the duplicate is removed.
no_variety_columns = [
    'Utilities', 'Condition2', 'Heating', 'PavedDrive', 'Street',
    'BsmtCond', 'GarageFinish', 'RoofMatl',
]

# Columns whose missing values are filled with the string 'None'.
columnsToFillNone = [
    'GarageCond', 'BsmtQual', 'FireplaceQu', 'MiscFeature', 'PoolQC',
    'Alley', 'Fence', 'MasVnrType', 'GarageQual', 'GarageType',
    'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure',
]

# Columns whose missing values are filled with 0.
columnsToFillZero = [
    'MasVnrArea', "BsmtFinSF1", "BsmtFinSF2", 'BsmtUnfSF', "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath", "GarageCars", "GarageArea",
]

# Threshold passed as `min_corr` to drop_non_correlated_columns.
ignoreUncorrelatedRate = 0.00

# Numeric columns that are nevertheless passed to
# get_dummies_for_category_columns alongside the object-typed columns.
categoryColumns2 = [
    "OverallQual", "OverallCond", "MoSold", "BsmtFullBath", "FullBath",
    "HalfBath", "Fireplaces", "GarageCars", 'BsmtHalfBath', 'MSSubClass',
]

# Columns handed to log_tranform, selected by name. They are derived from
# `data` (the copy of the training frame taken at load time) rather than
# `train`, because `train` is later overwritten with its preprocessed version;
# re-running this cell after that point would otherwise select different names.
areaColumns = [
    col for col in data.columns
    if "SF" in col or 'Area' in col or 'ScreenPorch' in col
]

In [17]:


class HomePreprocessor:
    """Apply the notebook's preprocessing steps through a fit/transform interface.

    Nothing is learned from the data: ``fit`` is a no-op and ``transform``
    runs a fixed chain of helper functions driven by the module-level column
    lists (``high_nan_columns``, ``no_variety_columns``, ``columnsToFillNone``,
    ``columnsToFillZero``, ``categoryColumns2``, ``areaColumns``).

    Parameters
    ----------
    drop_low_correlated : bool, default False
        When true, ``transform`` first passes the frame through
        ``drop_non_correlated_columns`` with ``min_corr=ignoreUncorrelatedRate``.
    """

    def __init__(self, drop_low_correlated=False):
        self.drop_low_correlated = drop_low_correlated

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for API compatibility."""
        return self

    def transform(self, X, *_):
        """Return the preprocessed version of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to preprocess.
        *_ :
            Extra positional arguments are accepted and ignored.

        Returns
        -------
        The result of the helper chain applied to ``X``.
        """
        # The flag lives on the instance; the bare name was undefined here.
        if self.drop_low_correlated:
            # NOTE(review): this runs before any other step, whereas the
            # notebook's own call applies the same helper after preprocessing.
            # Confirm which order is intended.
            X = X.pipe(drop_non_correlated_columns, min_corr=ignoreUncorrelatedRate)

        # The chain is parenthesised so it can span several lines, starts from
        # X (the only frame in scope) and its result is what gets returned.
        return (
            X.pipe(drop_columns, columns=high_nan_columns)  # high number of NA values
            .pipe(drop_columns, columns=no_variety_columns)  # no variety
            .pipe(fill_NA, columns=columnsToFillNone, value='None')
            .pipe(fill_NA, columns=columnsToFillZero, value=0)
            # Object-typed columns are looked up on the frame as it stands at
            # this point, so columns dropped above are never requested.
            .pipe(
                lambda frame: get_dummies_for_category_columns(
                    frame, columns=frame.columns[frame.dtypes == "object"]
                )
            )
            .pipe(get_dummies_for_category_columns, columns=categoryColumns2)
            .pipe(set_type, 'MasVnrArea', 'int64')
            .pipe(year_to_age, 'YearBuilt')
            .pipe(year_to_age, 'YearRemodAdd')
            .pipe(year_to_age, 'YrSold')
            .pipe(drop_columns, columns=["Id", "BedroomAbvGr"])  # drop garbage
            .pipe(log_tranform, columns=areaColumns)
        )

# Capture the test ids first: `test` is replaced by its preprocessed version
# just below.
test_ids = test['Id']

# Preprocess both sets; only the training set is additionally passed through
# drop_non_correlated_columns.
train = drop_non_correlated_columns(preprocess(train), min_corr=ignoreUncorrelatedRate)
test = preprocess(test)

# Summary statistics of the processed training frame (displayed as cell output).
train.describe()

Unnamed: 0,KitchenAbvGr,TotRmsAbvGrd,EnclosedPorch,3SsnPorch,MiscVal,SalePrice,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,...,log_TotalBsmtSF,log_1stFlrSF,log_2ndFlrSF,log_LowQualFinSF,log_GrLivArea,log_GarageArea,log_WoodDeckSF,log_OpenPorchSF,log_ScreenPorch,log_PoolArea
count,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,...,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0,1448.0
mean,1.046961,6.51105,22.101519,3.437845,43.020718,180085.632597,0.006906,0.04489,0.01105,0.786602,...,6.745084,7.004752,2.865757,0.100641,7.265631,5.807202,2.441869,2.3025,0.410379,0.026418
std,0.221209,1.615368,61.33759,29.436993,497.718904,78663.945304,0.082844,0.207133,0.104571,0.409848,...,1.148086,0.314542,3.29357,0.750391,0.331355,1.451715,2.591356,2.150766,1.402543,0.409777
min,0.0,2.0,0.0,0.0,0.0,34900.0,0.0,0.0,0.0,0.0,...,0.0,5.814131,0.0,0.0,5.814131,0.0,0.0,0.0,0.0,0.0
25%,1.0,5.0,0.0,0.0,0.0,129900.0,0.0,0.0,0.0,1.0,...,6.679285,6.782758,0.0,0.0,7.028644,5.795297,0.0,0.0,0.0,0.0
50%,1.0,6.0,0.0,0.0,0.0,162700.0,0.0,0.0,0.0,1.0,...,6.898715,6.988873,0.0,0.0,7.284135,6.171701,0.0,3.218876,0.0,0.0
75%,1.0,7.0,0.0,0.0,0.0,213062.5,0.0,0.0,0.0,1.0,...,7.158903,7.232191,6.591674,0.0,7.481556,6.357842,5.129899,4.234107,0.0,0.0
max,3.0,14.0,552.0,508.0,15500.0,755000.0,1.0,1.0,1.0,1.0,...,8.073091,8.079928,7.63337,6.350886,8.450412,7.237778,6.602588,6.306275,6.175867,6.605298
