In [1]:
import sklearn as sk
import sklearn.covariance as cov
import pandas as pd
import numpy as np

In [2]:
import sklearn.preprocessing as pre
from DataFrameImputer import DataFrameImputer

In [3]:
# Load the train and test tables, stack them, and impute missing values.
# Per the original note, DataFrameImputer is expected to fill labels with the
# most frequent value and numericals with the mean (defined in a local module).
#
# `DataFrame.from_csv` was removed in pandas 1.0; `read_csv(index_col=0)` keeps
# the first column as the index like `from_csv` did.
# NOTE(review): `from_csv` also defaulted to parse_dates=True; that is not
# carried over here on the assumption that the index is a plain id -- confirm.
train_original = pd.read_csv("train.csv", index_col=0)
test_original = pd.read_csv("test.csv", index_col=0)
lenght_train = len(train_original)  # (sic) name is reused when splitting the data again

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows the same way.
total = pd.concat([train_original, test_original])
total['SalePrice'] = total['SalePrice'].fillna(value=0)  # don't impute price of the test data

total_dframe = DataFrameImputer().fit_transform(total)

In [4]:
def ascending(set, na=None):
    """Build a lookup from each label to its 1-based position in ``set``.

    Parameters
    ----------
    set : iterable of hashable labels, in the order that defines their rank.
        (The name shadows the builtin only inside this function; it is kept so
        existing keyword callers keep working.)
    na : optional label meaning 'not applicable'.

    Returns
    -------
    dict
        Without ``na``: ``{label: rank}`` with ranks starting at 1.
        With ``na``: ``{label: [has, rank]}`` where ``has`` is 1 for every
        regular label, plus ``{na: [0, 0]}``.
    """
    ranks = {label: rank for rank, label in enumerate(set, start=1)}

    if na is None:
        return ranks

    # The previous implementation appended to a `map` object and fed 3-element
    # rows to dict(), which fails; keep the intended [has, rank] pair as value.
    encoded = {label: [1, rank] for label, rank in ranks.items()}
    encoded[na] = [0, 0]
    return encoded

In [5]:
# Column groups: every column of the table is assigned to one of the lists
# below, which decides how it is encoded further down.

# NOTE(review): both names are also listed in `already_numerical`, so they are
# still copied into the output -- confirm whether they were meant to be dropped.
unneccessary = [
    'TotalBsmtSF',
    'TotRmsAbvGrd',
]

# Columns that are handled individually.
special = [
    'MoSold',
    'YrSold',
    'SalePrice',
]

# Columns that are copied over unchanged.
already_numerical = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
    'OverallQual', 'OverallCond',
]

# Shared label scale used by most of the quality / condition columns.
default = ['Ex', 'Gd', 'TA', 'Fa', 'Po']

# Categorical columns whose labels have an order; the list order is the rank order.
ordered = {
    'LandContour': ['Low', 'Lvl', 'Bnk', 'HLS'],
    'Utilities': ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
    'LandSlope': ['Gtl', 'Mod', 'Sev'],
    'ExterQual': default,
    'ExterCond': default,
    # has an NA
    'BsmtQual': default,
    # has an NA
    'BsmtCond': default,
    # has an NA
    'BsmtExposure': ['Gd', 'Av', 'Mn', 'No'],
    # has an NA
    'BsmtFinType1': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf'],
    'BsmtFinType2': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf'],
    'HeatingQC': default,
    'KitchenQual': default,
    'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
    # has an NA
    'FireplaceQu': default,
    # has an NA
    'GarageFinish': ['Fin', 'RFn', 'Unf'],
    # has an NA
    'GarageQual': default,
    # has an NA
    'GarageCond': default,
    'PavedDrive': ['Y', 'P', 'N'],
    # has an NA
    'PoolQC': default,
    'Fence': ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA'],
}

# Categorical columns without an order; these get one indicator column per label.
unordered = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LotConfig',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'Electrical', 'GarageType',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

In [6]:
# Sanity check: no column may belong to more than one of the three encoding groups.
assert not (set(already_numerical) & set(ordered))
assert not (set(unordered) & set(ordered))
assert not (set(unordered) & set(already_numerical))

In [7]:
# Every column of the imputed table must be listed in exactly the groups above,
# and the groups must not name a column the table does not have.
all_columns = (
    set(unneccessary)
    | set(special)
    | set(already_numerical)
    | set(ordered)
    | set(unordered)
)
assert all_columns == set(total_dframe.columns)

In [8]:
# Start the output table on the same row index as the imputed input.
data = pd.DataFrame(index=total_dframe.index)

# Price goes in first so it ends up as the first column.
data['SalePrice'] = total_dframe['SalePrice']
# Sale date as a single number: year plus month expressed in twelfths of a year.
data['SellDate'] = total_dframe['YrSold'].add(total_dframe['MoSold'].div(12))

In [9]:
# Numerical columns are carried over unchanged, in the order they are listed.
for column_name, column_values in total_dframe[already_numerical].items():
    data[column_name] = column_values

In [10]:
# Ordered categoricals: replace each label by its 1-based position in the
# column's label list. A label missing from that list raises KeyError.
for column_name, levels in ordered.items():
    rank_of = ascending(levels)
    data[column_name] = [rank_of[label] for label in total_dframe[column_name]]

In [11]:
# Unordered categoricals: one indicator column per label, named "<column> <label>".
lable_binarizer = dict()  # (sic) fitted binarizer per column, kept for later decoding
for item in unordered:
    lb = pre.LabelBinarizer()
    lable_binarizer[item] = lb  # save for later decoding

    new_columns = lb.fit_transform(total_dframe[item])
    classes = lb.classes_
    if new_columns.shape[1] == 1 and len(classes) == 2:
        # For a two-label column LabelBinarizer emits ONE column that is 1 for
        # the second class. Pairing it with classes_[0] (as zip would) names
        # the column after the wrong label, so name it after classes_[1].
        classes = classes[1:]

    for class_, content in zip(classes, new_columns.T):
        data[item + " " + str(class_)] = content
        

In [12]:
# Undo the stacking by position: the first `lenght_train` rows came from the
# training file, everything after that from the test file.
train = data.iloc[:lenght_train]
test = data.iloc[lenght_train:]

# Write both parts out, index included.
train.to_csv("cleaned_train.csv")
test.to_csv("cleaned_test.csv")