In [76]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [176]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

def add_mult_dummy(df, lst_dummies):
    '''
    Replace the given columns with dummy (indicator) columns.

    Missing values in a column to encode are replaced with the string "None"
    before encoding, so they get their own "<column>_None" indicator.
    The input dataframe is not modified.

    df: dataframe to get the dummies from

    lst_dummies: List of column names (as strings) to be converted to dummies

    returns:
    new dataframe in which every column of lst_dummies has been dropped and
    its dummy columns (prefixed with "<column>_") appended at the end, in the
    order given by lst_dummies
    '''

    lst_dummies = list(lst_dummies)

    encoded = []
    for dum in lst_dummies:
        col = df[dum]

        # fillna returns a new Series, so the caller's dataframe stays intact.
        if col.isnull().any():
            col = col.fillna("None")

        encoded.append(pd.get_dummies(col, prefix=dum))

    # Concatenate once instead of rebuilding the whole frame for every column.
    return pd.concat([df.drop(lst_dummies, axis=1)] + encoded, axis=1)


def scale_mult(df, lst_norm):
    '''
    Scale the given columns with sklearn's StandardScaler.

    Every column is fitted and transformed on its own, using all rows of df.
    The input dataframe is not modified.

    df: dataframe to scale values of

    lst_norm: List of columns to scale

    returns:
    new dataframe with the specified columns replaced by their scaled values;
    all other columns are unchanged
    '''

    # Work on a copy so the caller's dataframe keeps its original values.
    scaled = df.copy()

    for col in lst_norm:
        # StandardScaler expects a 2-D array of shape (n_samples, n_features).
        values = scaled[col].to_numpy().reshape(-1, 1)

        # Flatten back to 1-D so a plain column is assigned.
        scaled[col] = StandardScaler().fit_transform(values).ravel()

    return scaled

# Missing LotFrontage values become 0 in both tables.
for frame in (train, test):
    frame["LotFrontage"] = frame["LotFrontage"].fillna(0)

# These columns are filled with 0 and cast to int in the test table only
# (presumably only test has gaps in them -- TODO confirm).
for count_col in ("GarageCars", "BsmtFullBath", "BsmtHalfBath"):
    test[count_col] = test[count_col].fillna(0).astype(int)

# Stack train on top of test; the original row labels of both tables are kept.
full = pd.concat([train, test])


# TODO: Central air -- not covered by either list below yet.

# Columns that get replaced by dummy (indicator) columns.
lst_dummies = [
    "MSSubClass", "MSZoning", "Street", "Alley",
    "LotShape", "LandContour", "Utilities", "LotConfig",
    "LandSlope", "Neighborhood", "Condition1", "Condition2",
    "BldgType", "HouseStyle", "OverallQual", "OverallCond",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "ExterQual", "ExterCond", "Foundation",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
    "BsmtFinType2", "Heating", "HeatingQC", "Electrical",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd",
    "Functional", "Fireplaces", "FireplaceQu", "GarageType",
    "GarageFinish", "GarageCars", "GarageQual", "GarageCond",
    "PavedDrive", "PoolQC", "Fence", "MiscFeature",
    "MoSold", "YrSold", "SaleType", "SaleCondition",
]

# Columns that get scaled.
lst_scale = [
    "LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd",
    "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "GarageYrBlt", "WoodDeckSF", "OpenPorchSF",
    "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea",
    "MiscVal", "GarageArea",
]


# Encode and scale the combined frame, so train and test end up with the
# same set of dummy columns.
# NOTE(review): the scaler is fitted on train and test rows together, so the
# scaling statistics include test data -- confirm this is intended.
full = add_mult_dummy(full, lst_dummies)
full = scale_mult(full, lst_scale)

# `full` is train stacked on top of test, so split it back by position at the
# number of training rows instead of a hard-coded row count.
train_f = full.iloc[:len(train)]
test_f = full.iloc[len(train):]
assert len(train_f) + len(test_f) == len(full)

In [170]:
# Print every column of `test` that `train` does not have, one per line.
# NOTE(review): the recorded output lists dummy-style column names, which the
# frames loaded above would not contain -- it may be stale; confirm by re-running.
only_in_test = [name for name in test.columns if name not in train.columns]
for name in only_in_test:
    print(name)

MSSubClass_150
MSZoning_None
Utilities_None
Exterior1st_None
Exterior2nd_None
FullBath_4
KitchenQual_None
TotRmsAbvGrd_13
TotRmsAbvGrd_15
Functional_None
Fireplaces_4
GarageCars_5
SaleType_None


In [171]:
# Print every column of `train` that `test` does not have, one per line.
# NOTE(review): the recorded output lists dummy-style column names, which the
# frames loaded above would not contain -- it may be stale; confirm by re-running.
only_in_train = [name for name in train.columns if name not in test.columns]
for name in only_in_train:
    print(name)

SalePrice
Utilities_NoSeWa
Condition2_RRAe
Condition2_RRAn
Condition2_RRNn
HouseStyle_2.5Fin
RoofMatl_ClyTile
RoofMatl_Membran
RoofMatl_Metal
RoofMatl_Roll
Exterior1st_ImStucc
Exterior1st_Stone
Exterior2nd_Other
Heating_Floor
Heating_OthW
Electrical_Mix
Electrical_None
BedroomAbvGr_8
KitchenAbvGr_3
TotRmsAbvGrd_2
TotRmsAbvGrd_14
GarageQual_Ex
PoolQC_Fa
MiscFeature_TenC
