In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the full training table; X keeps every column, including SalePrice.
X = pd.read_csv('train.csv')

# Feature frame: all columns except SalePrice and Id.
X_train = X.drop(columns=['SalePrice', 'Id'])

# SalePrice is kept separately (presumably the prediction target).
y_train = X["SalePrice"]

In [3]:
# Keep only fully populated columns: every column that contains at least one
# missing value is removed. The name is rebound instead of using inplace=True
# so the frame is never mutated behind another reference and the cell stays
# safe to re-run.
X_train = X_train.dropna(axis=1)
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,2008,WD,Normal


# Ordinal Encoding


In [4]:
def OrdinalEncoding(X_train):
    """Return a copy of ``X_train`` with every object-dtype column integer-coded.

    Within each object-dtype column, distinct values are numbered in order of
    first appearance (the first value seen becomes 0, the next new value 1,
    and so on). Columns of any other dtype are passed through unchanged.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Input features. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded object-dtype columns first, followed by the remaining
        columns, on the same index as ``X_train``.

    Notes
    -----
    The codes depend on row order and carry no ranking of the categories.
    """
    categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == 'object']
    non_cat_cols = [cname for cname in X_train.columns if cname not in categorical_cols]

    X_new_cat = X_train[categorical_cols].copy()
    X_non_cat = X_train[non_cat_cols]

    for col in categorical_cols:
        # Number the categories in order of first appearance.
        codes = {category: idx for idx, category in enumerate(X_train[col].unique())}
        # Replace the whole column in one vectorised step. Writing cell by
        # cell through chained indexing (frame[col][i] = ...) treats the
        # positional counter as an index label and is not guaranteed to write
        # back into the frame.
        X_new_cat[col] = X_train[col].map(codes)

    X_train_OE = pd.concat([X_new_cat, X_non_cat], axis=1)

    return X_train_OE

In [5]:
# Preview X_train before any encoding is applied.
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,2008,WD,Normal


In [6]:
# Integer-code the object-dtype columns of X_train and preview the result.
X_train_OE = OrdinalEncoding(X_train)
X_train_OE.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0,0,0,0,0,0,0,0,0,0,...,548,0,61,0,0,0,0,0,2,2008
1,0,0,0,0,0,1,0,1,1,0,...,460,298,0,0,0,0,0,0,5,2007
2,0,0,1,0,0,0,0,0,0,0,...,608,0,42,0,0,0,0,0,9,2008
3,0,0,1,0,0,2,0,2,0,0,...,642,0,35,272,0,0,0,0,2,2006
4,0,0,1,0,0,1,0,3,0,0,...,836,192,84,0,0,0,0,0,12,2008


In [7]:
# Display X_train again; the output still shows the original string categories.
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,2008,WD,Normal


# One Hot Encoding


In [8]:
def OneHotEncoding(X_train, max_cardinality=10, prefix_sep=None):
    """Return a one-hot encoded version of ``X_train``.

    Every object-dtype column with fewer than ``max_cardinality`` distinct
    values is expanded into one 0/1 indicator column per distinct value, in
    order of first appearance. Columns of dtype ``int64`` or ``float64`` are
    appended unchanged after the indicator columns. All other columns
    (object columns at or above the cardinality limit, and any other dtype)
    are left out of the result.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Input features. The frame is not modified.
    max_cardinality : int, default 10
        Object columns are encoded only when their number of distinct values
        is strictly below this limit.
    prefix_sep : str or None, default None
        With ``None`` each indicator column is named after the bare category
        value, so two features sharing a category value produce duplicate
        column labels. Pass a separator such as ``"_"`` to name the columns
        ``"<feature><sep><category>"`` instead.

    Returns
    -------
    pandas.DataFrame
        Indicator columns followed by the numeric columns, on the same index
        as ``X_train``.
    """
    categorical_cols = [
        cname for cname in X_train.columns
        if X_train[cname].dtype == 'object' and X_train[cname].nunique() < max_cardinality
    ]
    non_cat_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

    # One frame of indicators per encoded feature, collected and concatenated
    # once at the end rather than growing a frame inside the loop.
    indicator_frames = []
    for col in categorical_cols:
        values = X_train[col]
        indicators = {}
        for category in values.unique():
            # A missing value never compares equal to itself, so match it
            # with isna() instead of ==.
            hits = values.isna() if pd.isna(category) else values == category
            label = category if prefix_sep is None else f"{col}{prefix_sep}{category}"
            indicators[label] = hits.astype('int64')
        # Build on X_train's own index so the final label-aligned concat lines
        # the indicators up with the numeric columns row for row.
        indicator_frames.append(pd.DataFrame(indicators, index=X_train.index))

    X_train_OH = pd.concat(indicator_frames + [X_train[non_cat_cols]], axis=1)
    return X_train_OH

In [9]:
# Build the one-hot encoded feature frame from X_train.
X_train_OH = OneHotEncoding(X_train)

In [10]:
# Preview the one-hot encoded frame.
X_train_OH.head()

Unnamed: 0,RL,RM,C (all),FV,RH,Pave,Grvl,Reg,IR1,IR2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,0,0,0,0,1,0,1,0,0,...,548,0,61,0,0,0,0,0,2,2008
1,1,0,0,0,0,1,0,1,0,0,...,460,298,0,0,0,0,0,0,5,2007
2,1,0,0,0,0,1,0,0,1,0,...,608,0,42,0,0,0,0,0,9,2008
3,1,0,0,0,0,1,0,0,1,0,...,642,0,35,272,0,0,0,0,2,2006
4,1,0,0,0,0,1,0,0,1,0,...,836,192,84,0,0,0,0,0,12,2008


In [11]:
# Display X_train again; the output still shows the original string categories.
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,2008,WD,Normal
