In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

## Encoding categorical features

In [2]:
# Read the training CSV from the house_prices_kaggle data folder into a DataFrame.
train_csv_path = './Data/house_prices_kaggle/train.csv'
data = pd.read_csv(train_csv_path)

In [3]:
def miss(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints how many columns have at least one NaN out of the total number of
    columns, then returns a table describing those columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, ordered by missing count (descending), with
        two columns: 'Total missed values' (NaN count) and
        '% of missed values' (NaN count as a percentage of ``len(df)``).
        Columns without any NaN are not listed.
    """
    # TODO: it would be cool to add the types of the columns into the table
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    perc_miss = 100 * missing / len(df)
    missing = pd.concat([missing, perc_miss], axis=1)
    # Flat list of labels on purpose: a nested list ([[...]]) would be read as
    # the levels of a MultiIndex, turning every column label into a 1-tuple.
    missing.columns = ['Total missed values', '% of missed values']
    print("This DataFrame has " + str(missing.shape[0]) + " columns with NaN values from " +
          str(df.shape[1]) + " columns ")
    return missing

In [4]:
# Report the columns of `data` that contain NaN values (count and percentage).
miss(data)

This DataFrame has 19 columns with NaN values from 81 columns 


Unnamed: 0,Total missed values,% of missed values
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageYrBlt,81,5.547945
GarageType,81,5.547945
GarageFinish,81,5.547945
GarageQual,81,5.547945


In [5]:
# LabelEncoding categorical features using scikit-learn.
# The five object columns with the largest share of NaN are removed first, so
# the row-wise dropna() below does not discard most of the rows.
sparse_categ_cols = ['Alley', 'PoolQC', 'MiscFeature', 'Fence', 'FireplaceQu']
categ_features = (
    data.select_dtypes('object')
    .drop(columns=sparse_categ_cols)
    .dropna()
)

# Keep one fitted encoder per column. A single shared LabelEncoder is refit on
# every column and only remembers the classes of the last one, so the integer
# codes of all other columns could never be mapped back to their categories.
label = LabelEncoder()
label_encoders = {}
for col in categ_features.columns:
    label = LabelEncoder()
    label_encoders[col] = label.fit(categ_features[col])
# `label` is left bound to the encoder of the last column, as before.

X = categ_features.apply(lambda col: label_encoders[col.name].transform(col))
print("The shape of this DataFrame is " + str(X.shape))
X.head()

The shape of this DataFrame is (1338, 38)


Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,3,1,3,3,0,4,0,5,2,2,...,4,2,6,1,1,4,4,2,8,4
1,3,1,3,3,0,2,0,24,1,2,...,4,3,6,1,1,4,4,2,8,4
2,3,1,0,3,0,4,0,5,2,2,...,4,2,6,1,1,4,4,2,8,4
3,3,1,0,3,0,0,0,6,2,2,...,4,2,6,5,2,4,4,2,8,0
4,3,1,0,3,0,2,0,15,2,2,...,4,2,6,1,1,4,4,2,8,4


In [6]:
# One-Hot Encoding categorical features using scikit-learn:
# every distinct code in every column of X becomes its own 0/1 column.
onehot = OneHotEncoder()
X2 = onehot.fit(X).transform(X).toarray()

n_unique_total = X.nunique().sum()
n_columns = X.shape[1]
print(f"In the X were {n_unique_total} unique values and {n_columns} columns\n")
print(f"The shape of X2 is {X2.shape}")
X2

In the X were 229 unique values and 38 columns

The shape of X2 is (1338, 229)


array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [7]:
# Label-encode a single column ('LotConfig') and show the code -> category mapping.
# Work on an explicit copy so the assignment below never writes through to `data`.
categ_features = data.select_dtypes('object').copy()
print("Unique values of column 'LotConfig' are: \n" + str(categ_features['LotConfig'].unique()))
label = LabelEncoder()
# fit_transform() fits the encoder itself, so no separate fit() call is needed.
# The resulting array is assigned directly (positionally): wrapping it in a
# pd.Series would give it a fresh 0..n-1 index and the assignment would then
# align on labels, which breaks as soon as `data` has a non-default index.
categ_features['LotConfig'] = label.fit_transform(categ_features['LotConfig'])
print("\nEncoded values: ")
print(dict(enumerate(label.classes_)))
categ_features.head()

Unique values of column 'LotConfig' are: 
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

Encoded values: 
{0: 'Corner', 1: 'CulDSac', 2: 'FR2', 3: 'FR3', 4: 'Inside'}


Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,4,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,4,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,0,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
