In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load the raw train and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [None]:
# Drop the attributes this notebook excludes (described by the author as high-null).
# The same columns are removed, in place, from both the train and the test frame.
for frame in (train_data, test_data):
    frame.drop(
        columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'LandSlope',
                 'Condition1', 'Condition2', 'Exterior1st', 'Exterior2nd'],
        inplace=True,
    )

In [None]:
#Clean out null values. Categorical nulls become the label "Missing"; numerical nulls
#become either the column's most frequent value or 0, as listed below.

# Text columns whose nulls are replaced by an explicit "Missing" category (both frames).
missing_label_cols = ('MSZoning', 'Utilities', 'MasVnrType',
                      'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                      'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
                      'GarageQual', 'GarageCond', 'SaleType')

# Columns filled with their own most frequent value; each frame uses its own mode.
mode_fill_cols = (
    (train_data, ('LotFrontage', 'MasVnrArea', 'Electrical', 'GarageYrBlt')),
    (test_data, ('LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'BsmtUnfSF',
                 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath')),
)

# Numeric columns filled with 0. BsmtFinSF1/BsmtFinSF2 are square-footage numbers in the
# test frame: filling them with the string 'Missing' would turn the whole column into
# mixed text and make it disagree with the numeric train column, so they get 0 instead.
zero_fill_cols = (
    (train_data, ('GarageCars', 'GarageArea')),
    (test_data, ('GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2')),
)

for frame in (train_data, test_data):
    for col in missing_label_cols:
        frame[col] = frame[col].fillna('Missing')

for frame, cols in mode_fill_cols:
    for col in cols:
        # mode() returns the most frequent values sorted; [0] takes the first of them.
        frame[col] = frame[col].fillna(frame[col].mode()[0])

for frame, cols in zero_fill_cols:
    for col in cols:
        frame[col] = frame[col].fillna(0)

In [None]:
#Convert Categorical into numerical 1 - identify categorical columns in test data
from sklearn.preprocessing import LabelEncoder

# Take an independent copy of the object-dtype columns of the test frame and
# display their names as the cell output.
test_object_columns = test_data.select_dtypes(include=['object'])
cat_test_data = test_object_columns.copy()
cat_test_data.columns

In [None]:
#Convert Categorical into numerical 2 - identify categorical values in train data
# Take an independent copy of the object-dtype columns of the train frame and
# display their names as the cell output.
train_object_columns = train_data.select_dtypes(include=['object'])
cat_train_data = train_object_columns.copy()
cat_train_data.columns

In [None]:
#Convert Categorical into numerical 3 - convert test categorical using LabelEncoder
categoricals = ('MSZoning', 'LotFrontage', 'Street', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'Neighborhood', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition')
for i in categoricals:
    labeler = LabelEncoder()
    labeler.fit(list(test_data[i].values))
    test_data[i] = labeler.transform(list(test_data[i].values))
test_data.head()

In [None]:
#Convert Categorical into numerical 4 - convert train categorical using LabelEncoder
categoricals = ('MSZoning', 'LotFrontage', 'Street', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'Neighborhood', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition')
for i in categoricals:
    labeler = LabelEncoder()
    labeler.fit(list(train_data[i].values))
    train_data[i] = labeler.transform(list(train_data[i].values))
train_data.head()

test_data.to_csv('test_data_clean.csv')
train_data.to_csv('train_data_clean.csv')