In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
## Preprocess kaggle testing data

# 1. read the raw test split from disk
raw_data = pd.read_csv("./housing_data/test.csv")

# 2. keep a fixed subset of feature columns (nothing is dropped for NaN here;
#    missing values are handled by the later steps)
filtered_features = [
    "MSZoning", "LotFrontage", "LotArea", "Street", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope", "BldgType", "HouseStyle", "OverallQual",
    "OverallCond", "YearBuilt", "YearRemodAdd", "MasVnrArea", "ExterQual", "ExterCond",
    "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtUnfSF", "TotalBsmtSF",
    "Heating", "HeatingQC", "CentralAir", "1stFlrSF", "2ndFlrSF", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "KitchenAbvGr", "KitchenQual",
    "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageCars", "GarageArea",
    "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", "MiscVal",
    "YrSold", "SaleType", "SaleCondition",
]
# column-label selection; raises KeyError if any listed feature is absent from the CSV
filtered_data = raw_data.loc[:, filtered_features]

# 3. encoding ordinal categorical variables
ordinal_columns = ["Utilities", "LandSlope", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure",
                   "HeatingQC", "KitchenQual", "Functional", "FireplaceQu", "GarageQual", "GarageCond", "PavedDrive", 
                   "PoolQC", "Fence"]

# deal with meaningful NaN: in the ordinal columns a missing value is replaced by an
# explicit placeholder category instead of being imputed.
# (.copy() detaches the frame from raw_data so the assignments below cannot touch it.)
filtered_data = filtered_data.copy()
meaningful_nan = 'not_applicable'
filtered_data[ordinal_columns] = filtered_data[ordinal_columns].fillna(meaningful_nan)

# Category order per column, lowest code first (position in the list == encoded value).
# Columns whose list omits the placeholder are expected to contain no missing values;
# the validation below reports any violation by column name.
ordinal_order = {
    "Utilities": [meaningful_nan, "ELO", "NoSeWa", "NoSewr", "AllPub"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "ExterQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtQual": ["Po", "Fa", meaningful_nan, "TA", "Gd", "Ex"],
    "BsmtCond": ["Po", "Fa", meaningful_nan, "TA", "Gd", "Ex"],
    "BsmtExposure": ["No", "Mn", meaningful_nan, "Av", "Gd"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Po", "Fa", "TA", meaningful_nan, "Gd", "Ex"],
    "Functional": [meaningful_nan, "Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    "FireplaceQu": ["Po", "Fa", meaningful_nan, "TA", "Gd", "Ex"],
    "GarageQual": ["Po", "Fa", meaningful_nan, "TA", "Gd", "Ex"],
    "GarageCond": ["Po", "Fa", meaningful_nan, "TA", "Gd", "Ex"],
    "PavedDrive": ["N", "P", "Y"],
    "PoolQC": ["Fa", "TA", meaningful_nan, "Gd", "Ex"],
    "Fence": ["MnWw", "MnPrv", meaningful_nan, "GdWo", "GdPrv"]
}

# Validate up front so a bad value is reported with its column name
# (this replaces the per-column debug print the loop used to emit).
for col in ordinal_columns:
    unexpected_values = set(filtered_data[col].unique()) - set(ordinal_order[col])
    if unexpected_values:
        raise ValueError(
            f"Column {col!r} contains categories missing from ordinal_order: "
            f"{sorted(map(str, unexpected_values))}"
        )

# ordinal encoding: one encoder handles every column; `categories` is given in the
# same order as `ordinal_columns`.
ordinal_encoder = OrdinalEncoder(categories=[ordinal_order[col] for col in ordinal_columns])
ordinal_data = ordinal_encoder.fit_transform(filtered_data[ordinal_columns])
# Replace the columns outright (not `.loc[:, col] = ...`) so they take the encoder's
# float dtype instead of remaining object-typed.
filtered_data[ordinal_columns] = ordinal_data
    
# 4. encoding nominal categorical variable
nominal_columns = [
    "MSZoning", "Street", "LotShape", "LandContour", "LotConfig", "BldgType",
    "HouseStyle", "Foundation", "Heating", "CentralAir", "SaleType", "SaleCondition",
]

# one-hot encode the nominal columns (one indicator column per observed category)
nominal_data = pd.get_dummies(filtered_data[nominal_columns])

# swap the original nominal columns for their indicator columns:
# everything else stays on the left, the indicators are appended on the right
non_nominal_data = filtered_data.drop(columns=nominal_columns)
encoded_data = pd.concat([non_nominal_data, nominal_data], axis=1)

# 5. feature engineering
# Express construction / remodel years relative to the sale year, then discard
# the three absolute year columns they were derived from.
encoded_data = (
    encoded_data
    .assign(
        year_since_built=lambda frame: frame["YrSold"] - frame["YearBuilt"],
        year_since_remod=lambda frame: frame["YrSold"] - frame["YearRemodAdd"],
    )
    .drop(columns=["YrSold", "YearBuilt", "YearRemodAdd"])
)

# 6. normalization
# Standardize every column and rebuild a DataFrame carrying the original column labels
# (the transform returns a bare array, so labels have to be re-attached).
# NOTE(review): the scaler is fit on this frame itself; if the training features were
# standardized with their own scaler, the two scalings differ — confirm this is intended.
scaler = StandardScaler()
scaler.fit(encoded_data)
encoded_data_scaled = pd.DataFrame(
    scaler.transform(encoded_data),
    columns=encoded_data.columns,
)

# 7. dealing with NaN for numeric variables
numerical_columns = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                     '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
                     'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', "year_since_built", "year_since_remod"]

# Mean-impute each numeric column. The filled columns are assigned back to the frame:
# `frame[col].fillna(..., inplace=True)` operates on a temporary Series, which pandas
# warns about and which stops updating the frame once Copy-on-Write is enabled.
numerical_means = encoded_data_scaled[numerical_columns].mean()
encoded_data_scaled[numerical_columns] = encoded_data_scaled[numerical_columns].fillna(numerical_means)

# Make the null check effective (a bare expression in the middle of a cell is discarded).
# It can only fail for a column with no observed values at all, whose mean is NaN.
remaining_nulls = encoded_data_scaled[numerical_columns].isnull().sum()
assert remaining_nulls.sum() == 0, (
    f"numeric columns still contain NaN after imputation: "
    f"{remaining_nulls[remaining_nulls > 0].to_dict()}"
)

# 8. align the columns with the training design matrix
# Indicator columns that were previously added by hand before selecting X_train.columns —
# presumably categories seen only in the training split; verify against X_train.
train_only_dummies = ["Heating_OthW", "Heating_Floor", "HouseStyle_2.5Fin"]
for col in train_only_dummies:
    # Scalar 0 broadcasts to every row, so no hard-coded row count is needed; the guard
    # keeps a real indicator column from being overwritten if the category does occur here.
    if col not in encoded_data_scaled.columns:
        encoded_data_scaled[col] = 0

# NOTE(review): X_train is not defined in the visible cells; it must already exist in the
# kernel. Selecting by its columns fixes both the column set and the column order, and
# raises KeyError if any other training column is missing here.
encoded_data_scaled = encoded_data_scaled[X_train.columns]

In [None]:
## model goes here
import lightgbm as lgb

# Hyper-parameters gathered in one mapping so they can be read and logged in one place.
lgbm_best_params = {
    "objective": 'regression',
    "learning_rate": 0.01,
    "max_bin": 200,
    "bagging_fraction": 0.8,
    "bagging_freq": 4,
    "bagging_seed": 8,
    "feature_fraction": 0.2,
    "feature_fraction_seed": 8,
    "verbose": -1,
    "random_state": 42,
    "min_sum_hessian_in_leaf": 3,
    "n_estimators": 5000,
    "num_leaves": 6,
}
lgbm_reg_best = lgb.LGBMRegressor(**lgbm_best_params)

# NOTE(review): X_train / y_train are not defined in the visible cells; they must
# already exist in the kernel when this cell runs.
lgbm_reg_best.fit(X_train, y_train)

In [None]:
# Predict on the aligned test features, pair each prediction with its row's Id,
# and write the two-column submission file without the index.
test_predictions = lgbm_reg_best.predict(encoded_data_scaled)
submission_dict = {
    "Id": raw_data["Id"],
    "SalePrice": test_predictions,
}
submission_df = pd.DataFrame(data=submission_dict)
submission_df.to_csv("lgbm_v1.csv", index=False)