In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
#from sklearn.experimental import enable_iterative_imputer 
from sklearn.model_selection import train_test_split
#from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import mutual_info_regression
import seaborn as sns
import scipy.stats as stats
import lightgbm as lgb

In [4]:
def fill_missing(df):
    
    
    #94% dont have alleys, might not be super important
    df.Alley = df.Alley.fillna("None")
    
    #Could also fill with 0 - doenst have garage
    df.GarageYrBlt = df.GarageYrBlt.fillna(0)
    
    #get rid
    df.MasVnrArea = df.MasVnrArea.fillna(0)
    df.MasVnrType = df.MasVnrType.fillna("None")
    df.BsmtFinSF1 = df.BsmtFinSF1.fillna(0)
    df.BsmtFinSF2 = df.BsmtFinSF2.fillna(0)
    df.BsmtUnfSF = df.BsmtUnfSF.fillna(0)
    df.TotalBsmtSF = df.TotalBsmtSF.fillna(0)
    df.GarageCars = df.GarageCars.fillna(0)
    df.GarageArea = df.GarageArea.fillna(0)
    df.BsmtFullBath = df.BsmtFullBath.fillna(0)
    df.BsmtHalfBath = df.BsmtHalfBath.fillna(0)
    
    
    #Bunch of non-missing values
    df.BsmtQual = df.BsmtQual.fillna("NA")
    df.BsmtCond = df.BsmtCond.fillna("NA")
    df.BsmtExposure = df.BsmtExposure.fillna("NA")
    df.BsmtFinType1 = df.BsmtFinType1.fillna("NA")
    df.BsmtFinType2 = df.BsmtFinType2.fillna("NA")
    df.FireplaceQu = df.FireplaceQu.fillna("NA")
    df.GarageType = df.GarageType.fillna("NA")
    df.GarageFinish = df.GarageFinish.fillna("NA")
    df.GarageQual = df.GarageQual.fillna("NA")
    df.GarageCond = df.GarageCond.fillna("NA")
    df.PoolQC = df.PoolQC.fillna("NA")
    df.Fence = df.Fence.fillna("NA")
    df.MiscFeature = df.MiscFeature.fillna("NA")
    
    #missing 0.068% and no missing in test so bye
    df.Electrical = df.Electrical.fillna("SBrkr")
    df.Functional = df.Functional.fillna("Typ")
    
    df.MSSubClass = df.MSSubClass.apply(str)
    df.YrSold = df.YrSold.astype(str)
    df.MoSold = df.MoSold.astype(str)
    df.KitchenQual = df.KitchenQual.fillna("TA")
    
    df.Exterior1st = df.Exterior1st.fillna(df.Exterior1st.mode()[0])
    df.Exterior2nd = df.Exterior2nd.fillna(df.Exterior2nd.mode()[0])
    df.SaleType = df.SaleType.fillna(df.SaleType.mode()[0])
    df.Utilities = df.Utilities.fillna(df.Utilities.mode()[0])
    
    df.MSZoning = df.MSZoning.fillna(df.MSZoning.mode()[0])
    df.LotFrontage = df.groupby('Neighborhood')["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    return df


def target_encoder(df, target, labels):
    dic = df.groupby(target)[labels].mean().to_dict()
    
    return dic


In [187]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

Y = train.SalePrice
train_features = train.drop(["SalePrice", "Id"], axis = 1) 
test_features = test.drop(["Id"], axis = 1)

features = pd.concat([train_features, test_features], axis = 0)

features = fill_missing(features)

quantitative = [f for f in features.columns if features.dtypes[f] != 'object']
qualitative = [f for f in features.columns if features.dtypes[f] == 'object']

features["Total_Bathrooms"] = features.BsmtFullBath + 0.5 * features.BsmtHalfBath + features.FullBath + 0.5 * features.HalfBath
features["Has_Pool"] = features.PoolArea.apply(lambda x: 1 if x > 0 else 0)
features["TotalSF"] = features.TotalBsmtSF + features["1stFlrSF"] + features["2ndFlrSF"]
features["Has2ndFlr"] = features["2ndFlrSF"].apply(lambda x: 1 if x > 0 else 0)
features["HasFirePlace"] = features["Fireplaces"].apply(lambda x: 1 if x > 0 else 0)
features["HasBsmt"] = features["TotalBsmtSF"].apply(lambda x: 1 if x > 0 else 0)
features["HasGarage"] = features["GarageArea"].apply(lambda x: 1 if x > 0 else 0)
features["TotalPorchSF"] = features["WoodDeckSF"] + features["OpenPorchSF"] + features["EnclosedPorch"] + features['3SsnPorch'] + features["ScreenPorch"]


scaler = StandardScaler()

neigh_dict = target_encoder(train, "Neighborhood", "SalePrice")
features.Neighborhood = features.Neighborhood.map(neigh_dict)
quantitative.append("Neighborhood")
qualitative.remove("Neighborhood")

for col in quantitative: 
    target = features[col].to_numpy().reshape(-1, 1)
    target = scaler.fit_transform(target)
    features[col] = target
    
#seriously dumbfuck dummy encoding
for col in qualitative:
    dums = pd.get_dummies(features[col], prefix = str(col), drop_first = False)
    features = features.drop([col], axis = 1)
    features = pd.concat([features, dums], axis = 1)    
    
Y = Y.to_numpy().reshape(-1, 1)
Y = np.log1p(Y)


X = features[:len(train)]
X_test = features[len(train):]

X_train, X_dev, y_train, y_dev = train_test_split(X, Y, test_size = 0.05, random_state = 30)

In [169]:
X_test.Neighborhood

0      -0.578121
1      -0.578121
2       0.218723
3       0.218723
4       2.212966
          ...   
1454   -1.379425
1455   -1.379425
1456   -0.401435
1457   -0.401435
1458   -0.401435
Name: Neighborhood, Length: 1459, dtype: float64

In [196]:
train_dataset = lgb.Dataset(X_train, label = y_train)
dev_dataset = lgb.Dataset(X_dev, label = y_dev, reference = train_dataset)

param = {'num_leaves': 30, 
         'learning_rate':0.1, 
         'max_depth':-1, 
         'min_data_in_leaf':20, 
         'objective':'regression',
        }

param['metric'] = 'rmse'

num_round = 5000
bst = lgb.train(param, train_dataset, num_round, 
                valid_sets = [train_dataset, dev_dataset], 
                valid_names = ['train', 'dev'],
                verbose_eval = 500
               )


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3897
[LightGBM] [Info] Number of data points in the train set: 1387, number of used features: 213
[LightGBM] [Info] Start training from score 12.025817
[500]	train's rmse: 0.00592034	dev's rmse: 0.105211
[1000]	train's rmse: 0.00109788	dev's rmse: 0.10482
[1500]	train's rmse: 0.000372421	dev's rmse: 0.104825
[2000]	train's rmse: 0.000159713	dev's rmse: 0.104824
[2500]	train's rmse: 7.73984e-05	dev's rmse: 0.10482
[3000]	train's rmse: 3.91487e-05	dev's rmse: 0.104819
[3500]	train's rmse: 2.03742e-05	dev's rmse: 0.104818
[4000]	train's rmse: 1.09107e-05	dev's rmse: 0.104818
[4500]	train's rmse: 5.90051e-06	dev's rmse: 0.104818
[5000]	train's rmse: 3.20812e-06	dev's rmse: 0.104818


In [197]:
def write_to_csv(filename, predictions, ID):
    
    ID = ID.to_numpy()

    new_df = pd.DataFrame({"Id": ID, "SalePrice":predictions}, index = [0] * len(ID))
    
    new_df.to_csv(filename + ".csv", index = False)
    
ID = test.Id
predictions = np.expm1(bst.predict(X_test))
write_to_csv('sub', predictions, test.Id)

In [125]:
def rmslog(y, y_pred):
    m = len(y)
    mse = np.sum((np.log(y + 1) - np.log(y_pred + 1)) ** 2) / m
    print(np.sqrt(mse))

In [126]:
rmslog(np.expm1(y_train), np.expm1(bst.predict(X_train)))
rmslog(np.expm1(y_dev), np.expm1(bst.predict(X_dev)))

19.244151309908396
4.141858730295136
