<a href="https://www.kaggle.com/code/obinna11/housing-price-feature-selection-and-engineering?scriptVersionId=99122959" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import lightgbm as lgbm

In [2]:
def read_file(train_filePath, test_filePath):
    """Load the train and test CSVs and stack their feature columns.

    Parameters
    ----------
    train_filePath : str or path-like
        CSV holding the feature columns plus a ``SalePrice`` column.
    test_filePath : str or path-like
        CSV holding an ``Id`` column and the feature columns.

    Returns
    -------
    Id : pandas.Series
        The ``Id`` column of the test file.
    y : pandas.Series
        The ``SalePrice`` column of the train file.
    allHousing_data : pandas.DataFrame
        Train rows (without ``SalePrice``) followed by the test rows,
        labelled with a fresh 0..n-1 index.
    """
    train = pd.read_csv(train_filePath)
    test = pd.read_csv(test_filePath)

    Id = test["Id"]
    y = train["SalePrice"]
    features = train.drop(columns="SalePrice")

    # Both files are read with their own 0-based index; without ignore_index
    # the stacked frame would carry every label twice, which makes any later
    # label-based lookup or alignment ambiguous.
    allHousing_data = pd.concat([features, test], ignore_index=True)

    return Id, y, allHousing_data

In [3]:
# Competition input files as mounted inside the Kaggle kernel.
train_filePath = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
test_filePath = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
Id, y, allHousing_data = read_file(train_filePath, test_filePath)

# Quick overview of the combined frame: dimensions, first rows,
# per-column missing-value counts and the numeric summary statistics.
for overview in (
    allHousing_data.shape,
    allHousing_data.head(),
    allHousing_data.isnull().sum(),
    allHousing_data.describe(),
):
    print(overview)

(2919, 80)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC Fence MiscFeature  \
0         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
1         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
2         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
3         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
4         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  
0       0      2    200

In [4]:
#data cleaning
def data_cleaning(allHousing_data):
    """Fill the missing values of the housing frame, column group by column group.

    The frame is modified in place and also returned, so both
    ``data_cleaning(df)`` and ``df = data_cleaning(df)`` work.

    Filling rules:
      * listed categorical/count columns -> most frequent value of the column
      * listed quality/type columns      -> the literal string "None"
      * listed area/size columns         -> column median
      * ``GarageYrBlt``                  -> the row's ``YearRemodAdd``

    Parameters
    ----------
    allHousing_data : pandas.DataFrame
        Frame containing every column named below.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the listed columns filled.
    """
    # missing values with Mode
    fill_with_mode = ["MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical",
                      "KitchenQual", "Functional", "SaleType", "BsmtFullBath", "BsmtHalfBath", "GarageCars"]
    for col in fill_with_mode:
        modes = allHousing_data[col].mode()
        # mode() is empty when the whole column is missing; nothing to fill with then.
        if modes.empty:
            continue
        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # the latter is chained assignment, which pandas deprecates and which leaves
        # the frame untouched once Copy-on-Write is active.
        allHousing_data[col] = allHousing_data[col].fillna(modes.iloc[0])

    # missing values with None
    fill_with_none = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                      "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC",
                      "Fence", "MiscFeature"]
    allHousing_data[fill_with_none] = allHousing_data[fill_with_none].fillna("None")

    # missing values with median
    fill_with_median = ["LotFrontage", "MasVnrArea", "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF2", "BsmtFinSF1",
                        "GarageArea"]
    for col in fill_with_median:
        allHousing_data[col] = allHousing_data[col].fillna(allHousing_data[col].median())

    # missing garage build years fall back to the year of the last remodel
    allHousing_data["GarageYrBlt"] = allHousing_data["GarageYrBlt"].fillna(allHousing_data["YearRemodAdd"])

    return allHousing_data

In [5]:
# Fill the missing values of the combined train+test frame; the name is
# rebound to the frame that data_cleaning returns.
allHousing_data = data_cleaning(allHousing_data)

In [6]:
#feature eng
def feature_eng(allHousing_data):
    """Add age features, log-scale skewed columns and encode text columns.

    The frame is modified in place and also returned.

    Steps:
      1. ``House_age`` = ``YrSold`` - ``YearBuilt`` and
         ``Rmod_age``  = ``YrSold`` - ``YearRemodAdd``.
      2. Every listed size/age column whose skewness is >= 1 in absolute
         value is replaced by ``log10(value + 1)``.
      3. Every ``object``-dtype column is replaced by its ordinal codes.

    Parameters
    ----------
    allHousing_data : pandas.DataFrame
        Frame containing ``YrSold``, ``YearBuilt``, ``YearRemodAdd`` and the
        size columns listed below.

    Returns
    -------
    pandas.DataFrame
        The same frame object, transformed.
    """
    allHousing_data["House_age"] = allHousing_data["YrSold"] - allHousing_data["YearBuilt"]
    allHousing_data["Rmod_age"] = allHousing_data["YrSold"] - allHousing_data["YearRemodAdd"]

    skewness = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "1stFlrSF",
                "2ndFlrSF", "GrLivArea", "House_age", "Rmod_age", "GarageArea", "WoodDeckSF", "OpenPorchSF",
                "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "TotalBsmtSF", "LowQualFinSF"]
    for col in skewness:
        # One skew computation per column; abs() covers both tails.
        if abs(allHousing_data[col].skew()) >= 1:
            # The age columns are differences and can be negative (sale year
            # earlier than the build/remodel year). log10(x + 1) is -inf at
            # x == -1 and NaN below it, so negatives are floored at 0 first.
            allHousing_data[col] = np.log10(allHousing_data[col].clip(lower=0) + 1)

    convert_val = list(allHousing_data.select_dtypes(include="object").columns.values)
    # Skip the encoder when no text column is left: fitting it on a
    # zero-column frame is rejected by scikit-learn.
    if convert_val:
        ordinal_encoder = OrdinalEncoder()
        allHousing_data[convert_val] = ordinal_encoder.fit_transform(allHousing_data[convert_val])

    return allHousing_data

In [7]:
# Add the age features, log-scale the skewed columns and ordinal-encode the
# text columns. feature_eng transforms the frame it receives, so running this
# cell a second time would apply the transforms to already-transformed data.
allHousing_data = feature_eng(allHousing_data)

In [8]:
#feature selection
def feature_selection(X_train, y_train, X_test, k=60):
    """Keep the ``k`` columns that score best under a univariate F-test.

    The selector is fitted on the training data only and then applied to
    both frames, so train and test end up with the same columns.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target, one value per row of ``X_train``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X_train``.
    k : int or "all", default 60
        Number of columns to keep. Values larger than the number of
        available columns are reduced to that number.

    Returns
    -------
    X_train_fs, X_test_fs : pandas.DataFrame
        The selected columns of each input, with the original column names
        and the original row labels.
    """
    # Asking for more columns than exist is an error (or a warning) in
    # scikit-learn depending on version; clamp so narrow frames just keep all.
    n_keep = k if k == "all" else min(k, X_train.shape[1])

    # score every column against the target on the training data only
    fs = SelectKBest(score_func=f_regression, k=n_keep)
    fs.fit(X_train, y_train)

    # names of the columns that survived the selection
    new_features = X_train.columns[fs.get_support()]

    # transform() returns bare arrays; restore column names and row labels
    X_train_fs = pd.DataFrame(fs.transform(X_train), columns=new_features, index=X_train.index)
    X_test_fs = pd.DataFrame(fs.transform(X_test), columns=new_features, index=X_test.index)

    return X_train_fs, X_test_fs

In [9]:
# read_file stacked the training rows first and the test rows after them, and
# y holds one SalePrice per training row, so len(y) is the split point. Using
# it instead of a hard-coded row count keeps the split right if the input
# files change.
n_train = len(y)
train = allHousing_data.iloc[:n_train, :]
test = allHousing_data.iloc[n_train:, :]

train, test = feature_selection(train, y, test)

In [10]:
# Fit a LightGBM regressor with a fixed seed on the selected training
# features, then predict a sale price for every test row.
model = lgbm.LGBMRegressor(random_state=1).fit(train, y)
preds = model.predict(test)

# Pair each test Id with its prediction and write the submission file.
output = pd.DataFrame({"Id": Id, "SalePrice": preds})
output.to_csv("submission.csv", index=False)