In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


def score_dataset(X, y, model=None):
    """Return the cross-validated RMSLE of ``model`` on ``(X, y)``.

    Columns of dtype ``category`` or ``object`` are label-encoded with
    ``Series.factorize`` before scoring. The encoding is applied to a copy of
    ``X``, so the frame passed in by the caller is left unmodified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must support ``copy`` and ``select_dtypes``.
    y : array-like
        Target values, forwarded unchanged to ``cross_val_score``.
    model : estimator, optional
        Estimator forwarded to ``cross_val_score``. When omitted, a new
        ``XGBRegressor()`` is created for this call.

    Returns
    -------
    float
        Square root of the negated mean of the ``cv=5``
        ``neg_mean_squared_log_error`` scores, i.e. the RMSLE.
    """
    # Build the default estimator per call instead of once at definition time,
    # so no estimator instance is shared between calls.
    if model is None:
        model = XGBRegressor()

    # Work on a copy: the label encoding below overwrites columns, and the
    # caller's frame must not change as a side effect of scoring.
    X = X.copy()

    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()

    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    # The scorer returns negated errors; flip the sign before the square root.
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

In [2]:
# Load the Ames housing data, then separate the features from the target.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(
    "C:/Users/HP/Desktop/Kaggle Assignment/Feature Engineering/mutual info/ames.csv"
)
df = pd.DataFrame(data)

# X is a copy, so popping the target from it leaves SalePrice in `df`.
X = df.copy()
y = X.pop("SalePrice")

In [3]:
# 1) Create Mathematical Transforms
# Three arithmetic features derived from existing columns:
#   LivLotRatio: GrLivArea divided by LotArea
#   Spaciousness: (FirstFlrSF + SecondFlrSF) divided by TotRmsAbvGrd
#   TotalOutsideSF: WoodDeckSF + OpenPorchSF + EnclosedPorch
#                   + Threeseasonporch + ScreenPorch
X_1 = pd.DataFrame(
    {
        "LivLotRatio": df["GrLivArea"] / df["LotArea"],
        "Spaciousness": (df["FirstFlrSF"] + df["SecondFlrSF"]) / df["TotRmsAbvGrd"],
        "TotalOutsideSF": (
            df["WoodDeckSF"]
            + df["OpenPorchSF"]
            + df["EnclosedPorch"]
            + df["Threeseasonporch"]
            + df["ScreenPorch"]
        ),
    }
)

In [5]:
# PorchTypes: for each row, the number of the listed outdoor-area columns
# whose value is greater than 0.
X_3 = pd.DataFrame(
    {
        "PorchTypes": (
            df[
                [
                    "WoodDeckSF",
                    "OpenPorchSF",
                    "EnclosedPorch",
                    "Threeseasonporch",
                    "ScreenPorch",
                ]
            ]
            > 0
        ).sum(axis=1)
    }
)

In [6]:
# 4) Break Down a Categorical Feature
# MSSubClass describes the type of a dwelling. The line below lists its
# distinct values; because it is not the last expression in the cell, the
# notebook does not display the result.
df["MSSubClass"].unique()

# The first word of each category gives a more general (rough) grouping.
# MSClass keeps only that first word: split MSSubClass once, at the first
# underscore (n=1), and take the piece before it.
X_4 = pd.DataFrame(
    {"MSClass": df["MSSubClass"].str.split("_", n=1, expand=True)[0]}
)

In [7]:
# 5) Use a Grouped Transform
# A home's value often depends on how it compares with typical homes nearby.
# MedNhbdArea holds, for every row, the median GrLivArea of that row's
# Neighborhood (transform broadcasts the group median back to each row).
X_5 = pd.DataFrame(
    {"MedNhbdArea": df.groupby("Neighborhood")["GrLivArea"].transform("median")}
)

# Attach every engineered feature set to the original features, then score
# the model on the combined frame. The score is the cell's last expression,
# so the notebook displays it.
X_new = X.join(other=[X_1, X_3, X_4, X_5])
score_dataset(X_new, y)

0.1421271162458208