The following code performs feature engineering on the data. The feature engineering includes:
- Naive feature engineering to compute sums, averages and counts of some features
- Extracting keywords from features and mapping these keywords to binary values (0, 1)
- Special designation for building_id, manager_id and display_address values with only 1 observation: these are changed to -1
- High-cardinality categorical encoding
- Factorizing building_id, display_address, manager_id and street_address

The core code comes from rakhlin's kernel "another Python version of It is lit by Branden":
  https://www.kaggle.com/rakhlin/another-python-version-of-it-is-lit-by-branden

### Import Modules

In [135]:
%matplotlib inline
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import product
from sklearn.model_selection import StratifiedKFold

### Define Functions

In [136]:
def add_features(df):
    """Add naive count, ratio and difference features.

    The new columns are written onto ``df`` itself. The returned frame is a
    cleaned version in which NaN and +inf (produced by dividing by zero
    bedrooms/bathrooms) are replaced by -1.
    """
    def _normalise(text):
        # Drop non-breaking spaces, trim both ends, lower-case.
        return text.replace("\u00a0", "").strip().lower()

    df["photo_count"] = df["photos"].map(len)

    for address_col in ("street_address", "display_address"):
        df[address_col] = df[address_col].map(_normalise)
    df["desc_wordcount"] = df["description"].map(lambda text: len(str.split(text)))

    price = df["price"]
    beds = df["bedrooms"]
    baths = df["bathrooms"]
    rooms = beds + baths

    # Price ratios.
    df["pricePerBed"] = price / beds
    df["pricePerBath"] = price / baths
    df["pricePerRoom"] = price / rooms

    # Bedroom / bathroom relationships.
    df["bedPerBath"] = beds / baths
    df["bedBathDiff"] = beds - baths
    df["bedBathSum"] = rooms
    df["bedsPerc"] = beds / rooms

    # FE_0 to FE_6 don't process NAN, so use -1 as the sentinel value.
    without_nan = df.fillna(-1)
    return without_nan.replace(np.inf, -1)

In [137]:
def factorize(df1, df2, column):
    """Replace ``column`` in both frames with integer codes from one shared encoding.

    The values of ``column`` from ``df1`` and ``df2`` are stacked and
    factorized together, so the same original value gets the same code
    (0, 1, 2, ... in order of first appearance) in both frames.

    Args:
        df1: First frame; its ``column`` is overwritten in place.
        df2: Second frame; its ``column`` is overwritten in place.
        column: Name of the column to encode.

    Returns:
        The same ``(df1, df2)`` objects that were passed in.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    combined = pd.concat([df1[column], df2[column]])
    codes = combined.factorize()[0]

    # The first len(df1) codes belong to df1, the remainder to df2.
    split_at = len(df1)
    df1[column] = codes[:split_at]
    df2[column] = codes[split_at:]
    return df1, df2

In [138]:
def designate_single_observations(df1, df2, column):
    """Replace values of ``column`` that occur only once with -1.

    Occurrences are counted over ``df1`` and ``df2`` combined, so a value
    seen once in each frame is kept. Missing values are never counted and
    are left untouched.

    Args:
        df1: First frame; modified in place.
        df2: Second frame; modified in place.
        column: Name of the column to process.

    Returns:
        The same ``(df1, df2)`` objects that were passed in.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    combined = pd.concat([df1[column], df2[column]])
    # value_counts + map does not depend on how groupby().size().to_frame()
    # labels its single column, which the original "size" rename relied on.
    counts = combined.value_counts(sort=False)

    for frame in (df1, df2):
        is_single = frame[column].map(counts) <= 1
        frame.loc[is_single, column] = -1
    return df1, df2

In [139]:
def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """Write a high-cardinality categorical encoding of ``variable`` into a frame.

    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca.

    For every category of ``variable`` seen in ``train_df`` the encoded value is
    ``lambda * mean(target) + (1 - lambda) * prior_prob`` with
    ``lambda = 1 / (g + exp((k - n) / f))``, where ``n`` is the number of
    training rows in that category. Rows of ``test_df`` whose category does not
    appear in ``train_df`` receive ``prior_prob``.

    Args:
        train_df: Frame used to compute the per-category size and target mean.
        test_df: Frame whose rows are encoded.
        variable: Name of the categorical column.
        target: Name of the numeric target column in ``train_df``.
        prior_prob: Value the encoding is shrunk towards.
        k, f, g: Parameters of the ``lambda`` weight above.
        r_k: If truthy, each encoded value is multiplied by a random factor
            drawn uniformly from ``[1 - r_k, 1 + r_k]`` (not in the paper).
        update_df: Frame that receives the column ``hcc_<variable>_<target>``;
            defaults to ``test_df``. Values are aligned on the index, so rows
            of ``update_df`` that are not in ``test_df`` keep their old value.

    Returns:
        None. The result is written into ``update_df`` in place.
    """
    hcc_name = "_".join(["hcc", variable, target])

    # A list of function names replaces the dict-renaming form of
    # SeriesGroupBy.agg, which pandas 1.0+ rejects. Groups are left unsorted:
    # the keys are only used for the join below, and they may mix strings with
    # the -1 sentinel written by designate_single_observations.
    stats = train_df.groupby(variable, sort=False)[target].agg(["size", "mean"])
    weight = 1 / (g + np.exp((k - stats["size"]) / f))
    stats[hcc_name] = weight * stats["mean"] + (1 - weight) * prior_prob

    # Look up each test row's category; unseen categories fall back to the prior.
    encoded = test_df[[variable]].join(stats[[hcc_name]], on=variable, how="left")[hcc_name]
    encoded = encoded.fillna(prior_prob)
    if r_k:
        # Add uniform noise. Not mentioned in original paper
        encoded = encoded * np.random.uniform(1 - r_k, 1 + r_k, len(test_df))

    if update_df is None:
        update_df = test_df
    if hcc_name not in update_df.columns:
        update_df[hcc_name] = np.nan
    update_df.update(encoded)
    return

In [140]:
def create_binary_features(df):
    """Add one 0/1 column per keyword group found in the ``features`` lists.

    Each row's ``features`` list is joined with spaces and lower-cased. For
    every group below, the column ``feature_<group>`` is 1 when any of the
    group's keywords occurs as a substring of that text, otherwise 0.
    The columns are added to ``df`` in place and ``df`` is returned.
    """
    keyword_groups = {
        "dogs": ("dogs", "dog"),
        "cats": ("cats",),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included")
    }

    joined = df["features"].apply(lambda items: " ".join(items).lower())

    for group, needles in keyword_groups.items():
        # Bind this group's keywords as a default so each pass uses its own tuple.
        def contains_any(text, needles=needles):
            for needle in needles:
                if needle in text:
                    return 1
            return 0

        df["feature_" + group] = joined.apply(contains_any)

    return df

### Load Data

Here both "listing_id" and "created" reflect the order in which the posts were created.

In [141]:
# Load the training and test sets from JSON files in the working directory.
X_train = pd.read_json("train.json")
X_test = pd.read_json("test.json")

In [142]:
# Make target integer, one hot encoded, calculate target priors:
#   - map interest_level to integers (low=0, medium=1, high=2);
#   - append one 0/1 indicator column per class (pred_0, pred_1, pred_2);
#   - take the mean of each indicator column as that class's prior.
X_train = X_train.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})
X_train = X_train.join(pd.get_dummies(X_train["interest_level"], prefix="pred").astype(int))
prior_0, prior_1, prior_2 = X_train[["pred_0", "pred_1", "pred_2"]].mean()

### Feature Engineering

In [143]:
# Add common features
# The names are rebound because add_features returns the frame produced by
# its final fillna/replace step rather than the object passed in.
X_train = add_features(X_train)
X_test = add_features(X_test)

In [144]:
# Special designation for building_ids, manager_ids, display_address with only 1 observation, change into -1
# Occurrences are counted over train and test together, one column at a time.
for col in ('building_id', 'manager_id', 'display_address'):
    X_train, X_test = designate_single_observations(X_train, X_test, col)

In [145]:
# High-Cardinality Categorical encoding
# One encoded column is produced per (variable, target) pair, i.e.
# building_id/manager_id x pred_1/pred_2, each with its own prior.
skf = StratifiedKFold(5)
attributes = product(("building_id", "manager_id"), zip(("pred_1", "pred_2"), (prior_1, prior_2)))
for variable, (target, prior) in attributes:
    # Test set: statistics come from the whole training set, no noise.
    hcc_encode(X_train, X_test, variable, target, prior, k=5, r_k=None)
    # Training set: each fold's rows are encoded from statistics computed on
    # the remaining folds, with multiplicative noise (r_k=0.01), and written
    # back into X_train.
    for train, test in skf.split(np.zeros(len(X_train)), X_train['interest_level']):
        hcc_encode(X_train.iloc[train], X_train.iloc[test], variable, target, prior, k=5, r_k=0.01, update_df=X_train)

In [146]:
# Factorize building_id, display_address, manager_id, street_address
# NOTE(review): building_id is factorized here but dropped again in the
# export step below.
for col in ('building_id', 'display_address', 'manager_id', 'street_address'):
    # encode col into numerical values starting from 0, 1, 2,....
    # train and test share one encoding, so equal values get equal codes
    X_train, X_test = factorize(X_train, X_test, col)

In [147]:
# Create binarized features: one 0/1 "feature_<keyword group>" column per
# keyword group, derived from each row's "features" list.
X_train = create_binary_features(X_train)
X_test = create_binary_features(X_test)

### Export Data

In [148]:
# Drop the list/text source columns, building_id and created from both sets;
# the training set also loses its one-hot target helper columns.
X_train = X_train.drop(
    columns=["photos", "pred_0", "pred_1", "pred_2", "description", "building_id", "features", "created"]
)
X_test = X_test.drop(
    columns=["photos", "description", "building_id", "features", "created"]
)

In [149]:
# Write the engineered training and test sets to JSON files in the working directory.
X_train.to_json('FE6_train.json')
X_test.to_json('FE6_test.json')