CV statistics adapted from https://www.kaggle.com/guoday/cv-statistics-better-parameters-and-explaination/notebook

- This code uses five-fold cross validation statistics on 'manager_id' and used XGBoost algorithm to make predictions
- Similar rules can be applied to other features, especially building_id.
- More feature engineering needs to be done for better prediction results.

In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### Import data.

In [2]:
train_df=pd.read_json('train.json')
test_df=pd.read_json('test.json')

### Feature Engineering

Added five features: 
- price_t: average price per bedroom
- room_sum: sum of bedrooms and bathrooms
- num_photos: number of photos in each entry
- num_features: number of features in each entry
- num_description_words: number of words in description

In [3]:
# average price per bedrooms
train_df["price_t"] = train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

# sum of bedrooms and bathrooms
train_df["room_sum"] = train_df["bedrooms"] + train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"] + test_df["bathrooms"] 

# count number of photos
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features"
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

features_to_use = ["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","num_photos", "num_features",
                 "num_description_words","listing_id"]

Define XGB function
- default test_y is None
- param is a dictionary
- data is imported into XGB function as a DMatrix

In [4]:
def runXGB(train_X, train_y, test_X, test_y = None, feature_names = None, seed_val = 0, num_rounds = 1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label = train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label = test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

Engineering manager id.

In [5]:
index = list(range(train_df.shape[0]))
random.shuffle(index)

# initialize a, b, c as NAN list because Xgb could handle NaN,
a = [np.nan]*len(train_df)
b = [np.nan]*len(train_df)
c = [np.nan]*len(train_df)

# five-fold cross validation for train_df
# split the train_df into five folds, 4 folds set as train, 1 fold set as test
# add three columns to all five folds
for i in range(5):
    building_level = {}
    
    # building_level = {manager_id: [0, 0, 0]}
    for j in train_df['manager_id'].values:
        building_level[j] = [0, 0, 0]
    # test index is 1/5 of the train_df index
    test_index = index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    # train_index is 4/5 of the train_df index, = index - test_index
    train_index = list(set(index).difference(test_index))
    
    # update building_level to be {manager_id: [low interest count, medium interest count, high interest count]} for train_index
    for j in train_index:
        temp = train_df.iloc[j]
        if temp['interest_level'] == 'low':
            building_level[temp['manager_id']][0] += 1
        if temp['interest_level'] == 'medium':
            building_level[temp['manager_id']][1] += 1
        if temp['interest_level'] == 'high':
            building_level[temp['manager_id']][2] += 1
    
    # update building_level to be {manager_id: [low interest count%, medium interest count%, high interest count%]} for test_index
    for j in test_index:
        temp = train_df.iloc[j]
        if sum(building_level[temp['manager_id']]) != 0: # if building_level has been updated yet
            # calculate percentage of interest for each manager_id
            a[j] = building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j] = building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j] = building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])


# update manager_level_low, manager_level_medium, manager_level_high for train_df (sum = 1)
train_df['manager_level_low'] = a
train_df['manager_level_medium'] = b
train_df['manager_level_high'] = c

a = []
b = []
c = []
building_level = {}

for j in train_df['manager_id'].values:
    building_level[j] = [0,0,0]

# update building_level = {manager_id: low, medium, high} for all train data
for j in range(train_df.shape[0]):
    temp = train_df.iloc[j]
    if temp['interest_level'] == 'low':
        building_level[temp['manager_id']][0] += 1
    if temp['interest_level'] == 'medium':
        building_level[temp['manager_id']][1] += 1
    if temp['interest_level'] == 'high':
        building_level[temp['manager_id']][2] += 1

# update building_level = {manager_id: low%, medium%, high%} for all test data
for i in test_df['manager_id'].values:
    # if the building_id is new in test_df (not shown in train_df), add NAN
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))

# update manager_level_low, manager_level_medium, manager_level_high for test_df (sum = 1)
test_df['manager_level_low'] = a
test_df['manager_level_medium'] = b
test_df['manager_level_high'] = c

# added feature names to features_to_use list for future use
features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74659 entries, 0 to 99999
Data columns (total 22 columns):
bathrooms                74659 non-null float64
bedrooms                 74659 non-null int64
building_id              74659 non-null object
created                  74659 non-null object
description              74659 non-null object
display_address          74659 non-null object
features                 74659 non-null object
latitude                 74659 non-null float64
listing_id               74659 non-null int64
longitude                74659 non-null float64
manager_id               74659 non-null object
photos                   74659 non-null object
price                    74659 non-null int64
street_address           74659 non-null object
price_t                  74659 non-null float64
room_sum                 74659 non-null float64
num_photos               74659 non-null int64
num_features             74659 non-null int64
num_description_words    74659 non-null int64

Encode categorical values into numerical values between 0 and n_classes - 1 (different from get_dummies

In [33]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype == 'object':
            # Encode labels with value between 0 and n_classes-1.
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [34]:
# transform 'features' into text connected with "_" and " "
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

In [37]:
# Convert a collection of text documents to a matrix of token counts
tfidf = CountVectorizer(stop_words = 'english', max_features = 200)

# Learn the vocabulary dictionary and return term-document matrix.
tr_sparse = tfidf.fit_transform(train_df["features"])

# Transform documents to document-term matrix.
te_sparse = tfidf.transform(test_df["features"])

In [45]:
# Stack sparse matrices horizontally (column wise), Return a copy of this matrix in Compressed Sparse Row format
# return train_X and test_X as sparse matrix
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

# map train_y to be 0, 1, 2
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape) # ~ 200 comes from text information derived from 'features'

(49352, 217) (74659, 217)


Without CV statistic,to score get 0.5480 by SRK. And CV statistic get 0.5346 In fact ,you 
need to turn down the learning rate and turn up run_num

In [13]:
# cross validation, output loglos score
cv_scores = []
kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.07839	test-mlogloss:1.07883
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.05885	test-mlogloss:1.05975
[2]	train-mlogloss:1.04078	test-mlogloss:1.04199
[3]	train-mlogloss:1.02338	test-mlogloss:1.02503
[4]	train-mlogloss:1.00685	test-mlogloss:1.00899
[5]	train-mlogloss:0.990901	test-mlogloss:0.993441
[6]	train-mlogloss:0.975923	test-mlogloss:0.978893
[7]	train-mlogloss:0.961095	test-mlogloss:0.964456
[8]	train-mlogloss:0.946814	test-mlogloss:0.950677
[9]	train-mlogloss:0.93326	test-mlogloss:0.937526
[10]	train-mlogloss:0.920528	test-mlogloss:0.925188
[11]	train-mlogloss:0.908814	test-mlogloss:0.913848
[12]	train-mlogloss:0.897299	test-mlogloss:0.90267
[13]	train-mlogloss:0.885951	test-mlogloss:0.891741
[14]	train-mlogloss:0.874944	test-mlogloss:0.881208
[15]	train-mlogloss:0.864347	test-mlogloss:0.870999
[16]	train-mlogloss:0.854149	test-mlogl

In [47]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_starter.csv", index=False)