The following code gives pipeline for machine learning using Random Forests classifier. 
- The feature engineering is quite naive here, aiming to provide a ML pipeline instead of minimum log loss.
- The Random Forests classifier gives a log loss of 0.63409.

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

### Import data

In [10]:
train_df = pd.read_json("train.json")
test_df = pd.read_json("test.json")

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49352 entries, 10 to 99994
Data columns (total 15 columns):
bathrooms          49352 non-null float64
bedrooms           49352 non-null int64
building_id        49352 non-null object
created            49352 non-null object
description        49352 non-null object
display_address    49352 non-null object
features           49352 non-null object
interest_level     49352 non-null object
latitude           49352 non-null float64
listing_id         49352 non-null int64
longitude          49352 non-null float64
manager_id         49352 non-null object
photos             49352 non-null object
price              49352 non-null int64
street_address     49352 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 6.0+ MB


In [12]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74659 entries, 0 to 99999
Data columns (total 14 columns):
bathrooms          74659 non-null float64
bedrooms           74659 non-null int64
building_id        74659 non-null object
created            74659 non-null object
description        74659 non-null object
display_address    74659 non-null object
features           74659 non-null object
latitude           74659 non-null float64
listing_id         74659 non-null int64
longitude          74659 non-null float64
manager_id         74659 non-null object
photos             74659 non-null object
price              74659 non-null int64
street_address     74659 non-null object
dtypes: float64(3), int64(3), object(8)
memory usage: 8.5+ MB


In [13]:
train_df.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue


### Feature Engineering
#### Naive feature engineering
For both the training and testing dataset.

In [14]:
def naiveFE(df):
    ''' do naive feature engineering to both the train and test data frame
    '''
    # total number of room
    df["sum_room"] = df["bedrooms"] + df["bathrooms"]
    df["room_diff"] = df["bedrooms"] + df["bathrooms"]
    
    # average price per room
    df["price_s"] = df["price"]/df["sum_room"]
    df["price_bed"] = df["price"]/df["bedrooms"]
    df["price_bath"] = df["price"]/df["bathrooms"]
    
    # sum of bedrooms and bathrooms
    df["room_sum"] = df["bedrooms"] + df["bathrooms"] 
    
    # number of photos
    df["num_photos"] = df["photos"].apply(len)
    
    # number features
    df["num_features"] = df["features"].apply(len)
    
    # count of words present in description column
    df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
    
    # created time, year = 2016 constant
    df["created"] = pd.to_datetime(df["created"])
    df["created_month"] = df["created"].dt.month
    df["created_day"] = df["created"].dt.day
    
    return df

train_df = naiveFE(train_df)
test_df = naiveFE(test_df)

In [15]:
train_df.head(20)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,room_diff,price_s,price_bed,price_bath,room_sum,num_photos,num_features,num_description_words,created_month,created_day
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,4.5,666.666667,1000.0,2000.0,4.5,5,0,95,6,24
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,3.0,1821.666667,2732.5,5465.0,3.0,11,5,9,6,12
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,2.0,1425.0,2850.0,2850.0,2.0,8,4,94,4,17
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,2.0,1637.5,3275.0,3275.0,2.0,3,2,80,4,18
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,5.0,670.0,837.5,3350.0,5.0,3,1,68,4,28
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,West 18th Street,[],medium,40.7429,6894514,...,6.0,1332.5,1998.75,3997.5,6.0,5,0,9,4,19
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,West 107th Street,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",low,40.8012,6930771,...,3.0,1200.0,1800.0,3600.0,3.0,10,8,87,4,27
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",low,40.7427,6867392,...,3.0,1881.666667,5645.0,2822.5,3.0,5,8,134,4,13
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",medium,40.8234,6898799,...,2.0,862.5,1725.0,1725.0,2.0,5,4,4,4,20
100027,2.0,4,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,522 E 11th,"[Dishwasher, Hardwood Floors]",low,40.7278,6814332,...,6.0,966.666667,1450.0,2900.0,6.0,9,2,166,4,2


#### Define get_stats function
It first merge train_df and test_df, followed by grouping the dataframe by group_column (especially manager_id), then calculating the count, mean, std, median, max, min of the target_column feature.
It returns the train and test df with the newly added columns as numpy array (selected_train, selected_test).

The following code was partially copied from Little Boat: https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/discussion/32123

In [45]:
def get_stats(train_df, test_df, target_column, group_column = 'manager_id'):
    '''
    target_column: numeric columns to group with (e.g. price, bedrooms, bathrooms)
    group_column: categorical columns to group on (e.g. manager_id, building_id)
    '''
    train_df['row_id'] = range(train_df.shape[0])
    test_df['row_id'] = range(test_df.shape[0])
    train_df['train'] = 1
    test_df['train'] = 0
    all_df = train_df[['row_id', 'train', target_column, group_column]].append(test_df[['row_id','train', target_column, group_column]])
    all_df = all_df.reindex()
    grouped = all_df[[target_column, group_column]].groupby(group_column)
    
    the_size = pd.DataFrame(grouped.size()).reset_index()
    the_size.columns = [group_column, '%s_size' % target_column]
    
    the_mean = pd.DataFrame(grouped.mean()).reset_index()
    the_mean.columns = [group_column, '%s_mean' % target_column]
    
    the_std = pd.DataFrame(grouped.std()).reset_index().fillna(0)
    the_std.columns = [group_column, '%s_std' % target_column]
    
    the_median = pd.DataFrame(grouped.median()).reset_index()
    the_median.columns = [group_column, '%s_median' % target_column]
    
    the_stats = pd.merge(the_size, the_mean)
    the_stats = pd.merge(the_stats, the_std)
    the_stats = pd.merge(the_stats, the_median)

    the_max = pd.DataFrame(grouped.max()).reset_index()
    the_max.columns = [group_column, '%s_max' % target_column]
    
    the_min = pd.DataFrame(grouped.min()).reset_index()
    the_min.columns = [group_column, '%s_min' % target_column]

    the_stats = pd.merge(the_stats, the_max)
    the_stats = pd.merge(the_stats, the_min)

    all_df = pd.merge(all_df, the_stats)

    selected_train = all_df[all_df['train'] == 1]
    selected_test = all_df[all_df['train'] == 0]
    
    selected_train.sort_values('row_id', inplace=True)
    selected_test.sort_values('row_id', inplace=True)
    
    selected_train.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)
    selected_test.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)

    return np.array(selected_train), np.array(selected_test)

#### Use the get_stats function
The following code set group_column = 'manager_id' or 'building_id', scan target_id = 'bathrooms', 'bedrooms', 'latitude', 'longitude', 'price' and update train_df and test_df correspondently.

In [73]:
target_column = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']
group_column = ['manager_id', 'building_id']

train_stack_list = []
test_stack_list = []
column_name_list = []

for target_col in target_column:
    for group_col in group_column:
        tmp_train, tmp_test = get_stats(train_df, test_df, target_column = target_col, group_column = group_col)
        tmp_name = target_col + '_' + group_col
        tmp_name_list = [tmp_name + '_count', tmp_name + '_mean', tmp_name + '_std', tmp_name + '_median', tmp_name + '_max', tmp_name + '_min']
        train_stack_list.append(tmp_train)
        test_stack_list.append(tmp_test)
        column_name_list.append(tmp_name_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### Add engineered statistics into original train_df and test_df
Both train_stack_list and test_stack_list are of dimension (10, 49352, 6).

In [84]:
for i in range(len(train_stack_list)):
    stat = pd.DataFrame(train_stack_list[i], columns = column_name_list[i])
    stat['row_id'] = range(stat.shape[0])
    train_df = pd.merge(train_df, stat)

for i in range(len(test_stack_list)):
    stat = pd.DataFrame(test_stack_list[i], columns = column_name_list[i])
    stat['row_id'] = range(stat.shape[0])
    test_df = pd.merge(test_df, stat)


Prepare data for ML.

In [89]:
y = train_df['interest_level'].map({'low': 0, 'medium': 1, 'high': 2})    
train_df.drop(['building_id', 'created', 'description', 'display_address', 'features', 'interest_level', 'listing_id', 
               'manager_id', 'photos', 'street_address', 'row_id', 'train'], axis = 1, inplace = True)
X = train_df.values

test_df.drop(['building_id', 'created', 'description', 'display_address', 'interest_level', 'features', 'listing_id', 
               'manager_id', 'photos', 'street_address', 'row_id', 'train'], axis = 1, inplace = True)

In [93]:
test_df.head(2)

Unnamed: 0,bathrooms,bedrooms,interest_level,latitude,longitude,price,sum_room,room_diff,price_s,price_bed,...,price_manager_id_std,price_manager_id_median,price_manager_id_max,price_manager_id_min,price_building_id_count,price_building_id_mean,price_building_id_std,price_building_id_median,price_building_id_max,price_building_id_min
0,1.5,3,medium,40.7145,-73.9425,3000,4.5,4.5,666.666667,1000.0,...,546.766308,2550.0,4400.0,1700.0,6.0,2666.666667,258.19889,2500.0,3000.0,2500.0
1,1.0,2,low,40.7947,-73.9667,5465,3.0,3.0,1821.666667,2732.5,...,1703.262088,4820.0,11220.0,1995.0,48.0,6459.791667,2728.301817,5680.0,11550.0,2695.0


In [91]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49352 entries, 0 to 49351
Data columns (total 76 columns):
bathrooms                       49352 non-null float64
bedrooms                        49352 non-null int64
latitude                        49352 non-null float64
longitude                       49352 non-null float64
price                           49352 non-null int64
sum_room                        49352 non-null float64
room_diff                       49352 non-null float64
price_s                         49352 non-null float64
price_bed                       49352 non-null float64
price_bath                      49352 non-null float64
room_sum                        49352 non-null float64
num_photos                      49352 non-null int64
num_features                    49352 non-null int64
num_description_words           49352 non-null int64
created_month                   49352 non-null int64
created_day                     49352 non-null int64
bathrooms_manager_id_coun

### Train XGBoost

In [99]:
validation_size = 0.30
seed = 2018
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size = validation_size, random_state = seed)

In [106]:
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate = learning_rate)
kfold = StratifiedKFold(Y_train, n_folds = 5, shuffle = True, random_state = 2018)
grid_search = GridSearchCV(model, param_grid, scoring = 'neg_log_loss', n_jobs = 1, cv = kfold)
result = grid_search.fit(X_train, Y_train)

# summarize results
print("BestL %f using %s" % (- result.best_score_, result.best_params_))
means, stdevs = [], []
for params, mean_score, scores in result.grid_scores_:
    stdev = scores.std()
    means.append(- mean_score)
    stdevs.append(stdev)
    print("%f (%f) with: %r" %(- mean_score, stdev, params))    


BestL 0.578077 using {'learning_rate': 0.3}
1.092800 (0.000036) with: {'learning_rate': 0.0001}
1.044577 (0.000342) with: {'learning_rate': 0.001}
0.791203 (0.002806) with: {'learning_rate': 0.01}
0.607120 (0.006793) with: {'learning_rate': 0.1}
0.586817 (0.006384) with: {'learning_rate': 0.2}
0.578077 (0.006956) with: {'learning_rate': 0.3}




In [108]:
model.fit(X_train, Y_train)
print(model.feature_importances_)
plot_importance(model)
plt.show()

XGBoostError: need to call fit beforehand

In [None]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%"%(accuracy*100.0))

print(model.feature_importances_)

plot_importance(model)
plt.show()

### Make Predictions

In [30]:
df = pd.read_json('test.json')
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
X = df[num_feats]

y = clf.predict_proba(X)

In [31]:
labels2idx = {label: i for i, label in enumerate(clf.classes_)}
labels2idx

{'high': 0, 'low': 1, 'medium': 2}

In [32]:
sub = pd.DataFrame()
sub["listing_id"] = df["listing_id"]
for label in ["high", "medium", "low"]:
    sub[label] = y[:, labels2idx[label]]
sub.to_csv("submission_rf.csv", index=False)