The following code gives pipeline for machine learning using Random Forests classifier. 
- The feature engineering is quite naive here, aiming to provide a ML pipeline instead of minimum log loss.
- The Random Forests classifier gives a test log loss of 0.63409.

In [17]:
import numpy as np
import pandas as pd

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

### Import data

In [19]:
df = pd.read_json("train.json")

In [20]:
print(df.shape)

(49352, 15)


### Naive Feature Engineering

In [21]:
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day

In [22]:
num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day"]
x = df[num_feats]
y = df["interest_level"]
x.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,num_photos,num_features,num_description_words,created_year,created_month,created_day
10,1.5,3,40.7145,-73.9425,3000,5,0,95,2016,6,24
10000,1.0,2,40.7947,-73.9667,5465,11,5,9,2016,6,12
100004,1.0,1,40.7388,-74.0018,2850,8,4,94,2016,4,17
100007,1.0,1,40.7539,-73.9677,3275,3,2,80,2016,4,18
100013,1.0,4,40.8241,-73.9493,3350,3,1,68,2016,4,28


### Train Random Forests model

In [23]:
validation_size = 0.30
seed = 2018
X_train, X_validation, Y_train, Y_validation = train_test_split(x, y, test_size = validation_size, random_state = seed)

In [24]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, Y_train)
y_val_pred = clf.predict_proba(X_validation)
log_loss(Y_validation, y_val_pred)

0.62935224871476181

In [27]:
Y_train.head(5)

83893    medium
55430       low
14662       low
38947       low
47886      high
Name: interest_level, dtype: object

### Make Predictions

In [28]:
df = pd.read_json('test.json')
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
X = df[num_feats]

y = clf.predict_proba(X)

In [29]:
clf.classes_

array(['high', 'low', 'medium'], dtype=object)

In [30]:
labels2idx = {label: i for i, label in enumerate(clf.classes_)}
labels2idx

{'high': 0, 'low': 1, 'medium': 2}

In [31]:
sub = pd.DataFrame()
sub["listing_id"] = df["listing_id"]

In [15]:
for label in ["high", "medium", "low"]:
    sub[label] = y[:, labels2idx[label]]
sub.to_csv("submission_rf.csv", index=False)