# Kaggle - Two Sigma Connect : Rental Listing Inguiries

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score



### 데이터 불러오기

In [2]:
train_df = pd.read_json("../input/train.json").sort_values('listing_id')
test_df = pd.read_json("../input/test.json").sort_values('listing_id')
image_date = pd.read_csv("../input/listing_image_time.csv")

### Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

def runRandomForest(x_data, y_data):
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2)

    rfmodel = RandomForestClassifier(n_estimators=1000)
    rfmodel.fit(x_train, y_train)

    y_val_pred = rfmodel.predict_proba(x_val)
    
    print(log_loss(y_val, y_val_pred))
    
    return y_val, y_val_pred

### XGBoost

In [4]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    param['nthread'] = -1
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=25)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection

def train_xgboost(train_X, train_y):
    cv_scores = []
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
    for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        break
        
    return val_y, preds

In [6]:
# def predict(train_X, train_y, test_X, filename, rounds=1300):

#     preds, model = runXGB(train_X, train_y, test_X, num_rounds=rounds)

#     out_df = pd.DataFrame(preds)
#     out_df.columns = ["high", "medium", "low"]
#     out_df["listing_id"] = test_df.listing_id.values
#     out_df.to_csv("sub_%s.csv" % filename, index=False)

In [7]:
# def output_results(clf, x_test, listing, fname):
#     preds = clf.predict_proba(x_test)
#     preds = pd.DataFrame(preds)
#     cols = ['low', 'medium', 'high']
#     preds.columns = cols
#     preds['listing_id'] = listing
#     preds.to_csv(fname, index=None)
#     print(preds[cols].mean().values)

## 1. BaseLine

### 1-1. 'price, bathroom, bedroom, features, description, created'

In [8]:
features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price", "logprice","price_t","price_per_room", 
"num_photos", "num_features", "num_description_words","listing_id", "created_year", "created_month", "created_day", "created_hour", 
                 "created_dayofyear"]

In [9]:
test_df["bathrooms"].loc[19671] = 1.5
test_df["bathrooms"].loc[22977] = 2.0
test_df["bathrooms"].loc[63719] = 2.0
train_df["price"] = train_df["price"].clip(upper=13000)

train_df["logprice"] = np.log(train_df["price"])
test_df["logprice"] = np.log(test_df["price"])

train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

train_df['price_t'] = train_df['price_t'].replace(np.inf, train_df['price_t'][train_df['price_t']!=np.inf].mean(), regex=True)
test_df['price_t'] = test_df['price_t'].replace(np.inf, test_df['price_t'][test_df['price_t']!=np.inf].mean(), regex=True)

train_df['price_t'] = train_df['price_t'].replace(np.NaN, train_df['price_t'][train_df['price_t']!=np.NaN].mean(), regex=True)
test_df['price_t'] = test_df['price_t'].replace(np.NaN, test_df['price_t'][test_df['price_t']!=np.NaN].mean(), regex=True)

train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

train_df['price_per_room'] = train_df['price']/train_df['room_sum']
test_df['price_per_room'] = test_df['price']/test_df['room_sum']

train_df['price_per_room'] = train_df['price_per_room'].replace(np.inf, train_df['price_per_room'][train_df['price_per_room']!=np.inf].mean(), regex=True)
test_df['price_per_room'] = test_df['price_per_room'].replace(np.inf, test_df['price_per_room'][test_df['price_per_room']!=np.inf].mean(), regex=True)

train_df['price_per_room'] = train_df['price_per_room'].replace(np.NaN, train_df['price_per_room'][train_df['price_per_room']!=np.NaN].mean(), regex=True)
test_df['price_per_room'] = test_df['price_per_room'].replace(np.NaN, test_df['price_per_room'][test_df['price_per_room']!=np.NaN].mean(), regex=True)

train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour
train_df['created_dayofyear'] = train_df['created'].dt.dayofyear
test_df['created_dayofyear'] = test_df['created'].dt.dayofyear

In [10]:
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_df.shape, test_df.shape)

(49352, 27) (74659, 26)


### 1-2. Random Forest

In [11]:
y_val, y_val_pred = runRandomForest(train_df[features_to_use], train_y)

0.611170811358


In [12]:
pred_y = np.array([np.argmax(i) for i in y_val_pred])
pred_y, y = pd.Series(pred_y), pd.Series(y_val)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.729814608449

classification_report :
              precision    recall  f1-score   support

          0       0.51      0.24      0.33       778
          1       0.45      0.30      0.36      2178
          2       0.79      0.92      0.85      6915

avg / total       0.69      0.73      0.70      9871



### 1-3. XGBoost

In [13]:
train_X_1 = sparse.coo_matrix(train_df[features_to_use]).tocsr()
test_X_1 = sparse.coo_matrix(test_df[features_to_use]).tocsr()

val_y, preds = train_xgboost(train_X_1, train_y)

[0]	train-mlogloss:1.08022	test-mlogloss:1.08045
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[25]	train-mlogloss:0.806969	test-mlogloss:0.813823
[50]	train-mlogloss:0.699672	test-mlogloss:0.711459
[75]	train-mlogloss:0.648552	test-mlogloss:0.664082
[100]	train-mlogloss:0.619774	test-mlogloss:0.639224
[125]	train-mlogloss:0.601016	test-mlogloss:0.624491
[150]	train-mlogloss:0.587251	test-mlogloss:0.614851
[175]	train-mlogloss:0.57639	test-mlogloss:0.608152
[200]	train-mlogloss:0.566581	test-mlogloss:0.602691
[225]	train-mlogloss:0.558082	test-mlogloss:0.598578
[250]	train-mlogloss:0.550454	test-mlogloss:0.595372
[275]	train-mlogloss:0.543561	test-mlogloss:0.592897
[300]	train-mlogloss:0.537081	test-mlogloss:0.590663
[325]	train-mlogloss:0.53093	test-mlogloss:0.588491
[350]	train-mlogloss:0.525383	test-mlogloss:0.586947
[375]	train-mlogloss:0.52007	test-mlogloss:0.585838
[400]	train

In [14]:
pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.744808023503

classification_report :
              precision    recall  f1-score   support

          0       0.59      0.28      0.38       747
          1       0.52      0.34      0.41      2319
          2       0.79      0.93      0.86      6805

avg / total       0.71      0.74      0.72      9871



## 2. latitude, longitude 를 이용한 Feature 생성

In [16]:
import math

def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho


def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi


def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)


def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)


def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    
    for df in [tr_df, te_df]:
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
    
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

train_df, test_df = operate_on_coordinates(train_df, test_df)

In [17]:
features_to_use.extend(['num_rho', 'num_phi', 'num_rot15_X', 'num_rot15_Y', 'num_rot30_X',
       'num_rot30_Y', 'num_rot45_X', 'num_rot45_Y', 'num_rot60_X',
       'num_rot60_Y'])

In [18]:
location_dict = {
    'manhattan_loc': [40.728333, -73.994167],
    'brooklyn_loc': [40.624722, -73.952222],
    'bronx_loc': [40.837222, -73.886111],
    'queens_loc': [40.75, -73.866667],
    'staten_loc': [40.576281, -74.144839]}

for location in location_dict.keys():
    dlat = location_dict[location][0] - train_df['latitude']
    dlon = (location_dict[location][1] - train_df['longitude']) * np.cos(np.deg2rad(41))  #  adjust for NYC latitude
    train_df['distance_' + location] = np.sqrt(dlat ** 2 + dlon ** 2) * 60
    
    dlat = location_dict[location][0] - test_df['latitude']
    dlon = (location_dict[location][1] - test_df['longitude']) * np.cos(np.deg2rad(41))  #  adjust for NYC latitude
    test_df['distance_' + location] = np.sqrt(dlat ** 2 + dlon ** 2) * 60     # distance in nautical miles
    
    features_to_use.append('distance_' + location)

In [19]:
train_df["longitude"] = train_df["longitude"].clip(upper=-60)
train_df["latitude"] = train_df["latitude"].clip(lower=35)

test_df["longitude"] = test_df["longitude"].clip(upper=-60)
test_df["latitude"] = test_df["latitude"].clip(lower=35)

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()

train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

features_to_use.append("density")

In [20]:
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_df.shape, test_df.shape)

(49352, 44) (74659, 43)


In [21]:
val_y, preds = runRandomForest(train_df[features_to_use], train_y)

0.616920213711


In [22]:
pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.729814608449

classification_report :
              precision    recall  f1-score   support

          0       0.53      0.27      0.36       807
          1       0.47      0.32      0.38      2258
          2       0.79      0.92      0.85      6806

avg / total       0.70      0.73      0.70      9871



In [23]:
train_X_2 = sparse.coo_matrix(train_df[features_to_use]).tocsr()
test_X_2 = sparse.coo_matrix(test_df[features_to_use]).tocsr()

val_y, preds = train_xgboost(train_X_2, train_y)

[0]	train-mlogloss:1.08004	test-mlogloss:1.08045
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[25]	train-mlogloss:0.803152	test-mlogloss:0.811003
[50]	train-mlogloss:0.692695	test-mlogloss:0.706735
[75]	train-mlogloss:0.639423	test-mlogloss:0.658431
[100]	train-mlogloss:0.609877	test-mlogloss:0.633352
[125]	train-mlogloss:0.589778	test-mlogloss:0.617866
[150]	train-mlogloss:0.57516	test-mlogloss:0.608088
[175]	train-mlogloss:0.562765	test-mlogloss:0.600815
[200]	train-mlogloss:0.552071	test-mlogloss:0.595401
[225]	train-mlogloss:0.542511	test-mlogloss:0.590968
[250]	train-mlogloss:0.533944	test-mlogloss:0.587603
[275]	train-mlogloss:0.526819	test-mlogloss:0.585477
[300]	train-mlogloss:0.51983	test-mlogloss:0.583366
[325]	train-mlogloss:0.513203	test-mlogloss:0.581582
[350]	train-mlogloss:0.506984	test-mlogloss:0.579844
[375]	train-mlogloss:0.501382	test-mlogloss:0.578636
[400]	trai

In [24]:
pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.746834160673

classification_report :
              precision    recall  f1-score   support

          0       0.60      0.32      0.42       747
          1       0.52      0.35      0.42      2319
          2       0.80      0.93      0.86      6805

avg / total       0.72      0.75      0.72      9871



## 3. manager_id, building_id 를 이용한 feature 생성

### 3-1. manager_level(low, medium, high)

In [26]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
            
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
            
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c

a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]

for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

### 3-2. builing_id, manager_id 를 Count 하여 log 값 취한 후 feature 추가

In [27]:
def pre_processing(data):
    
    build_counts = pd.DataFrame(data.building_id.value_counts())
    build_counts['b_counts'] = build_counts['building_id']
    build_counts['building_id'] = build_counts.index
    build_counts['b_count_log'] = np.log2(build_counts['b_counts'])
    data = pd.merge(data, build_counts, on='building_id')
    
    man_counts = pd.DataFrame(data.manager_id.value_counts())
    man_counts['m_counts'] = man_counts['manager_id']
    man_counts['manager_id'] = man_counts.index
    man_counts['m_count_log'] = np.log10(man_counts['m_counts'])
    data = pd.merge(data, man_counts, on='manager_id')
    
    return data

train_df['Source'] = "train"
test_df['Source'] = "test"

data = pd.concat([train_df, test_df])

pre_data = pre_processing(data)

train_df = pre_data[pre_data['Source'] == "train"]
test_df = pre_data[pre_data['Source'] == "test"]

features_to_use += ['b_counts', 'm_counts', 'b_count_log', 'm_count_log']

In [28]:
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_df.shape, test_df.shape)

(49352, 52) (74659, 52)


In [29]:
feature_append = ['manager_level_low','manager_level_medium','manager_level_high']

for name in feature_append:
    train_df[name] = train_df[name].replace(np.inf, train_df[name][train_df[name]!=np.inf].mean(), regex=True)
    test_df[name] = test_df[name].replace(np.inf, test_df[name][test_df[name]!=np.inf].mean(), regex=True)

    train_df[name] = train_df[name].replace(np.NaN, train_df[name][train_df[name]!=np.NaN].mean(), regex=True)
    test_df[name] = test_df[name].replace(np.NaN, test_df[name][test_df[name]!=np.NaN].mean(), regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### 3-3. Random Forest

In [30]:
val_y, preds = runRandomForest(train_df[features_to_use], train_y)

0.566731964978


In [31]:
pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.749468138993

classification_report :
              precision    recall  f1-score   support

          0       0.59      0.33      0.42       781
          1       0.51      0.37      0.43      2271
          2       0.81      0.92      0.86      6819

avg / total       0.72      0.75      0.73      9871



### 3-4. XGBoost

In [32]:
train_X_3 = sparse.coo_matrix(train_df[features_to_use]).tocsr()
test_X_3 = sparse.coo_matrix(test_df[features_to_use]).tocsr()

val_y, preds = train_xgboost(train_X_3, train_y)

[0]	train-mlogloss:1.07828	test-mlogloss:1.07877
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[25]	train-mlogloss:0.773066	test-mlogloss:0.782023
[50]	train-mlogloss:0.65163	test-mlogloss:0.667114
[75]	train-mlogloss:0.593965	test-mlogloss:0.615374
[100]	train-mlogloss:0.562029	test-mlogloss:0.588926
[125]	train-mlogloss:0.541523	test-mlogloss:0.573883
[150]	train-mlogloss:0.526102	test-mlogloss:0.564002
[175]	train-mlogloss:0.513724	test-mlogloss:0.556888
[200]	train-mlogloss:0.502986	test-mlogloss:0.551513
[225]	train-mlogloss:0.493432	test-mlogloss:0.547248
[250]	train-mlogloss:0.48494	test-mlogloss:0.544164
[275]	train-mlogloss:0.476962	test-mlogloss:0.541476
[300]	train-mlogloss:0.46935	test-mlogloss:0.539069
[325]	train-mlogloss:0.46213	test-mlogloss:0.536997
[350]	train-mlogloss:0.455675	test-mlogloss:0.53545
[375]	train-mlogloss:0.449702	test-mlogloss:0.534141
[400]	train-m

In [33]:
pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.763347178604

classification_report :
              precision    recall  f1-score   support

          0       0.60      0.35      0.44       775
          1       0.53      0.44      0.48      2256
          2       0.83      0.92      0.87      6840

avg / total       0.74      0.76      0.75      9871



## 4. Image Date 와 display_address

### 4-1. 방에 대한 사진이 생성된 날짜를 Feature 로 추가

In [35]:
image_date.columns = ["listing_id", "time_stamp"]
image_date.loc[80240,"time_stamp"] = 1478129766 

image_date["img_date"] = pd.to_datetime(image_date["time_stamp"], unit="s")
image_date["img_days_passed"] = (image_date["img_date"].max() - image_date["img_date"]).astype("timedelta64[D]").astype(int)
image_date["img_date_month"] = image_date["img_date"].dt.month
image_date["img_date_week"] = image_date["img_date"].dt.week
image_date["img_date_day"] = image_date["img_date"].dt.day
image_date["img_date_dayofweek"] = image_date["img_date"].dt.dayofweek
image_date["img_date_dayofyear"] = image_date["img_date"].dt.dayofyear
image_date["img_date_hour"] = image_date["img_date"].dt.hour
image_date["img_date_monthBeginMidEnd"] = image_date["img_date_day"].apply(lambda x: 1 if x<10 else 2 if x<20 else 3)

train_df = pd.merge(train_df, image_date, on="listing_id", how="left")
test_df = pd.merge(test_df, image_date, on="listing_id", how="left")

img_date_column = image_date.columns.values.tolist()

img_date_column.remove('listing_id')
img_date_column.remove('time_stamp')
img_date_column.remove('img_date')

features_to_use += img_date_column

### 4-2. display address 를 같은 형태로 변환

- W 13 Street, West 13 Street
- Columbus Avenue, columbus ave 등

In [36]:
import re
train_df['display_address'] = train_df['display_address'].apply(lambda x: x.lower())
train_df['display_address'] = train_df['display_address'].apply(lambda x: re.sub("[^a-zA-Z0-9 ]", "", x))
train_df['display_address'] = train_df['display_address'].apply(lambda x: re.sub("street", "st", x))
train_df['display_address'] = train_df['display_address'].apply(lambda x: re.sub("west", "w", x))
train_df['display_address'] = train_df['display_address'].apply(lambda x: re.sub("east", "e", x))
train_df['display_address'] = train_df['display_address'].apply(lambda x: re.sub("avenue", "ave", x))
train_df['display_address'] = train_df['display_address'].apply(lambda x: (" ".join(x.split())).strip())

test_df['display_address'] = test_df['display_address'].apply(lambda x: x.lower())
test_df['display_address'] = test_df['display_address'].apply(lambda x: re.sub("[^a-zA-Z0-9 ]", "", x))
test_df['display_address'] = test_df['display_address'].apply(lambda x: re.sub("street", "st", x))
test_df['display_address'] = test_df['display_address'].apply(lambda x: re.sub("west", "w", x))
test_df['display_address'] = test_df['display_address'].apply(lambda x: re.sub("east", "e", x))
test_df['display_address'] = test_df['display_address'].apply(lambda x: re.sub("avenue", "ave", x))
test_df['display_address'] = test_df['display_address'].apply(lambda x: (" ".join(x.split())).strip())

In [37]:
categorical = ["display_address", "manager_id", "building_id"]
for f in categorical:
        if train_df[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [38]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

In [39]:
target_num_map = {'high':0, 'medium':1, 'low':2}
train_Y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

### 4-3. Random Forest

In [40]:
y_val, y_val_pred = runRandomForest(train_df[features_to_use], train_y)

0.557704508563


In [41]:
pred_y = np.array([np.argmax(i) for i in y_val_pred])
pred_y, y = pd.Series(pred_y), pd.Series(y_val)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.750177287002

classification_report :
              precision    recall  f1-score   support

          0       0.56      0.27      0.36       794
          1       0.50      0.38      0.43      2206
          2       0.81      0.93      0.86      6871

avg / total       0.72      0.75      0.73      9871



### 4-4. XGBoost

In [42]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

In [43]:
val_y, preds = train_xgboost(train_X, train_Y)

[0]	train-mlogloss:1.07758	test-mlogloss:1.078
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[25]	train-mlogloss:0.766519	test-mlogloss:0.775943
[50]	train-mlogloss:0.641937	test-mlogloss:0.659129
[75]	train-mlogloss:0.580935	test-mlogloss:0.604771
[100]	train-mlogloss:0.546446	test-mlogloss:0.576634
[125]	train-mlogloss:0.523436	test-mlogloss:0.559693
[150]	train-mlogloss:0.506475	test-mlogloss:0.548775
[175]	train-mlogloss:0.492295	test-mlogloss:0.540765
[200]	train-mlogloss:0.480482	test-mlogloss:0.534561
[225]	train-mlogloss:0.469915	test-mlogloss:0.529671
[250]	train-mlogloss:0.460963	test-mlogloss:0.52568
[275]	train-mlogloss:0.452436	test-mlogloss:0.522165
[300]	train-mlogloss:0.444752	test-mlogloss:0.519491
[325]	train-mlogloss:0.43742	test-mlogloss:0.517305
[350]	train-mlogloss:0.43016	test-mlogloss:0.51531
[375]	train-mlogloss:0.423417	test-mlogloss:0.513431
[400]	train-ml

In [44]:
pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print("accuracy :", accuracy_score(pred_y, y))
print("")
print("classification_report :\n", classification_report(y, pred_y))

accuracy : 0.779252355384

classification_report :
              precision    recall  f1-score   support

          0       0.63      0.38      0.47       775
          1       0.56      0.48      0.51      2256
          2       0.84      0.93      0.88      6840

avg / total       0.76      0.78      0.77      9871



### 결과(Log Loss)

 * * * 
|                       |              Feature 1 |            Feature 2 | Feature 3              | Feature 4             |
| :---:                  | :---:                       | :---:                       | :---:                      | :---:                     |
|Random Forest |0.614368474766 | 0.616603386848|0.560194517882 |0.54787088605 |
|XGBoost           | 0.576168            |0.57073              |0.527771              | 0.50247           |

![kaggle_submission](./hyeonju_submission.png)