# Kaggle
### Two Sigma Connect : Rental Listing Inquiries

Finding the perfect place to call your new home should be more than browsing through endless listings.<br> 
RentHop makes apartment search smarter by using data to sort rental listings by quality. <br>
Two Sigma Ventures invites Kagglers to unleash their creative engines to uncover business value in this unique recruiting competition.

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer

Loading dataset

In [2]:
# Read the train and test listings from JSON files given by relative path.
train_set, test_set = (pd.read_json(path) for path in ("train.json", "test.json"))

### XGBoost 사용

In [3]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=2000):
    """Train a 3-class ``multi:softprob`` XGBoost model and predict on ``test_X``.

    Args:
        train_X: Training feature matrix passed to ``xgb.DMatrix``.
        train_y: Training labels.
        test_X: Feature matrix to predict on.
        test_y: Optional labels for ``test_X``. When given, the train and test
            matrices are passed as a watchlist and training runs with
            ``early_stopping_rounds=20``.
        feature_names: Accepted but not used by this function.
        seed_val: Value stored under the ``seed`` parameter.
        num_rounds: Number of boosting rounds passed to ``xgb.train``.

    Returns:
        A ``(pred_test_y, model)`` tuple: the output of ``model.predict`` on
        the test matrix, and the trained booster.
    """
    booster_params = {
        'objective': 'multi:softprob',
        'eta': 0.02,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(booster_params.items())
    dtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labels for the test matrix: plain training, no evaluation.
        dtest = xgb.DMatrix(test_X)
        booster = xgb.train(plst, dtrain, num_rounds)
    else:
        # Labels available: evaluate on both matrices and allow early stopping.
        dtest = xgb.DMatrix(test_X, label=test_y)
        eval_sets = [(dtrain, 'train'), (dtest, 'test')]
        booster = xgb.train(plst, dtrain, num_rounds, eval_sets, early_stopping_rounds=20)

    return booster.predict(dtest), booster

### Add Features (1)
Logprice, Bedrooms per price, Total number of rooms, Rooms per price
In the bathrooms column, some rows of the test dataset have a value that differs from what the description column says. You can inspect those rows using `.loc`.
<br>
Let's correct those values.

In [4]:
# Overwrite the bathroom count of three test rows (selected by index label).
# The write goes through a single `.loc[row, column]` call on the frame:
# the chained form `frame[col].loc[row] = v` assigns to an intermediate
# Series, which triggers SettingWithCopyWarning and does not update the
# frame at all when pandas Copy-on-Write is active.
test_set.loc[19671, "bathrooms"] = 1.5
test_set.loc[22977, "bathrooms"] = 2.0
test_set.loc[63719, "bathrooms"] = 2.0

# Derive the same price/room features on both frames.
# NOTE(review): rows with 0 bedrooms (or 0 bedrooms + bathrooms) and a
# positive price produce inf in per_bed / per_room - confirm that the
# downstream model input tolerates inf.
for frame in (train_set, test_set):
    frame["logprice"] = np.log(frame["price"])
    frame["per_bed"] = frame["price"] / frame["bedrooms"]
    frame["room_sum"] = frame["bedrooms"] + frame["bathrooms"]
    frame["per_room"] = frame["price"] / frame["room_sum"]

### Add Features (2)
Measure the length of the photos, features and description values

In [5]:
# Size-based features, computed identically for both frames.
for frame in (train_set, test_set):
    # Number of entries in the photos / features lists.
    frame["num_photos"] = frame["photos"].apply(len)
    frame["num_features"] = frame["features"].apply(len)
    # Number of pieces obtained by splitting the description on single spaces
    # (consecutive spaces therefore contribute empty pieces to the count).
    frame["num_description_words"] = frame["description"].apply(
        lambda text: len(text.split(" "))
    )

### Add Features (3)
created_year, created_month, created_day and created_hour

In [6]:
# Parse the `created` column and split it into calendar components.
created_parts = (
    ("created_year", "year"),
    ("created_month", "month"),
    ("created_day", "day"),
    ("created_hour", "hour"),
)
for frame in (train_set, test_set):
    frame["created"] = pd.to_datetime(frame["created"])
    for column, part in created_parts:
        # `.dt.<part>` extracts the named component of each timestamp.
        frame[column] = getattr(frame["created"].dt, part)

### Add Features (4)
Density, Longitude and Latitude

In [7]:
# Build a position key from longitude/latitude rounded to 3 decimals.
for frame in (train_set, test_set):
    frame["pos"] = frame.longitude.round(3).astype(str) + '_' + frame.latitude.round(3).astype(str)

# density = number of TRAINING listings that share the same position key.
vals = train_set['pos'].value_counts()
dvals = vals.to_dict()
# Positions never seen in training fall back to the smallest training count.
# Computed once here instead of once per row inside the lambda.
min_density = vals.min()
for frame in (train_set, test_set):
    frame["density"] = frame['pos'].apply(lambda x: dvals.get(x, min_density))

In [8]:
# Columns fed to the model; later cells append more names to this list.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
    "per_bed",
    "per_room",
    "logprice",
    "density",
    "num_photos",
    "num_features",
    "num_description_words",
    "listing_id",
    "created_year",
    "created_month",
    "created_day",
    "created_hour",
]

### Add Features (5)
Normalize words in the display_address column and add the manager_level columns

In [9]:
# Out-of-fold manager features for the training set.
# The rows are split into 5 folds. For each row in a fold, a/b/c receive the
# share of low/medium/high listings of the same manager counted over the
# OTHER four folds, so a row's own label never contributes to its feature.
# Rows whose manager has no counted listing in the other folds keep NaN.
#
# Row values are read from plain arrays instead of `train_set.iloc[j]`,
# which would build a full Series for every row in every fold.
manager_ids = train_set['manager_id'].values
interest_levels = train_set['interest_level'].values
level_slot = {'low': 0, 'medium': 1, 'high': 2}
n_rows = train_set.shape[0]

# NOTE(review): the shuffle is unseeded, so fold membership (and therefore
# these feature values) changes from run to run.
index = list(range(n_rows))
random.shuffle(index)
a = [np.nan] * n_rows
b = [np.nan] * n_rows
c = [np.nan] * n_rows

for i in range(5):
    # Per-manager [low, medium, high] counters, reset for every fold.
    building_level = {manager: [0, 0, 0] for manager in manager_ids}

    test_index = index[int((i * n_rows) / 5):int(((i + 1) * n_rows) / 5)]
    train_index = list(set(index).difference(test_index))

    # Count interest levels per manager over the four in-fold parts.
    for j in train_index:
        slot = level_slot.get(interest_levels[j])
        if slot is not None:
            building_level[manager_ids[j]][slot] += 1

    # Turn the counts into shares for the held-out rows.
    for j in test_index:
        low, medium, high = building_level[manager_ids[j]]
        total = low + medium + high
        if total != 0:
            a[j] = low * 1.0 / total
            b[j] = medium * 1.0 / total
            c[j] = high * 1.0 / total

train_set['manager_level_low'] = a
train_set['manager_level_medium'] = b
train_set['manager_level_high'] = c

# Manager features for the test set: share of low/medium/high listings of
# each manager, counted over the whole training set.
level_slot = {'low': 0, 'medium': 1, 'high': 2}

# Per-manager [low, medium, high] counters for every manager seen in training.
building_level = {manager: [0, 0, 0] for manager in train_set['manager_id'].values}

# Iterate the two columns as arrays rather than building a Series per row
# with `train_set.iloc[j]`.
for manager, level in zip(train_set['manager_id'].values, train_set['interest_level'].values):
    slot = level_slot.get(level)
    if slot is not None:
        building_level[manager][slot] += 1

a = []
b = []
c = []
for i in test_set['manager_id'].values:
    counts = building_level.get(i)
    total = sum(counts) if counts is not None else 0
    if total == 0:
        # Manager unseen in training, or with no counted listing: leave the
        # features missing (same NaN convention as the training-set features)
        # instead of dividing by zero.
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(counts[0] * 1.0 / total)
        b.append(counts[1] * 1.0 / total)
        c.append(counts[2] * 1.0 / total)
test_set['manager_level_low'] = a
test_set['manager_level_medium'] = b
test_set['manager_level_high'] = c

# Register the three manager share columns as model inputs.
features_to_use.extend([
    'manager_level_low',
    'manager_level_medium',
    'manager_level_high',
])

In [10]:
# Start the normalised `address` column from the lower-cased display address.
for frame in (train_set, test_set):
    frame['address'] = frame['display_address'].apply(lambda text: text.lower())

# Translation table for `str.translate`: maps every character of
# `string.punctuation` to None, i.e. deletes it.
remove_punct_map = {ord(char): None for char in string.punctuation}

# Abbreviation -> full word, applied to whole space-separated tokens only.
address_map = {
    'w': 'west', 'st.': 'street', 'ave': 'avenue', 'st': 'street',
    'e': 'east', 'n': 'north', 's': 'south',
}


def address_map_func(s):
    """Expand abbreviated tokens of an address string.

    The string is split on single spaces; every token that is a key of
    ``address_map`` is replaced by its mapped word, other tokens are kept
    unchanged, and the tokens are joined back with single spaces.
    """
    return ' '.join(address_map.get(token, token) for token in s.split(' '))

# Normalise the address on BOTH frames with the same two steps:
#   1. delete punctuation, 2. expand abbreviations ("w" -> "west", ...).
# Previously the training frame only had punctuation removed (twice) and the
# test frame only had abbreviations expanded (twice), so the same street could
# end up as different strings in train and test.
# Punctuation is removed first so that tokens such as "st." become "st" and
# are then expanded.
for frame in (train_set, test_set):
    frame['address'] = frame['address'].apply(lambda x: x.translate(remove_punct_map))
    frame['address'] = frame['address'].apply(lambda x: address_map_func(x))

### LabelEncoder

In [11]:
# Integer-encode the object-typed categorical columns and add them as features.
categorical = ["address", "manager_id", "building_id"]
for column in categorical:
    if train_set[column].dtype != 'object':
        continue  # only object-typed columns are encoded and registered
    train_values = list(train_set[column].values)
    test_values = list(test_set[column].values)
    # Fit on train + test together so both frames share one value -> code mapping.
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)
    train_set[column] = encoder.transform(train_values)
    test_set[column] = encoder.transform(test_values)
    features_to_use.append(column)

### CountVectorizer

In [12]:
# Collapse each listing's feature list into one string: spaces inside a
# feature become underscores (so each feature stays a single token) and the
# features are separated by single spaces.
for frame in (train_set, test_set):
    frame['features'] = frame["features"].apply(
        lambda tags: " ".join(tag.replace(" ", "_") for tag in tags)
    )

# Token counts over those strings; the vocabulary is fitted on the training
# frame only. (The variable is named `tfidf` but holds a CountVectorizer.)
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_set["features"])
te_sparse = tfidf.transform(test_set["features"])

In [13]:
# Encode the target as class indices; the order matches the submission columns below.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}
train_y = np.array(train_set['interest_level'].apply(lambda level: target_num_map[level]))

# Stack the selected columns next to the token-count matrix, in CSR format.
train_X = sparse.hstack([train_set[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_set[features_to_use], te_sparse]).tocsr()

# Train on the training matrix and predict the test set (no test labels passed).
preds, model = runXGB(train_X, train_y, test_X, num_rounds=2000)

# One probability column per class plus the listing id, written as the submission file.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_set.listing_id.values
out_df.to_csv("final_yed.csv", index=False)

### Score

![점수](https://github.com/yejiiii/Competition_2017/blob/master/kaggle_score.png?raw=true)