<h1>Kaggle - Two Sigma Connect: Rental Listing Inquiries_Yehoon</h1>

<h2>모듈 호출</h2>

In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.cluster import KMeans

<h2>데이터 호출</h2>

In [2]:
train_df=pd.read_json('train.json')
test_df=pd.read_json('test.json')

train_df = train_df.sort_index(axis=1)
test_df = test_df.sort_index(axis=1)

<h2>XGB 호출</h2>

In [3]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1500):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.02
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

<h2>Basic Feature 추가</h2>

In [4]:
features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price",
                 "price_t","photo_count",  "num_description_words","listing_id",
                 "display_address", "manager_id", "building_id", "street_address",'price_per_room']

<h2>Feature Engineering</h2>

<h3>1. Numeric Features - room, bedroom, bathroom, price, description, photo data </h3><br>
- room, bed, bathroom, price 등의 개별적 데이터들을 이용해 pricePedSth Data를 생성함<br>
- photo와 description 등 non-numerical data를 numerical data로 전환

In [5]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["photo_count"] = df["photos"].apply(len)
    df["price_t"] =df["price"]/df["bedrooms"]
    df["pricePerBed"] = df['price'] / df['bedrooms']
    df["pricePerBath"] = df['price'] / df['bathrooms']
    df["pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])
    df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
    df["room_sum"] = df["bedrooms"]+df["bathrooms"] 
    df['price_per_room'] = df['price']/df['room_sum']

    return df

In [6]:
train_df = add_features(train_df)
test_df = add_features(test_df)

<h3>2. Manager Id Classify Using Interest Level</h3>
- Manager Id를 Interest Level을 이용해 'high', 'medium', 'low'로 구분

In [7]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

<h3>3. Select Key-Featuers of Room</h3>
- Feature 중 출현 빈도수가 높아 결과에 영향을 끼치는 정도가 큰 Feature들을 TF-IDF 후 추가함

In [8]:
targets_1= ["No Fee"]
train_df["No_Fee"]=train_df.features.apply(lambda sentence: any(word in sentence for word in targets_1))
train_df.No_Fee=train_df.No_Fee.astype(int)

targets_2= ["Cat Allowed"]
train_df["Cat_Allowed"]=train_df.features.apply(lambda sentence: any(word in sentence for word in targets_2))
train_df.Cat_Allowed=train_df.Cat_Allowed.astype(int)

targets_3= ["Dog Allowed"]
train_df["Dog_Allowed"]=train_df.features.apply(lambda sentence: any(word in sentence for word in targets_3))
train_df.Dog_Allowed=train_df.Dog_Allowed.astype(int)

targets_4= ["Elevator"]
train_df["Elevator"]=train_df.features.apply(lambda sentence: any(word in sentence for word in targets_4))
train_df.Elevator=train_df.Elevator.astype(int)


targets_1= ["No Fee"]
test_df["No_Fee"]=test_df.features.apply(lambda sentence: any(word in sentence for word in targets_1))
test_df.No_Fee=test_df.No_Fee.astype(int)

targets_2= ["Cat Allowed"]
test_df["Cat_Allowed"]=test_df.features.apply(lambda sentence: any(word in sentence for word in targets_2))
test_df.Cat_Allowed=test_df.Cat_Allowed.astype(int)

targets_3= ["Dog Allowed"]
test_df["Dog_Allowed"]=test_df.features.apply(lambda sentence: any(word in sentence for word in targets_3))
test_df.Dog_Allowed=test_df.Dog_Allowed.astype(int)

targets_4= ["Elevator"]
test_df["Elevator"]=test_df.features.apply(lambda sentence: any(word in sentence for word in targets_4))
test_df.Elevator=test_df.Elevator.astype(int)

features_to_use.extend(["No_Fee", "Cat_Allowed", "Dog_Allowed", "Elevator"])

In [10]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


<h3>4. Density and Distance</h3>
- longitude와 latitude를 이용하여 중심지로부터의 거리와 건물들 간의 밀집 정도를 구함

In [9]:
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2)
train_df['num_dist_from_center'] = x.values
x = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2)
test_df['num_dist_from_center'] = x.values
features_to_use.extend(["num_dist_from_center"])


train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)


vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
features_to_use.extend(["density"])


<h3>5. Categorize with Dispaly Address, Manager Id, Building Id and Street Address</h3>
- display address, manager id, building_id, street address를 categorize함

In [11]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

<h2>Train Set, Test Set</h2>

In [12]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

(49352, 227) (74659, 227)


<h2>Logloss and Score</h2>

<h3>Logloss : 0.522379 <br>
Score : 0.53337</h3>