# 필요한 모듈 호출

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer



## 1.데이터 호출

In [None]:
train_df = pd.read_json("./data/train.json").sort_values(by='listing_id')
test_df = pd.read_json("./data/test.json").sort_values(by='listing_id')

## 2.XGB 모델 생성

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=3000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=30)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [2]:
test_df["bathrooms"].loc[19671] = 1.5
test_df["bathrooms"].loc[22977] = 2.0
test_df["bathrooms"].loc[63719] = 2.0
train_df["price"] = train_df["price"].clip(upper=13000)

train_df["logprice"] = np.log(train_df["price"])
test_df["logprice"] = np.log(test_df["price"])

train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

train_df['price_per_room'] = train_df['price']/train_df['room_sum']
test_df['price_per_room'] = test_df['price']/test_df['room_sum']

train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

train_df["created_doy"] = train_df["created"].dt.dayofyear
test_df["created_doy"] = test_df["created"].dt.dayofyear

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","price_per_room", "logprice", "density",
"num_photos", "num_features", "num_description_words","listing_id", "created_year", "created_month", "created_day", "created_hour","created_doy"]

## 3.위도, 경도를 활용하여 feature 생성

In [3]:
import math
def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho


def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi


def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)


def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)


def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        #polar coordinates system
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

train_df, test_df = operate_on_coordinates(train_df, test_df)

In [5]:
import s2sphere

lat = train_df.latitude.values.tolist()
lon = train_df.longitude.values.tolist()

cellId1 = []
cellId2 = []
cellId3 = []
for i in range(0,len(lat)):
    p1 = s2sphere.LatLng.from_degrees(lat[i], lon[i])
    cell = s2sphere.CellId.from_lat_lng(p1)
    cid = str(cell.id())
    #print(cid)
    ##cid is a 19 digit number so python storing it as Object, not number
    ##So I am converting it into 3 numbers
    cellId1.append(int(cid[:6]))
    cellId2.append(int(cid[6:12]))
    cellId3.append(int(cid[12:19]))
    

se = pd.Series(cellId1)
train_df['cellId1'] = se.values

se = pd.Series(cellId2)
train_df['cellId2'] = se.values

se = pd.Series(cellId3)
train_df['cellId3'] = se.values

lat = test_df.latitude.values.tolist()
lon = test_df.longitude.values.tolist()

cellId1 = []
cellId2 = []
cellId3 = []
for i in range(0,len(lat)):
    p1 = s2sphere.LatLng.from_degrees(lat[i], lon[i])
    cell = s2sphere.CellId.from_lat_lng(p1)
    cid = str(cell.id())
    #print(cid)
    cellId1.append(int(cid[:6]))
    cellId2.append(int(cid[6:12]))
    cellId3.append(int(cid[12:19]))
    

se = pd.Series(cellId1)
test_df['cellId1'] = se.values

se = pd.Series(cellId2)
test_df['cellId2'] = se.values

se = pd.Series(cellId1)
test_df['cellId3'] = se.values
#selectedFeatures.extend(['cellId1','cellId2','cellId3'])

## 4.Description 요소를 정규화하여 Feature 생성
#### - Description에 대문자 갯수,  줄 수, 수정여부, 이메일 여부, 전화번호가 평가에 영향을 미친다

In [4]:
import re

def cap_share(x):
    return sum(1 for c in x if c.isupper())/float(len(x)+1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)
    
    # how long in lines the desc is?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))
   
    # is the description redacted by the website?        
    df['num_redacted'] = 0
    df['num_redacted'].ix[df['description'].str.contains('website_redacted')] = 1

    
    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df['num_email'].ix[df['description'].str.contains('@')] = 1
    
    #and... can we call them?
    
    reg = re.compile(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## 5.생성된 Feature 확인

In [6]:
train_df.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'logprice', 'price_t', 'room_sum', 'price_per_room',
       'num_photos', 'num_features', 'num_description_words', 'created_year',
       'created_month', 'created_day', 'created_hour', 'created_doy', 'pos',
       'density', 'num_rho', 'num_phi', 'num_rot15_X', 'num_rot15_Y',
       'num_rot30_X', 'num_rot30_Y', 'num_rot45_X', 'num_rot45_Y',
       'num_rot60_X', 'num_rot60_Y', 'num_cap_share', 'num_nr_of_lines',
       'num_redacted', 'num_email', 'num_phone_nr', 'cellId1', 'cellId2',
       'cellId3'],
      dtype='object')

## 6.생성한 feature 추가

In [7]:
features_to_use.append("num_rho")
features_to_use.append("num_phi")
features_to_use.append("num_rot15_X")
features_to_use.append("num_rot15_Y")
features_to_use.append("num_rot30_X")
features_to_use.append("num_rot30_Y")
features_to_use.append("num_rot45_X")
features_to_use.append("num_rot45_Y")
features_to_use.append("num_rot60_X")
features_to_use.append("num_rot60_Y")
features_to_use.append("num_cap_share")
features_to_use.append("num_nr_of_lines")
features_to_use.append("num_redacted")
features_to_use.append("num_email")
features_to_use.append("num_phone_nr")
features_to_use.append("cellId1")
features_to_use.append("cellId2")
features_to_use.append("cellId3")


## 7.Manager ID로 Feature 생성
#### - Manager에 대한 평가가 전체적인 평가에 영향을 미친다

In [8]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

pr=[np.nan]*len(train_df)
for i in range(5):
    building_level={}
    price={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
        price[j]=0
    
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
            price[temp['manager_id']]+=temp['price']

        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
            price[temp['manager_id']]+=temp['price']
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
            price[temp['manager_id']]+=temp['price']
            
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
            pr[j]=price[temp['manager_id']]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c
train_df['avg_price']=pr

a=[]
b=[]
c=[]
pr=[]

building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
    price[j]=0

for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
        price[temp['manager_id']]+=temp['price']

    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
        price[temp['manager_id']]+=temp['price']

    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1
        price[temp['manager_id']]+=temp['price']


for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
        pr.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
        pr.append(price[i]*1.0/sum(building_level[i]))
        
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c
test_df['avg_price']=pr
features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')
features_to_use.append('avg_price')

categorical = ["display_address", "manager_id", "building_id"]
for f in categorical:
        if train_df[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

## 8.CountVectorizer로 출몰 빈도수 높은 feature 추출
#### - 너무 많은 feature를 생성하였기 때문에, feature 수를 줄여줌

In [9]:
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))


## 9.모델 학습
#### - XGB 사용

In [12]:
from sklearn import model_selection, preprocessing, ensemble

cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break
        

[0]	train-mlogloss:1.07798	test-mlogloss:1.07842
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 30 rounds.
[1]	train-mlogloss:1.05811	test-mlogloss:1.05909
[2]	train-mlogloss:1.03957	test-mlogloss:1.04102
[3]	train-mlogloss:1.02245	test-mlogloss:1.02429
[4]	train-mlogloss:1.00523	test-mlogloss:1.00755
[5]	train-mlogloss:0.989158	test-mlogloss:0.991879
[6]	train-mlogloss:0.973429	test-mlogloss:0.976557
[7]	train-mlogloss:0.958882	test-mlogloss:0.962449
[8]	train-mlogloss:0.944618	test-mlogloss:0.948532
[9]	train-mlogloss:0.931016	test-mlogloss:0.935342
[10]	train-mlogloss:0.918237	test-mlogloss:0.922969
[11]	train-mlogloss:0.905633	test-mlogloss:0.910835
[12]	train-mlogloss:0.893453	test-mlogloss:0.899089
[13]	train-mlogloss:0.881882	test-mlogloss:0.887888
[14]	train-mlogloss:0.870744	test-mlogloss:0.877097
[15]	train-mlogloss:0.860179	test-mlogloss:0.866909
[16]	train-mlogloss:0.85011	test-mlog

## 성능 평가

In [14]:
from sklearn.metrics import classification_report

pred_y = np.array([np.argmax(i) for i in preds])
pred_y, y = pd.Series(pred_y), pd.Series(val_y)

print(classification_report(y, pred_y))

             precision    recall  f1-score   support

          0       0.60      0.37      0.46       747
          1       0.54      0.43      0.48      2319
          2       0.83      0.92      0.87      6805

avg / total       0.74      0.76      0.75      9871



In [11]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1550)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("0.53208_7.csv", index=False)

## 제출 결과 0.52952 나옴

<img width="800px" height="500px" src="https://github.com/jinongkim/pycon_image_data/blob/master/0.52952.png?raw=true"></img>
