In [401]:
import pandas as pd 
import numpy as np
import seaborn as sns; sns.set(font_scale=1.7) 
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm
import math
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier 
import xgboost as xgb
import re
from sklearn.metrics import log_loss
%matplotlib inline

# Data Cleaning & Feature Engineering

### Read in JSON data, combine into one dataframe total

In [402]:
json_obj = json.load(open('../Data/train.json'))
json_obj.keys()

dict_keys(['display_address', 'photos', 'created', 'description', 'interest_level', 'street_address', 'building_id', 'bathrooms', 'listing_id', 'price', 'longitude', 'features', 'latitude', 'bedrooms', 'manager_id'])

In [403]:
train =pd.DataFrame()
for key in json_obj.keys():
    train[key] = pd.Series(json_obj[key])

In [404]:
json_obj2 = json.load(open('../Data/test.json'))
test = pd.DataFrame()
for key in json_obj2.keys():
    test[key] = pd.Series(json_obj2[key])

In [405]:
train_ori_index = train.index
test_ori_index = test.index

In [406]:
train['label'] = 'Train'
test['label'] = 'Test'
total = pd.concat([train, test], axis = 0)

In [407]:
listing_image = pd.read_csv('listing_image_time.csv')

In [408]:
listing_image.head()

Unnamed: 0,Listing_Id,time_stamp
0,6811957,1479785186
1,6811958,1479786880
2,6811960,1479780964
3,6811964,1479783510
4,6811965,1479786168


In [409]:
total = pd.merge(total, listing_image, left_on = 'listing_id', right_on = 'Listing_Id', how = 'left')

In [410]:
total = total.drop('Listing_Id', axis = 1)

In [411]:
total.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,label,latitude,listing_id,longitude,manager_id,photos,price,street_address,time_stamp
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,Train,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,1478091590
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,Train,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,1478129766
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,Train,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,1478714436
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,Train,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,1478714444
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,Train,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,1478714464


### Covert 'Created' to 'Date' datetime format

In [412]:
import datetime as dt
def get_datetime(strings):
    try:
        date = dt.datetime.strptime(strings, '%Y-%m-%d %H:%M:%S')
        return date
    except:
        return None

In [413]:
total['date'] = total['created'].apply(get_datetime)

In [414]:
def get_month(dt):
    try:
        return dt.month
    except:
        return None

In [415]:
total['month'] = total['date'].apply(get_month)

In [416]:
day_min = total['date'].min()

In [417]:
def get_timediff(date):
    return (date - day_min).days

In [418]:
total['day_diff'] = total['date'].apply(get_timediff)

### Deal with 'photos', 'descriptions'

In [419]:
total['photos_counts'] = total['photos'].apply(len)

In [420]:
total['description_word_count'] = total['description'].apply(lambda x:len(x.split(" ")))

### Deal with Categorical Features: 

In [421]:
from sklearn import preprocessing

In [422]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if total[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            total[f] = lbl.fit_transform(total[f])

### Deal with features

In [423]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [424]:
total.ix[1000,'features']

['Swimming Pool',
 'Roof Deck',
 'Doorman',
 'Elevator',
 'Fitness Center',
 'Laundry in Building',
 'High Speed Internet',
 'Dishwasher',
 'Hardwood Floors',
 'No Fee',
 'Dogs Allowed',
 'Cats Allowed']

In [425]:
total['features2'] = total['features'].apply(lambda x:' '.join(['_'.join(i.split(" ")) for i in x]))

In [426]:
cv = CountVectorizer(stop_words='english', max_features=200)

In [427]:
total_cvfeatures = pd.DataFrame(cv.fit_transform(total['features2']).toarray())
total_cvfeatures.index = total.index

In [428]:
total = pd.concat([total, total_cvfeatures], axis = 1)

In [184]:
#  def clean(text):
#     clean1 = re.sub('[^a-zA-Z0-9]',' ',text)
#     clean2 = re.sub(' +', ' ', clean1)
#     clean3 = clean2.lower()
#     return clean3


In [185]:
## tfidf = TfidfVectorizer(max_features= 500, stop_words= 'english')

#  total['features2'] = total['features'].apply(' '.join)

# # total_features = tfidf.fit_transform(total['features2'])

# total_features = pd.DataFrame(total_features.toarray())
# total_features.index = total.index

# total = pd.concat([total, total_features], axis = 1)



### Deal with price

In [429]:
total['total_room'] = total['bedrooms'] + total['bathrooms']

In [430]:
total['price_per_room'] = total['price'] / total['total_room']

### Deal with manager_id

In [431]:
for month in [4,5,6]:
    month_count = total.ix[total['month'] == month, ['manager_id','listing_id']].groupby('manager_id').count()
    month_count.columns = month_count.columns.str.replace('listing_id', 'listings_by_month_'+str(month))
    total = pd.merge(total, month_count, left_on = 'manager_id', right_index = True, how = 'left')

In [432]:
total['manager_listings'] = total.groupby('manager_id')['listing_id'].transform('count')

In [433]:
total.shape

(124011, 229)

In [434]:
total['listings_by_month_4_perc'] = total['listings_by_month_4'] / total['manager_listings']
total['listings_by_month_5_perc'] = total['listings_by_month_5'] / total['manager_listings']
total['listings_by_month_6_perc'] = total['listings_by_month_6'] / total['manager_listings']

In [435]:
total_2 = total.copy()

In [436]:
total_2.shape

(124011, 232)

In [437]:
total = total_2

In [438]:
total.shape

(124011, 232)

In [439]:
total['average_price_by_manager'] = total.groupby('manager_id')['price'].transform('mean')

In [440]:
for month in [4,5,6]:
    month_price = total.ix[total['month'] == month, ['manager_id','price']].groupby('manager_id').mean()
    month_price.columns = month_price.columns.str.replace('price', 'price_by_month_'+str(month))
    total = pd.merge(total, month_price, left_on = 'manager_id', right_index = True, how = 'left')

In [441]:
total.shape

(124011, 236)

### Add listings count by interest level

In [442]:
from sklearn.model_selection import KFold

In [443]:
kfold = KFold(n_splits=5, shuffle=True)

In [444]:
train = total[total['label'] == 'Train']
test = total[total['label'] == 'Test']

In [445]:
for count, (tr_index, te_index) in enumerate(kfold.split(train)):
    train_tr = train.iloc[tr_index]
    train_val = train.iloc[te_index]
    for level in train['interest_level'].unique():
        listing_count = train_tr.ix[train_tr['interest_level'] == level, ['manager_id','listing_id']].groupby('manager_id').count()
        listing_count.columns = listing_count.columns.str.replace('listing_id', 'listing_count_'+ str(level))
        train_val= pd.merge(train_val,  listing_count, left_on = 'manager_id', right_index = True, how = 'left')
    listing_count_alllevel = train_tr.ix[:,['manager_id','listing_id']].groupby('manager_id').count()
    listing_count_alllevel.columns = listing_count_alllevel.columns.str.replace('listing_id', 'listing_count')
    train_val= pd.merge(train_val,  listing_count_alllevel, left_on = 'manager_id', right_index = True, how = 'left')
    if count == 0:
        train_new = train_val
    else:
        train_new = pd.concat((train_new, train_val), axis = 0)


In [446]:
train_new.shape

(49352, 240)

In [447]:
for level in train['interest_level'].unique():
        listing_count = train.ix[train['interest_level'] == level, ['manager_id','listing_id']].groupby('manager_id').count()
        listing_count.columns = listing_count.columns.str.replace('listing_id', 'listing_count_'+ str(level))
        test= pd.merge(test,  listing_count, left_on = 'manager_id', right_index = True, how = 'left')
listing_count_alllevel = train.ix[:, ['manager_id','listing_id']].groupby('manager_id').count()
listing_count_alllevel.columns = listing_count_alllevel.columns.str.replace('listing_id', 'listing_count')
test = pd.merge(test,  listing_count_alllevel, left_on = 'manager_id', right_index = True, how = 'left')

In [448]:
test.shape

(74659, 240)

In [449]:
train_new['listing_count_high_perc'] = train_new['listing_count_high'] / train_new['listing_count']
train_new['listing_count_medium_perc'] = train_new['listing_count_medium'] / train_new['listing_count']
train_new['listing_count_low_perc'] = train_new['listing_count_low'] / train_new['listing_count']


In [450]:
test['listing_count_high_perc'] = test['listing_count_high'] / test['listing_count']
test['listing_count_medium_perc'] = test['listing_count_medium'] / test['listing_count']
test['listing_count_low_perc'] = test['listing_count_low'] / test['listing_count']


In [451]:
total = pd.concat([train_new, test], axis = 0)

In [452]:
total.shape

(124011, 243)

In [453]:
train = total[total['label'] == 'Train']
test = total[total['label'] == 'Test']

for count, (tr_index, te_index) in enumerate(kfold.split(total[total['label'] == 'Train'])):
    train_tr = train.iloc[tr_index]
    train_val = train.iloc[te_index]
    for level in train['interest_level'].unique():
        price = train_tr.ix[train_tr['interest_level'] == level, ['manager_id','price']].groupby('manager_id').mean()
        price.columns = price.columns.str.replace('price', 'price_average_'+ str(level))
        train_val= pd.merge(train_val,  price, left_on = 'manager_id', right_index = True, how = 'left')  
    if count == 0:
        train_new = train_val
    else:
        train_new = pd.concat((train_new, train_val), axis = 0)


In [454]:
for level in train['interest_level'].unique():
        price = train.ix[train['interest_level'] == level, ['manager_id','price']].groupby('manager_id').mean()
        price.columns = price.columns.str.replace('price', 'price_average_'+ str(level))
        test= pd.merge(test,  price, left_on = 'manager_id', right_index = True, how = 'left')

In [455]:
total = pd.concat([train_new, test], axis = 0)

In [456]:
train = total[total['label'] == 'Train']
test = total[total['label'] == 'Test']

for count, (tr_index, te_index) in enumerate(kfold.split(total[total['label'] == 'Train'])):
    train_tr = train.iloc[tr_index]
    train_val = train.iloc[te_index]
    for level in train['interest_level'].unique():
        price = train_tr.ix[train_tr['interest_level'] == level, ['manager_id','price']].groupby('manager_id').median()
        price.columns = price.columns.str.replace('price', 'price_median_'+ str(level))
        train_val= pd.merge(train_val,  price, left_on = 'manager_id', right_index = True, how = 'left')  
    if count == 0:
        train_new = train_val
    else:
        train_new = pd.concat((train_new, train_val), axis = 0)


In [457]:
for level in train['interest_level'].unique():
        price = train.ix[train['interest_level'] == level, ['manager_id','price']].groupby('manager_id').median()
        price.columns = price.columns.str.replace('price', 'price_median_'+ str(level))
        test= pd.merge(test,  price, left_on = 'manager_id', right_index = True, how = 'left')

In [458]:
total = pd.concat([train_new, test], axis = 0)

In [459]:
total.columns.tolist()

['bathrooms',
 'bedrooms',
 'building_id',
 'created',
 'description',
 'display_address',
 'features',
 'interest_level',
 'label',
 'latitude',
 'listing_id',
 'longitude',
 'manager_id',
 'photos',
 'price',
 'street_address',
 'time_stamp',
 'date',
 'month',
 'day_diff',
 'photos_counts',
 'description_word_count',
 'features2',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,


In [460]:
total.shape

(124011, 249)

In [461]:
## make a copy of the total df after data cleaning
total_copy = total.copy()

### Deal with street_address, display_address, building_id

# Modeling

In [462]:
fe =  ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'day_diff', 'photos_counts', 'description_word_count'
      , "display_address", "manager_id", "building_id", "street_address", 'listing_id', 'time_stamp']

In [463]:
fe2 = list(range(200))

In [464]:
fe3 = ['total_room','price_per_room', 'manager_listings','listings_by_month_4_perc',
       'listings_by_month_5_perc','listings_by_month_6_perc','average_price_by_manager',
       'price_by_month_4','price_by_month_5','price_by_month_6', 'listing_count_high_perc',
      'listing_count_medium_perc','listing_count_low_perc', 'listing_count',
       'price_average_low','price_average_medium','price_average_high','price_median_low',
       'price_median_medium','price_median_high']

In [465]:
col = fe + fe2 + fe3

In [466]:
col.append('interest_level')

In [467]:
train_x = train_new[col]
test_x = test[col]

In [468]:
train_x = train_x.drop([ 'interest_level'], axis = 1)
test_x = test_x.drop([ 'interest_level'], axis = 1)


In [469]:
train_y = train_new['interest_level'].map({'low':2, 'medium':1, 'high':0})

In [472]:
%store total

Stored 'total' (DataFrame)


In [474]:
total.to_csv('total.csv')

## XGBoost

In [475]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model


In [78]:
for x in ['manager_listings',
 'listings_by_month_4_perc',
 'listings_by_month_5_perc',
 'listings_by_month_6_perc',
 'average_price_by_manager',
 'price_by_month_4',
 'price_by_month_5',
 'price_by_month_6','listing_count_alllevel',
 'price_average_low',
 'price_average_medium',
 'price_average_high',
 'price_median_low',
 'price_median_medium',
 'price_median_high',
 'interest_level']:
    col.remove(x)


ValueError: list.remove(x): x not in list

In [471]:
%%time
##Round 3
p, model = runXGB(train_x, train_y, test_x, num_rounds=2000)
out_df = pd.DataFrame(p)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test.listing_id.values
out_df.to_csv("xgb_3.csv", index=False)

KeyboardInterrupt: 

In [337]:
%%time
p, model = runXGB(train_x, train_y, test_x, num_rounds=2000)

CPU times: user 38min 37s, sys: 13.3 s, total: 38min 50s
Wall time: 39min 47s


In [None]:
out_df = pd.DataFrame(p)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test.listing_id.values
out_df.to_csv("xgb_1.csv", index=False)

In [79]:
## rounds = 1000, step = 0.03

In [83]:
%%time
p, model = runXGB(train_x, train_y, test_x, num_rounds=1000)

CPU times: user 18min 39s, sys: 2.83 s, total: 18min 41s
Wall time: 18min 44s


In [84]:
out_df = pd.DataFrame(p)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test.listing_id.values
out_df.to_csv("xgb_2.csv", index=False)

In [359]:
### rounds = 1000, step = 0.03

In [82]:
%%time
cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_x.shape[0])):
        dev_X, val_X = train_x.ix[dev_index,:], train_x.ix[val_index,:]
        dev_y, val_y = train_y.ix[dev_index], train_y.ix[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.07802	test-mlogloss:1.07876
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.05839	test-mlogloss:1.05986
[2]	train-mlogloss:1.03985	test-mlogloss:1.04183
[3]	train-mlogloss:1.02209	test-mlogloss:1.02482
[4]	train-mlogloss:1.00511	test-mlogloss:1.00858
[5]	train-mlogloss:0.988978	test-mlogloss:0.993057
[6]	train-mlogloss:0.974028	test-mlogloss:0.978841
[7]	train-mlogloss:0.959208	test-mlogloss:0.964617
[8]	train-mlogloss:0.944819	test-mlogloss:0.950804
[9]	train-mlogloss:0.93107	test-mlogloss:0.937631
[10]	train-mlogloss:0.917863	test-mlogloss:0.925031
[11]	train-mlogloss:0.905299	test-mlogloss:0.913034
[12]	train-mlogloss:0.893312	test-mlogloss:0.901642
[13]	train-mlogloss:0.882428	test-mlogloss:0.891333
[14]	train-mlogloss:0.871185	test-mlogloss:0.880658
[15]	train-mlogloss:0.860473	test-mlogloss:0.870427
[16]	train-mlogloss:0.850462	test-mlog

In [358]:
### rounds = 2000, step = 0.02

In [293]:
%%time
cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_x.shape[0])):
        dev_X, val_X = train_x.ix[dev_index,:], train_x.ix[val_index,:]
        dev_y, val_y = train_y.ix[dev_index], train_y.ix[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.08489	test-mlogloss:1.08481
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.07186	test-mlogloss:1.07176
[2]	train-mlogloss:1.05937	test-mlogloss:1.05931
[3]	train-mlogloss:1.04713	test-mlogloss:1.04702
[4]	train-mlogloss:1.03507	test-mlogloss:1.03496
[5]	train-mlogloss:1.02393	test-mlogloss:1.0238
[6]	train-mlogloss:1.01271	test-mlogloss:1.01256
[7]	train-mlogloss:1.00195	test-mlogloss:1.00177
[8]	train-mlogloss:0.991179	test-mlogloss:0.990978
[9]	train-mlogloss:0.980853	test-mlogloss:0.980698
[10]	train-mlogloss:0.971284	test-mlogloss:0.971122
[11]	train-mlogloss:0.961602	test-mlogloss:0.961496
[12]	train-mlogloss:0.952262	test-mlogloss:0.952197
[13]	train-mlogloss:0.942946	test-mlogloss:0.942918
[14]	train-mlogloss:0.933957	test-mlogloss:0.934027
[15]	train-mlogloss:0.925512	test-mlogloss:0.925669
[16]	train-mlogloss:0.917042	test-mlogloss:0

## Store result 1

In [352]:
%store p

Stored 'p' (ndarray)


In [354]:
%store train_x

Stored 'train_x' (DataFrame)


In [355]:
%store test_x

Stored 'test_x' (DataFrame)


# Stacking

In [275]:
## https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867

In [476]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [477]:
train_st = train_x.copy()
test_st = test_x.copy()

In [478]:
train_st.shape

(49352, 234)

In [479]:
test_st.shape

(74659, 234)

In [480]:
ntrain = train_st.shape[0]
ntest = test_st.shape[0]

In [481]:
train_test = pd.concat((train_st, test_st)).reset_index(drop=True)

In [482]:
x_train = np.array(train_test.iloc[:ntrain,:])
x_test = np.array(train_test.iloc[ntrain:,:])
y_train = train_y

In [483]:
NFOLDS = 5
SEED = 0
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [484]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def fit2(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict2(self, x):
        return self.clf.predict_proba(x)


In [485]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params['nrounds']

    def fit2(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict2(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [486]:
def get_oof(clf, xtrain, ytrain, xtest):
    oof_train = np.zeros((ntrain,3))
    oof_test = np.zeros((ntest,3))
    oof_test_temp =  np.zeros((ntest,3))
    for i, (train_index, test_index) in enumerate(kf.split(xtrain)):
        x_tr = xtrain.iloc[train_index]
        x_te = xtrain.iloc[test_index]
        y_tr = ytrain.iloc[train_index]
        
        clf.fit2(x_tr, y_tr)
        oof_train[test_index] = clf.predict2(x_te)
        oof_test_temp = clf.predict2(xtest)
        oof_test = oof_test + oof_test_temp
    oof_test = oof_test / NFOLDS
    return oof_train, oof_test

In [487]:
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
et_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
    'max_features': 0.5,
    'max_depth': 10,
    'min_samples_leaf': 2,
}

In [488]:
%%time
et_oof_train, et_oof_test = get_oof(et, train_st_sklearn, y_train, test_st_sklearn)

CPU times: user 4.54 s, sys: 1.15 s, total: 5.69 s
Wall time: 4.35 s


In [489]:
et_oof_train.shape

(49352, 3)

In [490]:
et_oof_train[:5]

array([[ 0.07815705,  0.22718247,  0.69466048],
       [ 0.08014748,  0.23340559,  0.68644693],
       [ 0.07775808,  0.22472896,  0.69751296],
       [ 0.07767902,  0.22819671,  0.69412428],
       [ 0.07729347,  0.22888381,  0.69382272]])

In [491]:
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
rf_params = {
    'n_jobs': 16,
    'n_estimators': 1000,
    'max_features': 0.5,
    'max_depth': 10,
    'min_samples_leaf': 2,
}

In [492]:
%%time
rf_oof_train, rf_oof_test = get_oof(rf, train_st_sklearn, y_train, test_st_sklearn)

CPU times: user 5.22 s, sys: 1.2 s, total: 6.42 s
Wall time: 5.08 s


In [493]:
rf_oof_train.shape

(49352, 3)

In [494]:
rf_oof_train[:5]

array([[ 0.07742713,  0.23507993,  0.68749294],
       [ 0.08596793,  0.23777304,  0.67625902],
       [ 0.07946921,  0.23062626,  0.68990453],
       [ 0.07691489,  0.22776303,  0.69532208],
       [ 0.07675497,  0.23102401,  0.69222102]])

In [310]:
## 
knn_params = {}
knn_params

In [315]:
## http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
svm_params = {}
svm_params['C'] = 1.0
svm_params['Probability'] = False

In [313]:
svm = SklearnWrapper(clf=SVC, seed=SEED, params=svm_params)

In [None]:
%%time
svm_oof_train, svm_oof_test = get_oof(svm, train_st_sklearn, y_train, test_st_sklearn)

In [None]:
svm_oof_train

In [306]:
## http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
nn_params = {}
nn_params['hidden_layer_sizes'] = 100
nn_params['activation'] = 'relu'
nn_params['alpha'] = 0.001
nn_params['max_iter'] = 5000
nn_params['random_state'] = SEED
nn_params['learning_rate_init'] = 0.001
nn_params['early_stopping'] = False


In [307]:
nn = SklearnWrapper(clf=MLPClassifier, seed=SEED, params=nn_params)

In [308]:
%%time
nn_oof_train, nn_oof_test = get_oof(nn, train_st_sklearn, y_train, test_st_sklearn)

CPU times: user 32 s, sys: 4.23 s, total: 36.2 s
Wall time: 30.2 s


In [309]:
nn_oof_train

array([[ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.]])

In [496]:
xgb_params = {}
xgb_params['objective']='multi:softprob'
xgb_params['eta']=0.03
xgb_params['max_depth']=6
xgb_params['silent']=1
xgb_params['num_class']=3
xgb_params['eval_metric']="mlogloss"
xgb_params['min_child_weight']=1
xgb_params['subsample']=0.7
xgb_params['colsample_bytree']=0.7
xgb_params['seed']=0
xgb_params['nrounds']=1000


In [497]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)

In [225]:
train_st_sklearn = train_st.copy()
test_st_sklearn = test_st.copy()

In [228]:
test_st_sklearn = test_st_sklearn.replace(np.inf, np.nan)
train_st_sklearn = train_st_sklearn.replace(np.inf, np.nan)

In [230]:
train_st_sklearn = train_st_sklearn.fillna(train_st_sklearn.median())

test_st_sklearn = test_st_sklearn.fillna(test_st_sklearn.median())

In [499]:
%%time
xg_oof_train, xg_oof_test = get_oof(xg, train_x, train_y, test_x)

CPU times: user 1h 30min 10s, sys: 1min 11s, total: 1h 31min 22s
Wall time: 1h 59min 44s


In [177]:
xg_oof_train.shape

(49352, 3)

In [178]:
xg_oof_test.shape

(74659, 3)

In [500]:
x_train_after_stack = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train), axis=1)
x_test_after_stack = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test), axis=1)

In [503]:
%%time
p, model = runXGB(x_train_after_stack, train_y, x_test_after_stack, num_rounds=1000)
out_df = pd.DataFrame(p)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test.listing_id.values
out_df.to_csv("xgb_4.csv", index=False)

CPU times: user 2min 8s, sys: 219 ms, total: 2min 8s
Wall time: 2min 9s
