In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV




In [2]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [3]:
from sklearn import metrics

In [4]:
# Training table; later cells read its 'connection_id' and 'target' columns
# and treat every other column as a feature.
ds = pd.read_csv('../../data/ml4/train_data.csv')

In [25]:
# Test table used for the final predictions; 'connection_id' is read from it
# when the output file is assembled.
ds_test = pd.read_csv('../../data/ml4/test_data.csv')

In [5]:
# Feature columns: everything in the training frame except the id and the label.
predictors = [
    column
    for column in ds.columns
    if column not in ['connection_id','target']
]

In [6]:
def modelfitxg(alg,dtrain,predictors,useTrainCV=False,cv_folds=5,early_stopping_rounds=50,num_class=3):
    """Fit ``alg`` on ``dtrain`` and report training accuracy and feature importance.

    Parameters
    ----------
    alg : estimator
        XGBClassifier-style object. It must provide ``get_xgb_params``,
        ``get_params``, ``set_params``, ``fit``, ``predict`` and either
        ``get_booster()`` or the older ``booster()`` accessor.
    dtrain : pandas.DataFrame
        Frame containing the ``predictors`` columns and a ``'target'`` column.
    predictors : list of str
        Names of the feature columns.
    useTrainCV : bool
        When True, run ``xgb.cv`` first and set ``alg``'s ``n_estimators`` to
        the number of rows in the CV result before fitting.
    cv_folds : int
        ``nfold`` passed to ``xgb.cv``.
    early_stopping_rounds : int
        ``early_stopping_rounds`` passed to ``xgb.cv``.
    num_class : int
        Class count placed in the parameters handed to ``xgb.cv``. Defaults to
        3, the value that used to be hard-coded.

    Returns
    -------
    None
        The function works by side effect: it may update ``alg``'s
        ``n_estimators``, fits ``alg``, prints progress and the accuracy, and
        draws a feature-importance bar chart with matplotlib.

    Notes
    -----
    The reported accuracy is computed on the same rows the model was fitted
    on, so it is a training accuracy, not a validation score.
    """
    if useTrainCV:
        xgb_params = alg.get_xgb_params()
        xgb_params['num_class'] = num_class
        print(xgb_params)
        xgtrain = xgb.DMatrix(dtrain[predictors].values,label=dtrain['target'].values)
        print('prepared Dmatrix')
        cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round = alg.get_params()['n_estimators'],
                         nfold=cv_folds,early_stopping_rounds=early_stopping_rounds)
        # Use the number of rounds kept by the CV run as the estimator count.
        alg.set_params(n_estimators=cvresult.shape[0])
        print(cvresult)

    # Fit the algorithm. No eval_metric is passed: there is no eval_set here,
    # so a metric would never be evaluated during this fit.
    print('Fitting Model')
    alg.fit(dtrain[predictors],dtrain['target'])
    print('Model Fitted')

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])

    print ("Accuracy: %.4f"%metrics.accuracy_score(dtrain['target'].values,dtrain_predictions))

    # Newer xgboost exposes the fitted booster via get_booster(); older
    # releases only have booster(). Support both.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar',title='Feature Importance')
    plt.ylabel('Feature Importance Score')

In [9]:
# Baseline classifier with a multi-class softmax objective; the same keyword
# values as before, grouped by purpose.
xgb1 = XGBClassifier(
    # task
    objective='multi:softmax',
    scale_pos_weight=1,
    # boosting schedule
    n_estimators=500,
    learning_rate=0.1,
    # tree shape
    max_depth=6,
    min_child_weight=5,
    gamma=0.1,
    # row / column sampling
    subsample=0.9,
    colsample_bytree=0.7,
    # regularisation
    reg_alpha=0.05,
    # runtime / reproducibility
    nthread=4,
    seed=27,
)

In [10]:
# Booster parameter dict taken from xgb1. The grid-search cells below overwrite
# individual entries of this dict in place before each xgb.cv call.
params = xgb1.get_xgb_params()

In [11]:
# Set the class count explicitly before `params` is handed to xgb.cv
# (the objective configured on xgb1 is 'multi:softmax').
params['num_class'] = 3
# Display the resulting parameter dict.
params

{'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 0.7,
 'gamma': 2,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 5,
 'missing': None,
 'n_estimators': 500,
 'nthread': 4,
 'num_class': 3,
 'objective': 'multi:softmax',
 'reg_alpha': 0.05,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 27,
 'silent': 1,
 'subsample': 0.9}

In [12]:
# Training data as an xgboost DMatrix (feature values plus 'target' labels);
# every xgb.cv call in the grid-search cells reuses this object.
xgtrain = xgb.DMatrix(ds[predictors].values,label=ds['target'].values)

In [12]:
# 5 depths x 4 child weights = 20 (max_depth, min_child_weight) candidates.
gridsearch_params = [(depth, weight) for depth in range(4, 9) for weight in range(4, 8)]

In [15]:
# Running best (lowest) CV error used by the grid-search cells. Starting at
# +inf means the first evaluated combination compares as an improvement.
min_mae = float("Inf")

In [None]:
# Grid search over (max_depth, min_child_weight) with 5-fold xgb.cv.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None and the final print fails.
#
# NOTE: `params` is left at the last combination tried, not the best one.
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=20)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, merror: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=4, min_child_weight=4

	MAE 0.2191606 for 119 rounds
CV with max_depth=4, min_child_weight=6
	MAE 0.2191074 for 81 rounds
CV with max_depth=5, min_child_weight=6
	MAE 0.2190838 for 100 rounds
CV with max_depth=6, min_child_weight=4
	MAE 0.21906600000000004 for 60 rounds
CV with max_depth=6, min_child_weight=5
	MAE 0.21904240000000003 for 73 rounds
CV with max_depth=6, min_child_weight=6
	MAE 0.21906620000000002 for 63 rounds
CV with max_depth=6, min_child_weight=7
	MAE 0.21910159999999998 for 60 rounds
CV with max_depth=7, min_child_weight=4
	MAE 0.2190782 for 60 rounds
CV with max_depth=7, min_child_weight=5
	MAE 0.21906019999999998 for 61 rounds
CV with max_depth=7, min_child_weight=6
	MAE 0.2190722 for 72 rounds
CV with max_depth=7, min_child_weight=7
	MAE 0.21907800000000002 for 58 rounds
CV with max_depth=8, min_child_weight=4
	MAE 0.21907800000000002 for 69 rounds
CV with max_depth=8, min_child_weight=5
	MAE 0.2191072 for 42 rounds
CV with max_depth=8, min_child

In [22]:
# 4 depths x 3 child weights = 12 (max_depth, min_child_weight) candidates.
gridsearch_params = [(depth, weight) for depth in range(7, 11) for weight in range(6, 9)]

In [25]:
# Grid search over (max_depth, min_child_weight) with 5-fold xgb.cv and a
# 10-round early-stopping window.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None and the final print fails.
#
# NOTE: `params` is left at the last combination tried, not the best one.
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=10)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, merror: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=7, min_child_weight=6
	MAE 0.2190782 for 46 rounds
CV with max_depth=7, min_child_weight=7
	MAE 0.21907800000000002 for 58 rounds
CV with max_depth=7, min_child_weight=8
	MAE 0.2190778 for 62 rounds
CV with max_depth=8, min_child_weight=6
	MAE 0.21910779999999996 for 46 rounds
CV with max_depth=8, min_child_weight=7
	MAE 0.21913100000000002 for 41 rounds
CV with max_depth=8, min_child_weight=8
	MAE 0.2191724 for 33 rounds
CV with max_depth=9, min_child_weight=6
	MAE 0.21914280000000003 for 24 rounds
CV with max_depth=9, min_child_weight=7
	MAE 0.2190956 for 44 rounds
CV with max_depth=9, min_child_weight=8
	MAE 0.21914899999999998 for 30 rounds
CV with max_depth=10, min_child_weight=6
	MAE 0.2190956 for 43 rounds
CV with max_depth=10, min_child_weight=7
	MAE 0.2190896 for 41 rounds
CV with max_depth=10, min_child_weight=8
	MAE 0.2191252 for 28 rounds
Best params: 7, 8, MAE: 0.2190778


In [17]:
# (subsample, colsample_bytree) candidates: subsample in {0.9, 1.0},
# colsample_bytree in {0.7, 0.8, 0.9, 1.0}.
grid_search_params2 = [
    (row_tenths / 10, col_tenths / 10)
    for row_tenths in range(9, 11)
    for col_tenths in range(7, 11)
]

In [14]:
grid_search_params2

[(0.7, 0.7),
 (0.7, 0.8),
 (0.7, 0.9),
 (0.7, 1.0),
 (0.8, 0.7),
 (0.8, 0.8),
 (0.8, 0.9),
 (0.8, 1.0),
 (0.9, 0.7),
 (0.9, 0.8),
 (0.9, 0.9),
 (0.9, 1.0),
 (1.0, 0.7),
 (1.0, 0.8),
 (1.0, 0.9),
 (1.0, 1.0)]

In [16]:
# Grid search over (subsample, colsample_bytree) with 5-fold xgb.cv.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None and the final print fails.
#
# NOTE: `params` is left at the last combination tried, not the best one.
min_mae = float("Inf")
best_params = None
for subsample, colsample in grid_search_params2:
    # Label the values with the parameters actually being varied.
    print("CV with subsample={}, colsample_bytree={}".format(
                             subsample,
                             colsample))

    # Update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=20)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

print("Best params: {}, {}, merror: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=0.7, min_child_weight=0.7
	MAE 0.21911339999999999 for 86 rounds
CV with max_depth=0.7, min_child_weight=0.8
	MAE 0.2190602 for 110 rounds
CV with max_depth=0.7, min_child_weight=0.9
	MAE 0.21906019999999998 for 74 rounds
CV with max_depth=0.7, min_child_weight=1.0
	MAE 0.21910179999999996 for 62 rounds
CV with max_depth=0.8, min_child_weight=0.7
	MAE 0.21907800000000002 for 69 rounds
CV with max_depth=0.8, min_child_weight=0.8
	MAE 0.21904839999999998 for 83 rounds
CV with max_depth=0.8, min_child_weight=0.9
	MAE 0.2190664 for 68 rounds
CV with max_depth=0.8, min_child_weight=1.0
	MAE 0.21907799999999997 for 75 rounds
CV with max_depth=0.9, min_child_weight=0.7
	MAE 0.21903679999999998 for 87 rounds
CV with max_depth=0.9, min_child_weight=0.8


KeyboardInterrupt: 

In [None]:
# Grid search over (subsample, colsample_bytree) with 5-fold xgb.cv and a
# 10-round early-stopping window.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None and the final print fails.
#
# NOTE: `params` is left at the last combination tried, not the best one.
min_mae = float("Inf")
best_params = None
for subsample, colsample in grid_search_params2:
    # Label the values with the parameters actually being varied.
    print("CV with subsample={}, colsample_bytree={}".format(
                             subsample,
                             colsample))

    # Update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=10)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

print("Best params: {}, {}, merror: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=0.9, min_child_weight=0.7
	MAE 0.21903679999999998 for 87 rounds
CV with max_depth=0.9, min_child_weight=0.8
	MAE 0.2190956 for 64 rounds
CV with max_depth=0.9, min_child_weight=0.9
	MAE 0.2190722 for 62 rounds
CV with max_depth=0.9, min_child_weight=1.0
	MAE 0.21911339999999999 for 46 rounds
CV with max_depth=1.0, min_child_weight=0.7
	MAE 0.2190484 for 63 rounds
CV with max_depth=1.0, min_child_weight=0.8


In [7]:
# Finer 4 x 4 grid of (subsample, colsample_bytree) candidates.
grid_search_params3 = [
    (row_fraction, col_fraction)
    for row_fraction in (0.8, 0.85, 0.9, 0.95)
    for col_fraction in (0.7, 0.75, 0.8, 0.85)
]

In [19]:
# Grid search over (subsample, colsample_bytree) with 5-fold xgb.cv and a
# 10-round early-stopping window.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None and the final print fails.
#
# NOTE: `params` is left at the last combination tried, not the best one.
min_mae = float("Inf")
best_params = None
for subsample, colsample in grid_search_params3:
    print("CV with subsample={}, colsample_bytree={}".format(
                             subsample,
                             colsample))

    # Update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=10)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    # Average of the per-round test error over all rounds in the CV table
    print("Mean merror:",cvresult['test-merror-mean'].mean())
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

print("Best params: {}, {}, merror: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=0.8, colsample_bytree=0.7
	MAE 0.21907800000000002 for 69 rounds
Mean MAE: 0.219528748571
CV with subsample=0.8, colsample_bytree=0.75
	MAE 0.21914280000000003 for 59 rounds
Mean MAE: 0.219614466667
CV with subsample=0.8, colsample_bytree=0.8
	MAE 0.21904839999999998 for 83 rounds
Mean MAE: 0.219432311905
CV with subsample=0.8, colsample_bytree=0.85


KeyboardInterrupt: 

In [21]:
# The 4 x 4 (subsample, colsample_bytree) grid with its first three
# combinations dropped, i.e. starting from (0.8, 0.85).
grid_search_params3 = [
    (row_fraction, col_fraction)
    for row_fraction in (0.8, 0.85, 0.9, 0.95)
    for col_fraction in (0.7, 0.75, 0.8, 0.85)
][3:]





In [22]:
# Grid search over (subsample, colsample_bytree) with 5-fold xgb.cv and a
# 10-round early-stopping window.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None and the final print fails. The reported
# best therefore covers only the combinations in the current
# grid_search_params3.
#
# NOTE: `params` is left at the last combination tried, not the best one.
min_mae = float("Inf")
best_params = None
for subsample, colsample in grid_search_params3:
    print("CV with subsample={}, colsample_bytree={}".format(
                             subsample,
                             colsample))

    # Update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=10)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    # Average of the per-round test error over all rounds in the CV table
    print("Mean merror:",cvresult['test-merror-mean'].mean())
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

print("Best params: {}, {}, merror: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=0.8, colsample_bytree=0.85
	MAE 0.21907799999999997 for 73 rounds
Mean MAE: 0.219475962162
CV with subsample=0.85, colsample_bytree=0.7
	MAE 0.21906019999999998 for 78 rounds
Mean MAE: 0.219474982278
CV with subsample=0.85, colsample_bytree=0.75
	MAE 0.2190426 for 79 rounds
Mean MAE: 0.219463815
CV with subsample=0.85, colsample_bytree=0.8
	MAE 0.21916639999999998 for 37 rounds
Mean MAE: 0.219783647368
CV with subsample=0.85, colsample_bytree=0.85
	MAE 0.21904240000000003 for 73 rounds
Mean MAE: 0.219471575676
CV with subsample=0.9, colsample_bytree=0.7
	MAE 0.21903679999999998 for 87 rounds
Mean MAE: 0.219429736364
CV with subsample=0.9, colsample_bytree=0.75
	MAE 0.21906620000000002 for 75 rounds
Mean MAE: 0.219488671053
CV with subsample=0.9, colsample_bytree=0.8
	MAE 0.2190956 for 64 rounds
Mean MAE: 0.219501058462
CV with subsample=0.9, colsample_bytree=0.85
	MAE 0.2191312 for 39 rounds
Mean MAE: 0.219788395
CV with subsample=0.95, colsample_bytree=0.7
	MAE 0.219

In [None]:
# Fit xgb1 on the full training frame. useTrainCV keeps its default (False),
# so n_estimators is not adjusted by a CV run before fitting.
# NOTE(review): xgb1 still carries the hyperparameters from its constructor; the
# grid-search cells only modify the separate `params` dict -- confirm this is intended.
modelfitxg(xgb1,ds,predictors)

Fitting Model
Model Fitted
Accuracy: 0.7819


In [39]:
# Predicted labels for the test rows, using the same feature columns as training.
test_targets = xgb1.predict(ds_test[predictors])

In [38]:
# Identifier column of the test rows; paired with the predictions when the
# output file is built.
test_connections = ds_test['connection_id']

In [35]:
# Per-feature scores from the fitted booster, highest first.
# Newer xgboost exposes the booster via get_booster(); older releases only
# have booster(). Support both.
feat_imp = pd.Series(
    (xgb1.get_booster() if hasattr(xgb1, 'get_booster') else xgb1.booster()).get_fscore()
).sort_values(ascending=False)

In [29]:
feat_imp

cat_20     1688
cat_2       917
cont_2      797
cat_23      764
cat_21      687
cont_12     518
cont_11     484
cont_8      467
cont_3      440
cont_9      370
cont_17     314
cat_22      300
cont_1      293
cont_13     253
cont_14     194
cat_3       163
cat_7       138
cat_1       120
cont_15     120
cont_4       99
cont_6       94
cont_16      79
cat_9        59
cont_10      51
cont_5       40
cont_18      39
cat_10       33
cat_5        30
cont_7       23
cat_19       17
cat_8        11
cat_13        2
dtype: int64

In [36]:
feat_imp

cat_20     1673
cat_2       932
cont_2      845
cat_23      802
cat_21      663
cont_12     530
cont_8      503
cont_11     487
cont_3      446
cont_9      380
cont_17     332
cont_1      303
cont_13     302
cat_22      275
cont_14     206
cat_7       158
cont_15     137
cont_4      121
cat_1       117
cont_6      105
cont_16     100
cat_9        77
cont_5       63
cont_18      57
cont_10      55
cat_5        29
cat_10       24
dtype: int64

In [8]:
# Candidate gamma values for the gamma search.
grid_search_params4 = [0.5, 0.6, 0.7, 0.8, 1]

In [17]:
# Search over gamma with 5-fold xgb.cv and a 10-round early-stopping window.
#
# `min_mae` / `mean_mae` hold the multi-class error rate ('test-merror-mean'),
# not a mean absolute error; the variable names are kept because other cells
# use them.
#
# The running best is reset here so the result does not depend on which
# searches ran earlier. Without the reset, a grid that never beats a previous
# search leaves best_params as None.
#
# NOTE: `params['gamma']` is left at the last value tried, not the best one.
min_mae = float("Inf")
best_params = None
for gamma in grid_search_params4:
    print("CV with gamma={}".format(gamma))

    # Update our parameters
    params['gamma'] = gamma

    # Run CV
    cvresult = xgb.cv(params, xgtrain, num_boost_round = xgb1.get_params()['n_estimators'],
                         nfold=5,early_stopping_rounds=10)

    # Update best error; boost_rounds is the 0-based row of the best round
    mean_mae = cvresult['test-merror-mean'].min()
    boost_rounds = cvresult['test-merror-mean'].argmin()
    print("\tmerror {} for {} rounds".format(mean_mae, boost_rounds))
    # Average of the per-round test error over all rounds in the CV table
    print("Mean merror:",cvresult['test-merror-mean'].mean())
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = gamma

print("Best params: {}, merror: {}".format(best_params, min_mae))

CV with gamma=0
	MAE 0.21913739999999998 for 55 rounds
Mean MAE: 0.219638396429
CV with gamma=0.1
	MAE 0.21912499999999996 for 59 rounds
Mean MAE: 0.219602306667
CV with gamma=0.2
	MAE 0.2191312 for 55 rounds
Mean MAE: 0.219636292857
CV with gamma=0.3
	MAE 0.21920199999999998 for 36 rounds
Mean MAE: 0.219870610811
CV with gamma=0.4
	MAE 0.21912539999999997 for 69 rounds
Mean MAE: 0.219537014286
Best params: 0.1, MAE: 0.21912499999999996


In [40]:
# Pair each test connection id with its predicted label, in row order.
# zip() walks both sequences positionally, so this does not rely on
# test_connections having a default 0..n-1 index the way test_connections[i] did.
output = list(zip(test_connections, test_targets))

In [41]:
# Write the (connection_id, target) pairs as CSV.
# comments='' is required: by default np.savetxt prefixes the header with '# ',
# which would make the first line '# connection_id,target' instead of a plain
# CSV header row.
np.savetxt('./ml4output9.csv',output,fmt='%s,%s',delimiter=',',header='connection_id,target',comments='')