# Import libraries

In [577]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful

# Download data from competition's page

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/data

# Load data using pandas

In [578]:
train = pd.read_csv('competition_zero/train2.csv')
test = pd.read_csv('competition_zero/test2.csv')
sample_submission = pd.read_csv('competition_zero/sample_submission.csv')

# Data

In [285]:
# print first row
train[:1]

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True


In [286]:
test[:1]

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161


In [287]:
# Target variable is "target" and this means we will be predicting it
sample_submission[:1]

Unnamed: 0,Id,target
0,0,0.5


## Quick look at the unique values in data...

In [288]:
for c in train.columns:
    print (c, train[c].unique()[:5])

year [2998 2999 3000 3001 3002]
day [19 28 30 31 33]
team1 [317  61 110 352 229]
team2 [131  29 141 146  91]
score1 [336 301 359 309 332]
score2 [278 259 267 410 220]
target [True False]


# Cross-validation

### Lets split data randomly to train and validatation. We will train our algorithms on selected train set and validate them on validation set. Easy as it can be!

In [289]:
# train size
train.shape 

(101609, 7)

train is quite big, so for example purposes we'll sample only part of it

In [423]:
from sklearn.cross_validation import ShuffleSplit, train_test_split

for itr, ite in ShuffleSplit(len(train), n_iter=1, train_size=0.75, test_size=0.25, random_state=42):
    pass

information about all functions can be found on the internet, for example

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

In [359]:
# or you can open it in you Jupyter notebook executing function in this manner
?ShuffleSplit()

In [424]:
len(itr), len(ite)

(76206, 25403)

In [425]:
itr[:5], ite[:5]

(array([64794, 42486, 54539, 39466, 36905]),
 array([73749, 26995, 52944, 55854, 35538]))

now we have validation set "ite" to check the quality of our solution

# features and target

In [362]:
sample_submission[:2]

Unnamed: 0,Id,target
0,0,0.5
1,1,0.5


we need to change 'target' column in "sample_submission" to our predictions.

For now we will select only features that are present in both train and test:

In [363]:
features = []
for c in train.columns:
    if c in test.columns and c!='target' and c!='year':
        features += [c]
        print ('"{}" is present in test and train'.format(c))
    else:
        print ('"{}" is NOT present in test'.format(c))
        
features

"year" is NOT present in test
"day" is NOT present in test
"team1" is present in test and train
"team2" is present in test and train
"score1" is NOT present in test
"score2" is NOT present in test
"target" is NOT present in test


['team1', 'team2']

here we split train on "train" and "validation" parts

In [426]:
xtrain = train.loc[itr, features]    
ytrain = train.loc[itr, 'target']

xval = train.loc[ite, features]
yval = train.loc[ite, 'target']

# Baseline solution

lets make baseline first by predicting the mean value

In [365]:
train.target.mean()

0.50096940231672393

In [334]:
constant_prediction = yval * 0 + train.target.mean()
constant_prediction = constant_prediction.values
constant_prediction

array([ 0.5009694,  0.5009694,  0.5009694, ...,  0.5009694,  0.5009694,
        0.5009694])

In [335]:
log_loss(yval, constant_prediction)

0.6931565015839517

In [336]:
submission = sample_submission.copy()
submission.target = train['target'].mean() # notice here that we can refer to a column 'target' in two ways
submission.to_csv('constant_submission.csv', index=False)

Now this should score like "Baseline - Constant" on Leaderboard!
You can submit this by going to 

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/submissions/attach

# Machine learning

Finally, lets try machine learning!

In [337]:
alg = linear_model.LogisticRegression()
alg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [338]:
prediction = alg.predict_proba(xval)[:,1]

In [339]:
log_loss(yval, prediction)

0.69278158850082583

### Well, not so far from the constant solution... Let's try to understand why.

What's a linear model such as LogisticRegression is trying to do is multiply each variable on some coefficient and add add it up, in our case:

y_predicted = column1 \* coef1 + column2 \* coef2 + column3 \* coef3 + bias

We can print coefficients and bias:

In [340]:
alg.coef_, alg.intercept_

(array([[ 0.00033022, -0.00029718]]), array([  9.62607300e-08]))

But clearly, "team1" and "team2" are _categorical_ columns, just like names of the teams. 

So we need to turn "team" columns to something linear algorithm can work with. For example first few rows from here

In [341]:
train.loc[:2, 'team1']

0    317
1     61
2    110
Name: team1, dtype: int64

To this:

In [342]:
pd.get_dummies(train.loc[:2, 'team1'])

Unnamed: 0,61,110,317
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0


So each team name now has it's own column. Read about "pd.get_dummies" here:

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

### But let's come back to more interesting stuff for now
### We are competition's solvers, remember? Lets dive into the space of more complicated models!

In [343]:
alg = ensemble.RandomForestClassifier(15, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [344]:
log_loss(yval, prediction)

1.602708141747019

Surprisingly, this doesn't work very well. Now, like competition pro, let's make our models bigger!

In [345]:
alg = ensemble.RandomForestClassifier(150, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [346]:
log_loss(yval, prediction)

1.0174345124319784

### Almost there! But for now let's skip this model too and go to _real_ competitions stuff

In [347]:
import xgboost

In [348]:
str_team1 = [str(i) for i in train['team1'].values]
str_team2 = [str(i) for i in train['team2'].values]
str_team1_test = [str(i) for i in test['team1'].values]
str_team2_test = [str(i) for i in test['team2'].values]
df_str_team1 = pd.DataFrame(data=str_team1, columns=['team1'])
df_str_team2 = pd.DataFrame(data=str_team2, columns=['team2'])
df_str_team1_test = pd.DataFrame(data=str_team1_test, columns=['team1'])
df_str_team2_test = pd.DataFrame(data=str_team2_test, columns=['team2'])

In [349]:
X_train = train[['team1', 'team2', 'target']]
X_test = test[['team1', 'team2']]
X_train_cat = pd.concat([df_str_team1, df_str_team2], axis=1)
X_test_cat = pd.concat([df_str_team1_test, df_str_team2_test], axis=1)

In [350]:
from sklearn.feature_extraction import DictVectorizer as DV
encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_train_cat.T.to_dict().values())
X_test_cat_oh = encoder.transform(X_test_cat.T.to_dict().values())

In [351]:
X_cat_oh.shape

(101609, 706)

In [352]:
y = X_train['target'].values

In [353]:
delim = int(0.2 * X_cat_oh.shape[0])
delim

20321

In [354]:
xtrain = X_cat_oh[:delim, :]
ytrain = y[:delim]
xval = X_cat_oh[delim:, :]
yval = y[delim:]

In [355]:
xtrain.shape

(20321, 706)

In [453]:
percentage_of_win = {}
for year, day, team1, team2, score1, score2, target in train.values:
    if team1 not in percentage_of_win:
        percentage_of_win[team1] = [int(target), 0, 1, 0]
    else:
        percentage_of_win[team1][0] += int(target)
        percentage_of_win[team1][2] += 1
    if team2 not in percentage_of_win:
        percentage_of_win[team1] = [0, 1 - int(target), 0, 1]
    else:
        percentage_of_win[team1][1] += (1 - int(target))
        percentage_of_win[team1][3] += 1

In [463]:
test[test['team2']==113]

Unnamed: 0,Id,year,team1,team2
278,278,3021,303,113
519,519,3020,349,113
691,691,3021,154,113
781,781,3020,258,113
907,907,3020,265,113
924,924,3020,155,113
1173,1173,3020,281,113
1418,1418,3020,134,113
1641,1641,3021,226,113
1781,1781,3020,316,113


In [459]:
new_train = []
for row in train.values:
    a = list(row) + [percentage_of_win[row[2]][0]/percentage_of_win[row[2]][2], percentage_of_win[row[3]][1]/percentage_of_win[row[3]][3]]
    new_train.append(a)
new_train

[[2998, 19, 317, 131, 336, 278, True, 0.6871165644171779, 0.38484848484848483],
 [2998, 28, 61, 29, 301, 259, True, 0.5050505050505051, 0.44904458598726116],
 [2998, 28, 110, 141, 359, 267, True, 0.5253731343283582, 0.5047021943573667],
 [2998, 28, 352, 146, 309, 410, False, 0.5841584158415841, 0.3088235294117647],
 [2998, 28, 229, 91, 332, 220, True, 0.6873065015479877, 0.48589341692789967],
 [2998,
  28,
  164,
  238,
  236,
  278,
  False,
  0.38738738738738737,
  0.3106508875739645],
 [2998, 28, 184, 243, 181, 224, False, 0.4358974358974359, 0.3853820598006645],
 [2998, 28, 245, 23, 216, 185, True, 0.6314363143631436, 0.46394984326018807],
 [2998, 28, 300, 349, 402, 321, True, 0.6363636363636364, 0.43790849673202614],
 [2998, 30, 61, 110, 259, 325, False, 0.5050505050505051, 0.4732142857142857],
 [2998, 30, 229, 245, 294, 185, True, 0.6873065015479877, 0.3675675675675676],
 [2998, 30, 300, 243, 220, 178, True, 0.6363636363636364, 0.3853820598006645],
 [2998, 31, 10, 310, 282, 232, 

In [464]:
new_test = []
for row in test.values:
    b = []
    if row[2] not in percentage_of_win:
        b.append(0)
    else:
        b.append(percentage_of_win[row[2]][0]/percentage_of_win[row[2]][2])
    if row[3] not in percentage_of_win:
        b.append(0)
    else:
        b.append(percentage_of_win[row[3]][1]/percentage_of_win[row[3]][3])
    a = list(row) + b
    new_test.append(a)
new_test

[[0, 3021, 363, 161, 0.42857142857142855, 0.4491017964071856],
 [1, 3021, 286, 2, 0.5953079178885631, 0.6185567010309279],
 [2, 3020, 232, 52, 0.5520504731861199, 0.7738853503184714],
 [3, 3020, 84, 11, 0.46568627450980393, 0.5016722408026756],
 [4, 3021, 305, 39, 0.47928994082840237, 0.2774566473988439],
 [5, 3020, 159, 152, 0.39039039039039036, 0.5705882352941176],
 [6, 3021, 198, 181, 0.4414715719063545, 0.3415384615384615],
 [7, 3021, 353, 221, 0.5623003194888179, 0.5813253012048193],
 [8, 3020, 364, 363, 0.3310104529616725, 0.5714285714285714],
 [9, 3020, 113, 105, 0, 0.6143790849673203],
 [10, 3020, 225, 168, 0.5681818181818182, 0.3508771929824561],
 [11, 3020, 198, 164, 0.4414715719063545, 0.6160714285714286],
 [12, 3020, 256, 83, 0.5783132530120482, 0.6006944444444444],
 [13, 3020, 200, 133, 0.27631578947368424, 0.4375],
 [14, 3021, 270, 217, 0.4492753623188406, 0.6052631578947368],
 [15, 3021, 152, 2, 0.4294117647058823, 0.6185567010309279],
 [16, 3020, 184, 148, 0.43589743589

In [498]:
x_train_new = pd.DataFrame(data=new_train, columns=['year', 'day', 'team1', 'team2', 'score1', 'score2', 'target', 'p1', 'p2'])

In [499]:
x_test_new = pd.DataFrame(data=new_test, columns=['ind', 'year', 'team1', 'team2', 'p1', 'p2'])

# Logistic regression

In [548]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [550]:
X_train_p1p2 = x_train_new[['team1', 'team2', 'p1', 'p2']].values
X_train = x_train_new[['team1', 'team2']].values
y_train = x_train_new['target'].values
X_test_p1p2 = x_test_new[['team1', 'team2', 'p1', 'p2']].values
X_test = x_test_new[['team1', 'team2']].values

In [565]:
reg = LogisticRegression()

In [567]:
?LogisticRegression

In [566]:
reg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [554]:
reg.fit(X_train_p1p2, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring='log_loss', solver='lbfgs', tol=0.0001,
           verbose=0)

In [559]:
arr = reg.predict_proba(X_test_p1p2)[:, 1]

In [572]:
param_grid = {
    'C': [0.01, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 3, 4, 5, 7, 9, 10],
    'penalty': ['l2'],
    'max_iter': np.linspace(50, 150, 5, dtype=int),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
}

In [573]:
opt_reg = GridSearchCV(reg, param_grid, scoring='log_loss', cv=3)

In [574]:
opt_reg.fit(X_train_p1p2, y_train)



GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'max_iter': array([ 50,  75, 100, 125, 150]), 'C': [0.01, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 3, 4, 5, 7, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [576]:
opt_reg.best_params_

{'C': 10, 'max_iter': 75, 'penalty': 'l2', 'solver': 'lbfgs'}

Xgboost parameters

https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [427]:
param = {}
param['max_depth'] = 8
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.01

numround = 200

In [370]:
from sklearn.grid_search import GridSearchCV

In [371]:
clf = xgboost.XGBClassifier()

In [386]:
param_grid = {
    'max_depth': np.linspace(1, 15, 15, dtype=int),
    'n_estimators': np.linspace(10, 150, 15, dtype=int),
}

In [387]:
opt = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='log_loss', cv=5)

In [388]:
opt.fit(xtrain.values, ytrain.values)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), 'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150])},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [390]:
opt.best_score_

-0.65431641172351407

In [391]:
opt.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [485]:
#[1999]	train-logloss:0.585557	eval-logloss:0.634051
param = {}
param['max_depth'] = 9
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.1

numround = 340

In [486]:
Xdatatrain = xgboost.DMatrix(data = xtrain, label = ytrain)
Xdatatest = xgboost.DMatrix(data = xval, label = yval)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
print(bst.best_iteration)
# ypredxgb_tr = bst.predict(Xdatatrain)

[0]	train-logloss:0.690034	eval-logloss:0.690888
[10]	train-logloss:0.668431	eval-logloss:0.675843
[20]	train-logloss:0.654625	eval-logloss:0.666155
[30]	train-logloss:0.646366	eval-logloss:0.661209
[40]	train-logloss:0.640452	eval-logloss:0.657902
[50]	train-logloss:0.633774	eval-logloss:0.654248
[60]	train-logloss:0.629010	eval-logloss:0.651628
[70]	train-logloss:0.624447	eval-logloss:0.649329
[80]	train-logloss:0.620285	eval-logloss:0.647191
[90]	train-logloss:0.614487	eval-logloss:0.644127
[100]	train-logloss:0.611106	eval-logloss:0.642714
[110]	train-logloss:0.606981	eval-logloss:0.640835
[120]	train-logloss:0.603590	eval-logloss:0.639596
[130]	train-logloss:0.600450	eval-logloss:0.638167
[140]	train-logloss:0.597954	eval-logloss:0.637420
[150]	train-logloss:0.594607	eval-logloss:0.636128
[160]	train-logloss:0.591651	eval-logloss:0.634989
[170]	train-logloss:0.589069	eval-logloss:0.634360
[180]	train-logloss:0.586721	eval-logloss:0.633644
[190]	train-logloss:0.584725	eval-logloss:

339


[339]	train-logloss:0.561843	eval-logloss:0.630689


In [526]:
#[1999]	train-logloss:0.585557	eval-logloss:0.634051
param = {}
param['max_depth'] = 10
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.01

numround = 320

In [527]:
features_new = features + ['p1', 'p2']
features_new

['team1', 'team2', 'p1', 'p2']

In [528]:
xtrain_new = x_train_new.loc[itr, features_new]    
ytrain_new = x_train_new.loc[itr, 'target']

xval_new = x_train_new.loc[ite, features_new]
yval_new = x_train_new.loc[ite, 'target']

In [529]:
Xdatatrain = xgboost.DMatrix(data = xtrain_new, label = ytrain_new)
Xdatatest = xgboost.DMatrix(data = xval_new, label = yval_new)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
print(bst.best_iteration)
# ypredxgb_tr = bst.predict(Xdatatrain)

[0]	train-logloss:0.691577	eval-logloss:0.691828
[10]	train-logloss:0.677617	eval-logloss:0.679808
[20]	train-logloss:0.665856	eval-logloss:0.669987
[30]	train-logloss:0.655762	eval-logloss:0.661771
[40]	train-logloss:0.647091	eval-logloss:0.654905
[50]	train-logloss:0.639557	eval-logloss:0.649195
[60]	train-logloss:0.633071	eval-logloss:0.644432
[70]	train-logloss:0.627431	eval-logloss:0.640396
[80]	train-logloss:0.622476	eval-logloss:0.636988
[90]	train-logloss:0.618045	eval-logloss:0.634139
[100]	train-logloss:0.614114	eval-logloss:0.631662
[110]	train-logloss:0.610636	eval-logloss:0.629574
[120]	train-logloss:0.607542	eval-logloss:0.627845
[130]	train-logloss:0.604807	eval-logloss:0.626440
[140]	train-logloss:0.602308	eval-logloss:0.625199
[150]	train-logloss:0.600042	eval-logloss:0.624180
[160]	train-logloss:0.597952	eval-logloss:0.623295
[170]	train-logloss:0.596024	eval-logloss:0.622556
[180]	train-logloss:0.594253	eval-logloss:0.621937
[190]	train-logloss:0.592643	eval-logloss:

319


[319]	train-logloss:0.578788	eval-logloss:0.618748


Wow! Finally our model better than constant predictions! Congratulations! Don't hesitate, submit!

In [536]:
arr = bst.predict(xgboost.DMatrix(x_test_new[['team1', 'team2', 'p1', 'p2']]))
arr

array([ 0.16356187,  0.74639529,  0.94774258, ...,  0.82250255,
        0.38163385,  0.21870503], dtype=float32)

In [392]:
opt.predict_proba(test[['team1', 'team2']].values)[:,1]

array([ 0.41481793,  0.78602779,  0.71633184, ...,  0.67208821,
        0.49065748,  0.33231843], dtype=float32)

In [472]:
clf = xgboost.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=130, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [473]:
clf.fit(train[['team1', 'team2']].values , train['target'].values)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=130, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [475]:
arr = clf.predict_proba(test[['team1', 'team2']].values)[:,1]
arr

array([ 0.42683405,  0.68022805,  0.65880686, ...,  0.67071056,
        0.54231179,  0.3874718 ], dtype=float32)

In [534]:
for i in range(len(arr)):
    if arr[i] > 0.97:
        print(arr[i])
        arr[i] = 1
    if arr[i] < 0.03:
        print(arr[i])
        arr[i] = 0

0.97366
0.973637
0.970839
0.971368
0.973643
0.97366
0.970681
0.973305
0.97366
0.972618
0.970648
0.97366
0.973637
0.97118
0.0294439
0.973637
0.0295878
0.97074
0.972847
0.973667
0.973105
0.970007
0.97118
0.97366
0.97118
0.973643
0.970616
0.971368
0.973637
0.970285
0.970353
0.973105
0.970285
0.973637
0.97118
0.973305
0.973637
0.971119
0.970285
0.972819
0.971368
0.973305
0.971439
0.97074
0.97366
0.972163
0.97366
0.973637
0.973667
0.973637
0.97366
0.971368
0.97366
0.97099
0.973637
0.971368
0.971368
0.972274
0.972974
0.973637
0.973637
0.970871
0.970616
0.97118
0.973637
0.970232
0.970846
0.970333
0.970681
0.970871
0.97366
0.970285
0.970206
0.970333
0.972301
0.972591
0.971368
0.97366
0.970839
0.971368
0.971368
0.97366
0.973637
0.0294439
0.97074
0.972507
0.973305
0.970285
0.973643
0.971368
0.973637
0.972659
0.97366
0.97366
0.97366
0.970839
0.97118
0.973643
0.970871
0.971368
0.970871
0.97022
0.970285
0.0299488
0.972818
0.97343
0.97366
0.970648
0.973305
0.973637
0.970871
0.973305
0.970871
0.97363

In [540]:
x_test_new.values

array([[  0.00000000e+00,   3.02100000e+03,   3.63000000e+02,
          1.61000000e+02,   4.28571429e-01,   4.49101796e-01],
       [  1.00000000e+00,   3.02100000e+03,   2.86000000e+02,
          2.00000000e+00,   5.95307918e-01,   6.18556701e-01],
       [  2.00000000e+00,   3.02000000e+03,   2.32000000e+02,
          5.20000000e+01,   5.52050473e-01,   7.73885350e-01],
       ..., 
       [  1.25204000e+05,   3.02100000e+03,   2.31000000e+02,
          1.99000000e+02,   5.94339623e-01,   6.01941748e-01],
       [  1.25205000e+05,   3.02100000e+03,   3.50000000e+02,
          6.60000000e+01,   4.40729483e-01,   3.57575758e-01],
       [  1.25206000e+05,   3.02000000e+03,   3.13000000e+02,
          2.88000000e+02,   2.91970803e-01,   1.87500000e-01]])

In [547]:
for i in range(len(x_test_new.values) ):
    if x_test_new.values[i, 4] < 0.00001:
        arr[i] = 1
    if x_test_new.values[i, 5] < 0.00001:
        arr[i] = 1

In [560]:
ss = sample_submission.copy()

ss.target = arr
ss.to_csv('mighty_xgboost.csv', index=False)

### Strange, but it seems like we got 0.658 instead of 0.649! 

### What could it be? Perhabs we need to train on all data instead of just 40% of it? Or may be should think over our cross-validation process?

### Let's overview now what we just did here:
1) made cross-validation

2) tried linear models, they didn't work, but we figured out how to tackle this problem

3) tried random forest and almost beat constant benchmark

4) tried xgboost and finally beat constant prediction!

### But there is the last thing you must know before you'll start this challenge by trying to make the most thorough parameter tuning: the data has it's secrets and those who will find them will be generously rewarded...

### now, good luck with it!