# Import libraries

In [87]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss
from sklearn import cross_validation
from sklearn import grid_search
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer as DV
import xgboost as xgboost
import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful
from sklearn.grid_search import GridSearchCV
import xgboost

# Download data from competition's page

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/data

# Load data using pandas

In [54]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Data

In [55]:
# print first row
train[:1]

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True


In [56]:
test[:1]

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161


In [57]:
# Target variable is "target" and this means we will be predicting it
sample_submission[:1]

Unnamed: 0,Id,target
0,0,0.5


## Quick look at the unique values in data...

In [58]:
for c in train.columns:
    print c, train[c].unique()[:5]

SyntaxError: invalid syntax (<ipython-input-58-beeea4677966>, line 2)

# Cross-validation

### Lets split data randomly to train and validatation. We will train our algorithms on selected train set and validate them on validation set. Easy as it can be!

In [59]:
# train size
train.shape 

(101609, 7)

train is quite big, so for example purposes we'll sample only part of it

In [60]:
from sklearn.cross_validation import ShuffleSplit

for itr, ite in ShuffleSplit(len(train), n_iter=1, train_size=0.4, test_size=0.1, random_state=0):
    pass

information about all functions can be found on the internet, for example

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

In [61]:
# or you can open it in you Jupyter notebook executing function in this manner
?ShuffleSplit()

In [62]:
len(itr), len(ite)

(40643, 10161)

In [63]:
itr[:5], ite[:5]

(array([22710, 41665, 91975, 57348, 39931]),
 array([ 37078, 101474,  29858,  61674,   1049]))

now we have validation set "ite" to check the quality of our solution

# features and target

In [64]:
sample_submission[:2]

Unnamed: 0,Id,target
0,0,0.5
1,1,0.5


we need to change 'target' column in "sample_submission" to our predictions.

For now we will select only features that are present in both train and test:

In [65]:
features = []
for c in train.columns:
    if c in test.columns and c!='target':
        features += [c]
        #print '"{}" is present in test and train'.format(c)
    else:
        pass
        #print '"{}" is NOT present in test'.format(c)
        
features.pop(0)
features

['team1', 'team2']

here we split train on "train" and "validation" parts

In [66]:
xtrain = train.loc[itr, features]    
ytrain = train.loc[itr, 'target']

xval = train.loc[ite, features]
yval = train.loc[ite, 'target']

# Baseline solution

lets make baseline first by predicting the mean value

In [67]:
train.target.mean()

0.50096940231672393

In [68]:
constant_prediction = yval * 0 + train.target.mean()
constant_prediction = constant_prediction.values
constant_prediction

array([ 0.5009694,  0.5009694,  0.5009694, ...,  0.5009694,  0.5009694,
        0.5009694])

In [69]:
log_loss(yval, constant_prediction)

0.6931565015839517

In [70]:
submission = sample_submission.copy()
submission.target = train['target'].mean() # notice here that we can refer to a column 'target' in two ways
submission.to_csv('constant_submission.csv', index=False)

Now this should score like "Baseline - Constant" on Leaderboard!
You can submit this by going to 

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/submissions/attach

# Machine learning

Finally, lets try machine learning!

In [71]:
train_ns = train[['team1', 'team2']]
target = train['target']
test_ns = test[['team1', 'team2']]

In [75]:

scaler = StandardScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train_ns), columns=list(train_ns.columns))
test_scaled = pd.DataFrame(scaler.transform(test_ns), columns=list(test_ns.columns))

In [76]:
enc = OneHotEncoder(sparse=False)

In [77]:
train_team1 = pd.DataFrame(enc.fit_transform(train[['team1']]))

In [78]:
train_team2 = pd.DataFrame(enc.transform(train[['team2']]))

In [79]:
train_teams = pd.concat([train_team1, train_team2], axis=1)

In [80]:
test_team1 = pd.DataFrame(enc.transform(test[['team1']]))

In [81]:
test_team2 = pd.DataFrame(enc.transform(test[['team2']]))

In [82]:
test_teams = pd.concat([test_team1, test_team2], axis=1)

In [88]:
alg = linear_model.LogisticRegression(C=0.8, tol=0.00001, max_iter=150)
alg.fit(train_teams, target.values)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=150, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-05,
          verbose=0, warm_start=False)

In [89]:
prediction = alg.predict_proba(test_teams)[:,1]

In [90]:
for i in range(len(prediction)):
    if prediction[i] > 0.97:
        prediction[i] = 1
    if prediction[i] < 0.03:
        prediction[i] = 0

In [91]:
ss = sample_submission.copy()

ss.target = prediction
ss.to_csv('lin_reg.csv', index=False)

In [25]:
log_loss(yval, prediction)

0.69278091662349195

### Well, not so far from the constant solution... Let's try to understand why.

What's a linear model such as LogisticRegression is trying to do is multiply each variable on some coefficient and add add it up, in our case:

y_predicted = column1 \* coef1 + column2 \* coef2 + column3 \* coef3 + bias

We can print coefficients and bias:

In [26]:
alg.coef_, alg.intercept_

(array([[  2.31737378e-07,   3.29139376e-04,  -2.98254396e-04]]),
 array([  5.62890115e-09]))

But clearly, "team1" and "team2" are _categorical_ columns, just like names of the teams. 

So we need to turn "team" columns to something linear algorithm can work with. For example first few rows from here

In [27]:
train.loc[:2, 'team1']

0    317
1     61
2    110
Name: team1, dtype: int64

To this:

In [28]:
pd.get_dummies(train.loc[:2, 'team1'])

Unnamed: 0,61,110,317
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0


So each team name now has it's own column. Read about "pd.get_dummies" here:

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

### But let's come back to more interesting stuff for now
### We are competition's solvers, remember? Lets dive into the space of more complicated models!

In [29]:
alg = ensemble.RandomForestClassifier(15, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [30]:
log_loss(yval, prediction)

1.1744940647416549

Surprisingly, this doesn't work very well. Now, like competition pro, let's make our models bigger!

In [31]:
alg = ensemble.RandomForestClassifier(150, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [32]:
log_loss(yval, prediction)

0.74131637899967551

### Almost there! But for now let's skip this model too and go to _real_ competitions stuff

In [33]:
import xgboost

In [34]:
param = {}
param['max_depth'] = 8
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.1

numround = 100

Xgboost parameters

https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [35]:
Xdatatrain = xgboost.DMatrix(data = xtrain, label = ytrain)
Xdatatest = xgboost.DMatrix(data = xval, label = yval)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
# ypredxgb_tr = bst.predict(Xdatatrain)

[0]	train-logloss:0.689826	eval-logloss:0.691345
[10]	train-logloss:0.670759	eval-logloss:0.681402
[20]	train-logloss:0.658005	eval-logloss:0.67538
[30]	train-logloss:0.647507	eval-logloss:0.669858
[40]	train-logloss:0.638459	eval-logloss:0.66616
[50]	train-logloss:0.628218	eval-logloss:0.661736
[60]	train-logloss:0.619701	eval-logloss:0.658285
[70]	train-logloss:0.6126	eval-logloss:0.655751
[80]	train-logloss:0.604423	eval-logloss:0.652431
[90]	train-logloss:0.597331	eval-logloss:0.649311


Wow! Finally our model better than constant predictions! Congratulations! Don't hesitate, submit!

### Strange, but it seems like we got 0.658 instead of 0.649! 

### What could it be? Perhabs we need to train on all data instead of just 40% of it? Or may be should think over our cross-validation process?

### Let's overview now what we just did here:
1) made cross-validation

2) tried linear models, they didn't work, but we figured out how to tackle this problem

3) tried random forest and almost beat constant benchmark

4) tried xgboost and finally beat constant prediction!

### But there is the last thing you must know before you'll start this challenge by trying to make the most thorough parameter tuning: the data has it's secrets and those who will find them will be generously rewarded...

### now, good luck with it!