# Predictions
## Import dependencies

In [1]:
# data preprocessing
import pandas as pd
# produces a prediction model in the form of an ensemble of weak prediction models, typically decision tree
import xgboost as xgb
# the outcome (dependent variable) has only a limited number of possible values.
# Logistic Regression is used when response variable is categorical in nature.
from sklearn.linear_model import LogisticRegression
# A random forest is a meta estimator that fits a number of decision tree classifiers
# on various sub-samples of the dataset and use averaging to improve the predictive
# accuracy and control over-fitting.
from sklearn.ensemble import RandomForestClassifier
# a discriminative classifier formally defined by a separating hyperplane.
from sklearn.svm import SVC
# displayd data
from IPython.display import display
%matplotlib inline

## Explore Data

In [2]:
# Read data and drop redundant column.
data = pd.read_csv('results.csv')

# Preview data.
display(data.head())

Unnamed: 0,season,date,home_team,guest_team,score,home_score,guest_score,odds_1,odds_x,odds_2,explore_id,outcome,ftr
0,Handbollsligan 2017/2018,14.03.2018,Alingsas,Savehof,26:29,26,29,1.38,9.57,3.78,ELL63qws,,2
1,Handbollsligan 2017/2018,14.03.2018,Hammarby,Helsingborg,26:31,26,31,1.29,10.2,4.64,hCKA23hm,,2
2,Handbollsligan 2017/2018,14.03.2018,Karlskrona,Aranas,31:26,31,26,1.33,9.92,4.2,zyAF1N7g,,1
3,Handbollsligan 2017/2018,14.03.2018,Kristianstad,Ricoh,30:22,30,22,1.01,23.45,19.68,nDhwLB8E,,1
4,Handbollsligan 2017/2018,14.03.2018,Lugi,Guif,28:20,28,20,1.46,9.1,3.38,hEBnbl11,,1


## What is the winrate for the home team?

In [3]:
# Total number of matches.
n_matches = data.shape[0]

# Calculate number of features. -1 because we are saving one as the target variable (win/lose/draw)
n_features = data.shape[1] - 1

# Calculate matches won by home team.
n_homewins = len(data[data.ftr == '1'])

# Calculate win rate for home team.
win_rate = (float(n_homewins) / (n_matches)) * 100

# Print results
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(win_rate))

Total number of matches: 1666
Number of features: 12
Number of matches won by home team: 913
Win rate of home team: 54.80%


## Preparing the Data

In [4]:
# Separate into feature set and target variable
#FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)
data = data.dropna(subset=['odds_1', 'odds_x', 'odds_2'], how='any')
X_all = data.drop(['season','date','ftr','home_team','guest_team','score','home_score','guest_score','explore_id','outcome'],1)
y_all = data['ftr']

# Standardising the data.
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.
cols = [['odds_1','odds_x','odds_2']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [5]:
#we want continous vars that are integers for our input data, so lets remove any categorical vars
def preprocess_features(X):
    ''' Preprocesses the football data and converts catagorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (3 total features):
['odds_1', 'odds_x', 'odds_2']


In [6]:
# Show the feature information by printing the first five rows
print("\nFeature values:")
display(X_all)

print("\nLabels:")
display(y_all)


Feature values:


Unnamed: 0,odds_1,odds_x,odds_2
0,-0.466846,-0.522443,-0.032545
1,-0.515276,-0.303862,0.202692
2,-0.493752,-0.401009,0.082338
3,-0.665948,4.293279,4.316605
4,-0.423798,-0.685511,-0.141958
5,-0.402273,-0.845110,-0.180252
6,0.776191,-0.518974,-0.683550
7,-0.165504,-1.008179,-0.456519
8,2.923256,1.174162,-0.760139
9,1.029103,-0.213654,-0.702697



Labels:


0       2
1       2
2       1
3       1
4       1
5       2
6       2
7       1
8       2
9       2
10      1
11      1
13      2
14      1
15      X
16      X
17      2
18      1
19      1
20      2
21      1
22      2
23      2
24      2
25      1
26      1
27      1
28      1
29      1
30      1
       ..
1603    1
1612    2
1613    2
1614    2
1615    2
1616    2
1617    1
1618    2
1619    1
1620    2
1621    1
1622    1
1623    1
1624    2
1625    1
1626    X
1627    2
1628    1
1629    1
1630    2
1631    X
1632    2
1633    2
1634    1
1635    1
1636    2
1637    1
1638    1
1639    1
1647    2
Name: ftr, Length: 1433, dtype: object

In [7]:
from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size = 50,
                                                    random_state = 2,
                                                    stratify = y_all)

In [8]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, average=None), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print(f1, acc)
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print(f1, acc)

In [9]:
# Initialize the three models (XGBoost is initialized later)
clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf')
#Boosting refers to this general problem of producing a very accurate prediction rule 
#by combining rough and moderately inaccurate rules-of-thumb
clf_C = xgb.XGBClassifier(seed = 82)

train_predict(clf_A, X_train, y_train, X_test, y_test,)
print('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print('')

Training a LogisticRegression using a training set size of 1383. . .
Trained model in 0.0085 seconds
Made predictions in 0.0015 seconds.
[0.76317383 0.61966236 0.        ] 0.6811279826464208
Made predictions in 0.0005 seconds.
[0.77419355 0.64705882 0.        ] 0.7

Training a SVC using a training set size of 1383. . .
Trained model in 0.0461 seconds
Made predictions in 0.0208 seconds.
[0.7656066  0.61041667 0.        ] 0.6818510484454086
Made predictions in 0.0012 seconds.
[0.76190476 0.60606061 0.        ] 0.68

Training a XGBClassifier using a training set size of 1383. . .


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Trained model in 0.1302 seconds
Made predictions in 0.0101 seconds.
[0.79207921 0.69097889 0.        ] 0.7230657989877078
Made predictions in 0.0011 seconds.
[0.79365079 0.66666667 0.        ] 0.72



In [12]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer


# TODO: Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

# TODO: Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# TODO: Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label='1')

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
print(clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

NameError: name 'GridSearchCV' is not defined