# 04.0 Starter Model

## Purpose: 

Develop a quick and dirty working model:
- Is the custom cross validation splitter working correctly within the sklearn framework?
- What is the estimated time to run a full dataset through the model?
- Are custom transformers working within a sklearn pipeline?
- How to get better visibility of the data processing pipeline? - Play with `ibex`
- Set up Feature Union
- Set up Model Ensembles

### Structure
This is for testing workflows, so structure is fluid 

### Standard Imports

In [1]:
% matplotlib inline

import os
import sys

import pandas as pd
import numpy as np
#import xarray as xr

# Display options: 100-character console width, and floats rendered with
# thousands separators and two decimal places.
pd.set_option("display.width",100)
pd.options.display.float_format = '{:,.2f}'.format

# Add the 'src' directory (<cwd>/../src) to the import path so the project
# modules can be imported by the cells below.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

### Import Custom Tools

In [2]:
from utilities.utilities import get_fixture_list, get_team_list, get_dataset
from cross_val.LeagueKFold import LeagueKFold

### Import Custom Transformers

In [3]:
from features.GoalExpectancyExtractor import GoalExpectancyExtractor

### Import Sklearn Tools

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

### Import Sklearn Transformers wrapped with Ibex

In [5]:
from ibex.sklearn.preprocessing import StandardScaler
from ibex.sklearn.naive_bayes import GaussianNB

### Set up Data & Objects

In [6]:
# Locations of the test fixtures used by this notebook.
test_dir = '../data/test/'
datacube_path = os.path.join(test_dir, 'XArrayDataSet_1.nc')
team_list_path = os.path.join(test_dir, 'team_list.pickle')

# Load the fixture list and the dataset from the datacube file, and the
# team list from its pickle, using the project utilities.
fixture_list = get_fixture_list(datacube_path)

team_list = get_team_list(team_list_path)
dataset = get_dataset(datacube_path)

In [7]:
import xarray as xr
def get_fixture_results(full_path):
    """
    Open the datacube at `full_path` and return its fixtures with their coded results.

    The 'Idx' and 'coded_result' variables are flattened into a DataFrame, rows
    containing missing values are dropped (presumably fixtures that have not been
    played yet - confirm against the data processing step), and the remaining rows
    are ordered by 'Idx', lowest first.

    Parameters
    ----------
    full_path : str
        Path to a dataset file readable by `xarray.open_dataset`.

    Returns
    -------
    pandas.DataFrame
        Sequentially integer-indexed frame. The former index levels become
        ordinary columns (this notebook reads 'h_team' and 'a_team' from it),
        followed by 'coded_result'. 'Idx' is used only for ordering and is
        not part of the result.
    """
    # The context manager releases the underlying file handle as soon as the
    # required variables have been materialised into a pandas DataFrame.
    with xr.open_dataset(full_path) as ds:
        fixture_results = ds[['Idx', 'coded_result']].to_dataframe()

    # reset_index() turns the index levels into columns and installs a fresh
    # 0..n-1 index that follows the 'Idx' ordering.
    return (fixture_results.dropna()
            .sort_values('Idx')
            .reset_index()
            .drop('Idx', axis=1))

# Split the fixture results into model inputs (the two team columns) and the
# integer-coded target.
fixture_results = get_fixture_results(datacube_path)
X = fixture_results.loc[:, ['h_team', 'a_team']]
y = fixture_results.loc[:, 'coded_result'].astype('int')

# Preview and shape of each, with a blank line between the two reports.
print(X.head(), X.shape, '', sep='\n')
print(y.head(), y.shape, sep='\n')

                     h_team             a_team
0                   Arsenal     Leicester City
1  Brighton and Hove Albion    Manchester City
2                   Chelsea            Burnley
3            Crystal Palace  Huddersfield Town
4                   Everton         Stoke City
(170, 2)

0    2
1    0
2    0
3    0
4    2
Name: coded_result, dtype: int64
(170,)


### Set up Transformers

In [8]:
# Custom feature extractor, built from the loaded dataset and team list.
goalexpectancyextractor = GoalExpectancyExtractor(dataset, team_list)

# Ibex-wrapped scaler with default settings.
standardscaler = StandardScaler()

# Ibex-wrapped Gaussian naive Bayes with fixed class priors.
# NOTE(review): assumes the three priors line up with result codes 0, 1, 2 - confirm.
gaussiannb = GaussianNB(priors=np.array([0.6, 0.1, 0.3]))

### Set up Pipeline

In [9]:
# Chain the steps: feature extraction -> scaling -> classifier.
pipe = Pipeline(steps=[
    ('goalexpectancyextractor', goalexpectancyextractor),
    ('standardscaler', standardscaler),
    ('gaussiannb', gaussiannb),
])

### Set up Cross Validation 

In [10]:
# Sanity check of the custom splitter: show only the first train/test fold.
lkf = LeagueKFold(X, team_list, pretrained=False)
cv = lkf.split(X)
first_fold = next(iter(cv), None)
if first_fold is not None:
    train, test = first_fold
    print('train', train, '*TEST*', test, '\n')

train [50 51 52 53 54 55 56 57 58 59] *TEST* [60 61 62 63 64 65 66 67 68 69] 



In [11]:
# Splitter instance handed to sklearn's cross-validation helpers as `cv`.
# NOTE(review): the positional False presumably maps to `pretrained` (the keyword
# used when `lkf` is built for the fold preview) - confirm against LeagueKFold's signature.
leaguekfold = LeagueKFold(X,team_list, False)

### Cross Val Pipeline

In [12]:
# Cross-validate the full pipeline with the custom league splitter.
# Train scores are requested explicitly: newer scikit-learn releases leave
# 'train_score' out of the result unless return_train_score=True is passed.
cross_val_data = cross_validate(pipe, X, y, cv=leaguekfold, return_train_score=True)
print(cross_val_data)

{'fit_time': array([ 0.12700748,  0.12133193,  0.12164426,  0.12294245,  0.12059069,
        0.12053657,  0.12129998,  0.12265182,  0.12164259,  0.12049365,
        0.12100196]), 'score_time': array([ 0.11932707,  0.12011957,  0.11892581,  0.11934638,  0.1207993 ,
        0.11922932,  0.11938381,  0.11930919,  0.12113118,  0.11940742,
        0.12005281]), 'test_score': array([ 0.6,  0.2,  0.7,  0.5,  0.5,  0.3,  0.4,  0.6,  0.4,  0.3,  0.7]), 'train_score': array([ 1.        ,  0.7       ,  0.56666667,  0.55      ,  0.56      ,
        0.51666667,  0.45714286,  0.45      ,  0.44444444,  0.43      ,
        0.40909091])}


In [13]:
# Build a new splitter instance and collect only the per-fold test scores.
leaguekfold = LeagueKFold(X, team_list, False)
cross_val_scores = cross_val_score(estimator=pipe, X=X, y=y, cv=leaguekfold)
print(cross_val_scores)

[ 0.6  0.2  0.7  0.5  0.5  0.3  0.4  0.6  0.4  0.3  0.7]


### Set up custom train_test_split function 

In [14]:
def lkf_train_test_split(X,y, team_list):
    """
    Split X and y into train and test parts using the final LeagueKFold fold.

    A LeagueKFold splitter is built from `X` and `team_list`, all of its folds
    are generated, and the positional indices of the last fold are used to
    select rows from `X` and `y`.

    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    splitter = LeagueKFold(X, team_list, False)
    folds = list(splitter.split(X))
    # Only the last fold is of interest here.
    train_idx, test_idx = folds[-1]
    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]

X_train, X_test, y_train, y_test = lkf_train_test_split(X, y, team_list)

# First five rows and the shape of each part of the split.
for subset in (X_train, X_test, y_train, y_test):
    print(subset[0:5], subset.shape)

             h_team             a_team
50          Burnley  Huddersfield Town
51          Everton        Bournemouth
52   Leicester City          Liverpool
53  Manchester City     Crystal Palace
54      Southampton  Manchester United (110, 2)
                h_team                a_team
160            Burnley            Stoke City
161     Crystal Palace               Watford
162  Huddersfield Town               Chelsea
163          Liverpool  West Bromwich Albion
164  Manchester United           Bournemouth (10, 2)
50    1
51    2
52    0
53    2
54    0
Name: coded_result, dtype: int64 (110,)
160    2
161    2
162    0
163    1
164    2
Name: coded_result, dtype: int64 (10,)


### Set up readable results 

In [15]:
# Fit on the training part, then predict class probabilities for the test fixtures.
pipe.fit(X_train, y_train)
probas = pipe.predict_proba(X_test)  # the ibex-wrapped estimator hands back a DataFrame

# One row per test fixture: teams, per-class probabilities, actual coded result,
# and the predicted result (the class column holding the highest probability).
readable_probas = pd.concat([X_test, probas, y_test], axis=1).assign(
    pred_result=probas.idxmax(axis=1)
)
print(readable_probas)

                h_team                    a_team    0    1    2  coded_result  pred_result
160            Burnley                Stoke City 0.35 0.09 0.56             2            2
161     Crystal Palace                   Watford 0.82 0.09 0.09             2            0
162  Huddersfield Town                   Chelsea 0.81 0.10 0.09             0            0
163          Liverpool      West Bromwich Albion 0.06 0.02 0.92             1            2
164  Manchester United               Bournemouth 0.15 0.05 0.80             2            2
165   Newcastle United                   Everton 0.59 0.10 0.31             0            0
166        Southampton            Leicester City 0.65 0.11 0.24             0            0
167       Swansea City           Manchester City 0.88 0.08 0.04             0            0
168  Tottenham Hotspur  Brighton and Hove Albion 0.24 0.07 0.69             2            2
169    West Ham United                   Arsenal 0.89 0.08 0.04             1            0