In [76]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

### Load in the dataset from `datasets/basketball_data.csv`

In [3]:
df = pd.read_csv('./datasets/basketball_data.csv')

### Set up X and y

X will be all columns except the following:
```python
['GameId','GameDate','GameTime','HostName', 'GuestName','total_score','total_line','game_line', 'winner','loser','host_wins','Season']
```

y will be weather the home team was the winner

In [8]:
exclude = ['GameId','GameDate','GameTime','HostName', 'GuestName','total_score',
           'total_line','game_line', 'winner','loser','host_wins','Season']
X = pd.DataFrame()
for c in df.columns:
    if c not in exclude:
        X[c] = df[c]
#exclusion = ['GameId','GameDate','GameTime','HostName', 'GuestName',
#'total_score','total_line','game_line', 'winner','loser','host_wins','Season']
#features = [col for col in df.columns if not col in exclusion]
#X = df[features]

In [11]:
y = (df['HostName'] == df['winner']).astype(int)

### Calculate baseline accuracy

In [16]:
max(y.value_counts(normalize=True))

0.5942144373673036

### Train/Test Split

- The test set will be the 2016 season
- Train is all other seasons

In [17]:
# Determine TTS bins
df['Season'].unique()

array([2013, 2014, 2015, 2016])

In [21]:
train_indicies = list(df.loc[df['Season'] != 2016].index)
test_indicies = list(df.loc[df['Season'] == 2016].index)

In [22]:
X_train = X.loc[train_indicies]
X_test = X.loc[test_indicies]
y_train = y[train_indicies]
y_test = y[test_indicies]

### Create instances of StandardScaler and KNeighborsClassifier to be incorporated into the Pipeline

In [24]:
ss = StandardScaler()
knn = KNeighborsClassifier()

### Create two pipelines, one for a standalone model, and the other for GridSearchCV

In [26]:
pipe = Pipeline([
    ('ss', ss), # (class name, obj)
    ('knn', knn)
])

# Copy for Grid Search
pipe_gs = Pipeline([
    ('ss', ss), # (class name, obj)
    ('knn', knn)
])

##### How it went
ss.fit(X_train)  
X_train_sc = ss.transform(X_train)  
X_test_sc = ss.transform(X_test)

In [27]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [28]:
pipe.score(X_train, y_train)

0.7495508444125045

In [29]:
pipe.score(X_test, y_test)

0.5786802030456852

In [32]:
pipe.predict(X_test)

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,

In [35]:
pipe.predict_proba(X_test)

array([[0.8, 0.2],
       [0.2, 0.8],
       [0. , 1. ],
       ...,
       [0.2, 0.8],
       [0.2, 0.8],
       [0.4, 0.6]])

In [34]:
pipe.predict_proba(X_test)[:, 1] # Just the ones

array([0.2, 0.8, 1. , 0.6, 1. , 0.8, 0.4, 0.6, 0. , 0.8, 0.8, 1. , 0.6,
       0.6, 0.4, 0.6, 0.8, 0.8, 1. , 0.4, 0.6, 0.4, 0.6, 0. , 0.4, 0.8,
       0.8, 0.6, 0.6, 1. , 0.4, 0.4, 1. , 0.8, 0.8, 1. , 0.4, 0.6, 0.4,
       0.8, 1. , 0.4, 0.6, 0. , 0. , 1. , 0.2, 0.4, 0.6, 0.2, 0.8, 0.4,
       0.8, 0.6, 0.6, 0.6, 0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 1. , 0.6, 1. ,
       0.6, 0.8, 0.6, 0.6, 0.8, 0.4, 0.8, 0.4, 0.6, 0.6, 0.4, 0.6, 0.6,
       0.6, 0.4, 0.6, 0.8, 1. , 0.6, 0.8, 0.8, 0.2, 0.6, 0.6, 0.8, 0.6,
       0.4, 0.4, 0.2, 0.8, 0.6, 0.8, 0.4, 0.8, 1. , 0.6, 1. , 0.4, 0.4,
       0.2, 0.4, 0.4, 0.2, 0.8, 0.4, 0.4, 0.6, 0.8, 0.6, 0.6, 0.2, 0.4,
       0.8, 0.2, 0.6, 0.4, 0.4, 0.4, 0.8, 0.4, 0.4, 0.2, 0.8, 0.6, 0.8,
       0.2, 0.4, 0.6, 1. , 0.6, 0.4, 1. , 0.6, 0.6, 1. , 0.6, 0.6, 1. ,
       0.4, 0.8, 0.4, 0.4, 0.8, 0.4, 0.8, 1. , 0.8, 0.6, 0.8, 0.4, 0.8,
       0.6, 0.6, 0.8, 0.2, 0.4, 0.4, 0.2, 0.6, 0.4, 0.6, 0.6, 0.6, 0.4,
       1. , 0.4, 0.8, 0.6, 0.8, 0.6, 0.2, 0.8, 1. , 0.4, 0.6, 0.

### GridSearchCV

#### First Run

In [36]:
KNeighborsClassifier()
pipe_gs = Pipeline([
    ('ss', ss), # (class name, obj)
    ('knn', knn)
])

In [41]:
params = {}
gs = GridSearchCV(pipe_gs, param_grid=params, cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
gs.best_score_

0.5932446999640676

In [43]:
gs.best_params_

{}

In [44]:
gs.score(X_test, y_test)

0.5786802030456852

#### Second Run

In [45]:
KNeighborsClassifier()
pipe_gs = Pipeline([
    ('ss', ss), # (class name, obj)
    ('knn', knn)
])

In [46]:
params = {
    'knn__n_neighbors': [5, 6, 7, 8, 9, 10] # name you gave it, two underscores, name of parameter
    #'knn__n_neighbors': range(5, 12, 2) # to pass a range
}
gs = GridSearchCV(pipe_gs, param_grid=params, cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': [5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
print(gs.best_score_)
print(gs.best_params_)

0.6083363277039167
{'knn__n_neighbors': 9}


#### Third Run

In [51]:
KNeighborsClassifier()
pipe_gs = Pipeline([
    ('ss', ss), # (class name, obj)
    ('knn', knn)
])

In [52]:
params = {
    'knn__n_neighbors': range(5, 12, 2),
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2]
}
gs = GridSearchCV(pipe_gs, param_grid=params, cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': range(5, 12, 2), 'knn__weights': ['uniform', 'distance'], 'knn__p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
print(gs.best_score_)
print(gs.best_params_)

0.6083363277039167
{'knn__n_neighbors': 9, 'knn__p': 2, 'knn__weights': 'uniform'}


### Challenge: 

1. Create a pipeline with `StandardScaler` and `LogisticRegression`
    - For LogisticRegression, set solver to liblinear
2. Run your pipeline through `GridSearchCV`, testing the following parameters:
    - penalty
    - C
    
How does it score on the train and test sets? What are the best parameters?

#### First go

In [54]:
ss = StandardScaler()
lr = LogisticRegression()

In [55]:
lr_pipe = Pipeline([
    ('ss', ss),
    ('lr', lr),
])

In [56]:
lr_params = {}
gs = GridSearchCV(lr_pipe, lr_params, cv=5)

In [58]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
print(gs.best_score_)
print(gs.best_params_)

0.6500179662234998
{}


In [60]:
print(gs.score(X_test, y_test))

0.6954314720812182


#### Second

In [61]:
ss = StandardScaler()
lr = LogisticRegression()

In [77]:
lr_pipe = Pipeline([
    ('poly', PolynomialFeatures()),
    ('ss', ss),
    ('lr', lr),
])

In [80]:
lr_params = {
    'lr__penalty': ['11', '12', '13'],
    'lr__C': [.1, .2, .3, .4, .5, .6, .7, .8, .9]
}
gs = GridSearchCV(lr_pipe, param_grid=lr_params)

In [81]:
gs.fit(X_train, y_train)

ValueError: Unsupported set of arguments: The combination of penalty='11' and loss='logistic_regression' is not supported, Parameters: penalty='11', loss='logistic_regression', dual=False

In [67]:
print(gs.best_score_)
print(gs.best_params_)
print(gs.score(X_test, y_test))

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'