# Hockey Simulator Project

### Cameron Grayson
### Began 02/17/2022

This project is meant for recreational purposes only.

## Import Tools
Must begin by running this cell first. Ctrl + Enter to run a cell.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Functions
Run this cell. This creates the functions we will use below.

In [2]:
# Input string year, string for team name.
# This will read the file.
def load_gd_file(year, team):
    short_year = year[-2] + year[-1]
    gd_file = pd.ExcelFile(year + "_game_data.xlsx")
    gd = pd.read_excel(gd_file, team + "_" + short_year)
    return gd

# Input game data dataframe for dg, string for team name.
# This will add the name of the team whose game data we are importing to each game and rename columns.
def add_team_and_rename(gd, team):
    team_name = []
    for i in range(0, len(gd)):{
        team_name.append(team)
    }
    gd.insert(2, 'Team_Name', team_name)
    
    gd_col_rename = {'Unnamed: 2': 'Location', 'Unnamed: 6': 'Result', 'Unnamed: 7': 'OT_Result', 
                      'OL' : 'OTL'}
    gd = gd.rename(columns = gd_col_rename)
    return gd

# Converts the streak to a numeric value (positive for W streak, negative for L streak)
def convert_streak_to_numeric(gd):
    for i in range(0, len(gd)):
        if(gd.Streak[i].startswith('L')):
            gd.Streak[i] = int(gd.Streak[i][gd.Streak[i].rindex(" ") + 1:]) * -1
        elif(gd.Streak[i].startswith('W')):
            gd.Streak[i] = int(gd.Streak[i][gd.Streak[i].rindex(" ") + 1:])
        else:
            gd.Streak[i] = 'ERROR'
    return gd

# This will add points and point percentage and goal differential.
def add_points(gd):
    Points = []
    Point_perc = []
    Goal_Diff = []
    
    
    for i in range(0, len(gd)):
        Points.append(0)
        Point_perc.append(0)
        Goal_Diff.append(0)
    
    gd.insert(7, 'Goal_Diff', Goal_Diff)
    
    for i in range(0, len(gd)):
        if i == 0:
            gd.at[i, 'Goal_Diff'] = (gd.at[(i), 'GF'] - gd.at[(i), 'GA'])
        else:
            gd.at[i, 'Goal_Diff'] = (gd.at[(i-1), 'Goal_Diff'] + gd.at[(i), 'GF'] - gd.at[(i), 'GA'])

    gd.insert(12, 'Points', Points)
    gd.insert(13, 'Point_perc', Point_perc)

    gd.Points = (gd.W * 2) + (gd.OTL)
    gd.Point_perc = gd.Points / (gd.GP * 2)
    
    return gd

# Calculate season series.
# RETURNS 2 DF's, one with pregame stats and one with final stats.
def season_series(gd):
    ss_W = []
    ss_L = []
    ss_OTL = []
    ss_GP = []
    ss_Goal_Diff = []
    
    for i in range(0, len(gd)):
        ss_W.append(0)
        ss_L.append(0)
        ss_OTL.append(0)
        ss_GP.append(0)
        ss_Goal_Diff.append(0)
    
    gd['ss_GP'] = ss_GP
    gd['ss_W'] = ss_W
    gd['ss_L'] = ss_L
    gd['ss_OTL'] = ss_OTL
    gd['ss_Goal_Diff'] = ss_Goal_Diff
    
    gd_pg = gd.copy()
    
    for j in range(0, len(team_list)):
        season_series_calc = gd.loc[gd['Opponent'] == team_list[j]]

    #indexing will be completely off if I attempt to select as is, so I make the index 0-4, and change it back.
        index_list = list(season_series_calc.index.values)
        season_series_calc.index = range(len(season_series_calc))

        # SS calculation for postgame dataset
        for i in range(0, len(season_series_calc)): 
            season_series_calc.at[i, 'ss_GP'] = (i+1)
            if(i==0):
                season_series_calc.at[i, 'ss_Goal_Diff'] = (season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
            elif(i>0):
                season_series_calc.at[i, 'ss_Goal_Diff'] = (season_series_calc.at[(i-1), 'ss_Goal_Diff'] + season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
                season_series_calc.at[i, 'ss_OTL'] = (season_series_calc.at[(i-1), 'ss_OTL'])
                season_series_calc.at[i, 'ss_W'] = (season_series_calc.at[(i-1), 'ss_W'])
                season_series_calc.at[i, 'ss_L'] = (season_series_calc.at[(i-1), 'ss_L'])
        
            if((season_series_calc.loc[i].OT_Result == 'SO' or season_series_calc.loc[i].OT_Result == 'OT') & (season_series_calc.loc[i].Result == 'L')):
                season_series_calc.at[i, 'ss_OTL'] = (season_series_calc.loc[i, 'ss_OTL'] + 1)
            elif(season_series_calc.loc[i].Result == 'L'):
                season_series_calc.at[i, 'ss_L'] = (season_series_calc.loc[i, 'ss_L'] + 1)
            else:
                season_series_calc.at[i, 'ss_W'] = (season_series_calc.loc[i, 'ss_W'] + 1)
        
        season_series_calc.index = index_list

        for i in range(0, len(index_list)):
            gd.at[index_list[i], 'ss_Goal_Diff'] = season_series_calc.loc[index_list[i], 'ss_Goal_Diff']
            gd.at[index_list[i], 'ss_GP'] = season_series_calc.loc[index_list[i], 'ss_GP']
            gd.at[index_list[i], 'ss_OTL'] = season_series_calc.loc[index_list[i], 'ss_OTL']
            gd.at[index_list[i], 'ss_W'] = season_series_calc.loc[index_list[i], 'ss_W']
            gd.at[index_list[i], 'ss_L'] = season_series_calc.loc[index_list[i], 'ss_L']


    # SS calculation for the pregame dataset.
    for j in range(0, len(team_list)):
        season_series_calc = gd_pg.loc[gd_pg['Opponent'] == team_list[j]]

        #indexing will be completely off if I attempt to select as is, so I make the index 0-4, and change it back.
        index_list = list(season_series_calc.index.values)
        season_series_calc.index = range(len(season_series_calc))

        for i in range(0, len(season_series_calc) - 1): 
            season_series_calc.at[i+1, 'ss_GP'] = (i+1)
            if(i==0):
                season_series_calc.at[i+1, 'ss_Goal_Diff'] = (season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
            elif(i>0):
                season_series_calc.at[i+1, 'ss_Goal_Diff'] = (season_series_calc.at[(i), 'ss_Goal_Diff'] + season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
                season_series_calc.at[i+1, 'ss_OTL'] = (season_series_calc.at[(i), 'ss_OTL'])
                season_series_calc.at[i+1, 'ss_W'] = (season_series_calc.at[(i), 'ss_W'])
                season_series_calc.at[i+1, 'ss_L'] = (season_series_calc.at[(i), 'ss_L'])
        
            if((season_series_calc.loc[i].OT_Result == 'SO' or season_series_calc.loc[i].OT_Result == 'OT') & (season_series_calc.loc[i].Result == 'L')):
                season_series_calc.at[i+1, 'ss_OTL'] = (season_series_calc.loc[i, 'ss_OTL'] + 1)
            elif(season_series_calc.loc[i].Result == 'L'):
                season_series_calc.at[i+1, 'ss_L'] = (season_series_calc.loc[i, 'ss_L'] + 1)
            else:
                season_series_calc.at[i+1, 'ss_W'] = (season_series_calc.loc[i, 'ss_W'] + 1)
        
        season_series_calc.index = index_list

        for i in range(0, len(index_list)):
            gd_pg.at[index_list[i], 'ss_Goal_Diff'] = season_series_calc.loc[index_list[i], 'ss_Goal_Diff']
            gd_pg.at[index_list[i], 'ss_GP'] = season_series_calc.loc[index_list[i], 'ss_GP']
            gd_pg.at[index_list[i], 'ss_OTL'] = season_series_calc.loc[index_list[i], 'ss_OTL']
            gd_pg.at[index_list[i], 'ss_W'] = season_series_calc.loc[index_list[i], 'ss_W']
            gd_pg.at[index_list[i], 'ss_L'] = season_series_calc.loc[index_list[i], 'ss_L']

    #Add points and point percentages for the SS datasets.
    ss_Points = []
    ss_Point_perc = []
        
    for i in range(0, len(gd)):
        ss_Points.append(0)
        ss_Point_perc.append(0)
        
    gd['ss_Points'] = ss_Points
    gd['ss_Point_perc'] = ss_Point_perc

    gd.ss_Points = (gd.ss_W * 2) + (gd.ss_OTL)
    gd.ss_Point_perc = gd.ss_Points / (gd.ss_GP * 2)
        
    gd_pg['ss_Points'] = ss_Points
    gd_pg['ss_Point_perc'] = ss_Point_perc

    gd_pg.ss_Points = (gd_pg.ss_W * 2) + (gd_pg.ss_OTL)
    gd_pg.ss_Point_perc = gd_pg.ss_Points / (gd_pg.ss_GP * 2)
        
    return gd, gd_pg

# Moves the results to the front column.
def move_results(gd, gd_pg):
    result_list = list(gd.Result.values)
    gd = gd.drop('Result', axis = 1)
    gd.insert(0, 'Result', result_list)
    
    result_list = list(gd_pg.Result.values)
    gd_pg = gd_pg.drop('Result', axis = 1)
    gd_pg.insert(0, 'Result', result_list)

    return gd, gd_pg

def remove_away_games(gd, gd_pg, home):
    away_games = gd.loc[gd['Location'] == '@']
    loc_index = list(away_games.index.values)

    gd = gd.drop(loc_index)
    gd_pg = gd_pg.drop(loc_index)
    gd_pg = gd_pg.drop('Location', axis = 1)
    
    if(home == True):
        gd = gd.drop('Location', axis = 1)
        
        gd = gd.rename(columns = {'Team_Name' : 'Home', 'Opponent' : 'Visitors'})

    return gd, gd_pg


# This will shift the pregame columns down that need to so we don't have info we shouldn't before the game
# is played. Also,  we will remove columns with information we shouldn't have.
# This also converts NaN values to zeroes, as is intended with this dataset.
def pregame_cleanup(gd_pg):
    GP_list = list(gd_pg.GP.values)
    Goal_Diff_list = list(gd_pg.Goal_Diff.values)
    W_list = list(gd_pg.W.values)
    L_list = list(gd_pg.L.values)
    OTL_list = list(gd_pg.OTL.values)
    Points_list = list(gd_pg.Points.values)
    Point_perc_list = list(gd_pg.Point_perc.values)
    Streak_list = list(gd_pg.Streak.values)

    GP_list = [0] + GP_list
    GP_list.pop()
    Goal_Diff_list = [0] + Goal_Diff_list
    Goal_Diff_list.pop()
    W_list = [0] + W_list
    W_list.pop()
    L_list = [0] + L_list
    L_list.pop()
    OTL_list = [0] + OTL_list
    OTL_list.pop()
    Points_list = [0] + Points_list
    Points_list.pop()
    Point_perc_list = [0] + Point_perc_list
    Point_perc_list.pop()
    Streak_list = [0] + Streak_list
    Streak_list.pop()
    
    gd_pg.GP = GP_list
    gd_pg.Goal_Diff = Goal_Diff_list
    gd_pg.W = W_list
    gd_pg.L = L_list
    gd_pg.OTL = OTL_list
    gd_pg.Points = Points_list
    gd_pg.Point_perc = Point_perc_list
    gd_pg.Streak = Streak_list
    
    gd_pg = gd_pg.drop('Date', axis = 1)
    gd_pg = gd_pg.drop('OT_Result', axis = 1)
    gd_pg = gd_pg.drop('Team_Name', axis = 1)
    gd_pg = gd_pg.drop('Opponent', axis = 1)
    gd_pg = gd_pg.drop('GF', axis = 1)
    gd_pg = gd_pg.drop('GA', axis = 1)

    gd_pg['ss_Point_perc'] = gd_pg['ss_Point_perc'].fillna(0)
    
    return gd_pg

def log_reg_prediction(gd_pg):
    X1 = gd_pg.iloc[:,1:].values
    y1 = gd_pg.iloc[:,:1].values

    X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, random_state = 0)

    ts = np.shape(X_train_1)
    trs = np.shape(X_test_1)
        
    logreg = LogisticRegression()

    logreg.fit(X_train_1, y_train_1)

    y_pred = logreg.predict(X_test_1)

    cnf_matrix = metrics.confusion_matrix(y_test_1, y_pred)
    
    pred = (y_train_1.tolist() + y_pred.tolist())
    
    pred = [''.join(i) for i in pred]
    
    pred_type = []
    for i in range(0, len(gd_pg)):
        pred_type.append("Train")
    
    for i in range(len(y_train_1), len(gd_pg)):
        pred_type[i] = "Test"
    
    return ts, trs, cnf_matrix, pred, logreg, pred_type

## Parameters to Change
Don't forget to update your parameters! Here's a description:

**data_year:** This is the year of game data you are importing.<br>
**single_team_season:** Enter the full team name for a complete list of that team's games.<br>
**only_home:** This should be True if you ONLY want all of the home games.

In [3]:
data_year = '2015'
single_team_season = 'Nashville Predators'
only_home = True

In [4]:
team_list = ['Anaheim Ducks', 'Arizona Coyotes', 'Boston Bruins', 'Buffalo Sabres', 'Calgary Flames', 'Carolina Hurricanes',
'Chicago Blackhawks', 'Colorado Avalanche', 'Columbus Blue Jackets', 'Dallas Stars', 'Detroit Red Wings', 
'Edmonton Oilers', 'Florida Panthers', 'Los Angeles Kings', 'Minnesota Wild', 'Montreal Canadiens', 
'Nashville Predators', 'New Jersey Devils', 'New York Islanders', 'New York Rangers', 'Ottawa Senators', 
'Philadelphia Flyers', 'Pittsburgh Penguins', 'San Jose Sharks', 'St. Louis Blues', 'Tampa Bay Lightning', 
'Toronto Maple Leafs', 'Vancouver Canucks', 'Washington Capitals', 'Winnipeg Jets']

This will generate two data sets for one team's entire season schedule, one including the data from the game that is played on that row, and the other only containing data you will have before the game is played.

In [404]:
game_data = load_gd_file(data_year, single_team_season)
game_data = add_team_and_rename(game_data, single_team_season)
game_data = convert_streak_to_numeric(game_data)
game_data = add_points(game_data)
game_data, game_data_pregame = season_series(game_data)
game_data, game_data_pregame = move_results(game_data, game_data_pregame)
game_data_pregame = pregame_cleanup(game_data_pregame)
game_data, game_data_pregame = remove_away_games(game_data, game_data_pregame)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gd.Streak[i] = int(gd.Streak[i][gd.Streak[i].rindex(" ") + 1:])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gd.Streak[i] = int(gd.Streak[i][gd.Streak[i].rindex(" ") + 1:]) * -1


You may remove the hash and run these cells if you would like to see the game_data or game_data_pregame dataframes for the team and year specified in the cell above.

In [407]:
#game_data

In [378]:
#game_data_pregame

This one will take a little while to run. If your next cell doesn't run right away, don't worry: just wait a bit. Make some tea, enjoy the day.

In [5]:
game_data_total = pd.DataFrame()
game_data_total_pregame = pd.DataFrame()

for i in range(0, len(team_list)):
    game_data = load_gd_file(data_year, team_list[i])
    game_data = add_team_and_rename(game_data, team_list[i])
    game_data = convert_streak_to_numeric(game_data)
    game_data = add_points(game_data)
    game_data, game_data_pregame = season_series(game_data)
    game_data, game_data_pregame = move_results(game_data, game_data_pregame)
    game_data_pregame = pregame_cleanup(game_data_pregame)
    game_data, game_data_pregame = remove_away_games(game_data, game_data_pregame, only_home)
    
    game_data_total = pd.concat([game_data_total, game_data], axis = 0)
    game_data_total_pregame = pd.concat([game_data_total_pregame, game_data_pregame], axis = 0)
    
game_data_total = game_data_total.reset_index(drop = True)
game_data_total_pregame = game_data_total_pregame.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gd.Streak[i] = int(gd.Streak[i][gd.Streak[i].rindex(" ") + 1:]) * -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gd.Streak[i] = int(gd.Streak[i][gd.Streak[i].rindex(" ") + 1:])


Remove the hashes and run the cells below if you would like to see these dataframes.

In [8]:
#game_data_total

In [6]:
game_data_total_pregame

Unnamed: 0,Result,GP,Goal_Diff,W,L,Points,Point_perc,OTL,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,W,4,4,3,1,6,0.750000,0,3,0,0,0,0,0,0,0.000
1,W,5,5,4,1,8,0.800000,0,4,0,0,0,0,0,0,0.000
2,W,6,8,5,1,10,0.833333,0,5,1,1,0,0,4,2,1.000
3,W,7,11,6,1,12,0.857143,0,6,0,0,0,0,0,0,0.000
4,L,8,14,7,1,14,0.875000,0,7,0,0,0,0,0,0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,W,74,12,38,24,88,0.594595,12,-1,1,0,1,0,-3,0,0.000
1226,L,75,15,39,24,90,0.600000,12,1,4,3,0,1,6,7,0.875
1227,L,76,14,39,25,90,0.592105,12,-1,1,1,0,0,1,2,1.000
1228,W,77,13,39,26,90,0.584416,12,-2,2,0,1,1,-4,1,0.250


## Create Predictions
Now we will create predictions for whether a team will win or lose the game they are about to play.

The prediction will then be merged back into the game_data_total dataset, so you can compare the predictions vs. whether a team actually won or lost each game.

## TWO PREDICTION OPTIONS

### Option #1
This first cell will randomize the games before creating the prediction. This is useful if you only have 1 season worth of data, and your split is within a season. Randomizing the games will allow a spread of training and testing data across all teams, and not just training on some teams and testing on others

In [7]:
randseed = 73

game_data_total = game_data_total.sample(frac = 1, random_state = randseed)
game_data_total_pregame = game_data_total_pregame.sample(frac = 1, random_state = randseed)

test_shape, train_shape, cnf_matrix, prediction, log_reg_model, prediction_type = log_reg_prediction(game_data_total_pregame)

game_data_total.insert(0, 'Prediction', prediction)
game_data_total.insert(0, 'Prediction_Type', prediction_type)

game_data_total = game_data_total.sort_index()
game_data_total_pregame = game_data_total_pregame.sort_index()

print('Training Features Shape:', train_shape)
print('Test Features Shape:    ', test_shape)
print(cnf_matrix)

Training Features Shape: (308, 15)
Test Features Shape:     (922, 15)
[[ 47  87]
 [ 45 129]]


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Option #2
This will keep the games dataset as is before creating the predictions. This is useful if you have multiple years of data, and are splitting before the start of a season. For example, you have 2015 and 2016 games, so you will train on 2015 games and predict all games for 2016.

In [410]:
test_shape, train_shape, cnf_matrix, prediction, log_reg_model = log_reg_prediction(game_data_total_pregame)

print('Training Features Shape:', train_shape)
print('Test Features Shape:    ', test_shape)

game_data_total.insert(0, 'Prediction', prediction)
print(cnf_matrix)

Training Features Shape: (308, 15)
Test Features Shape:     (922, 15)
[[ 36 108]
 [ 31 133]]


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [481]:
game_data_total = game_data_total.drop('Prediction', axis = 1)
game_data_total = game_data_total.drop('Prediction_Type', axis = 1)

## Export Datasets we have Created
Remove the hash for any datasets you would like to have exported to Excel files.

In [14]:
game_data_total.to_excel('game_data_total_' + data_year + '.xlsx', sheet_name = 'game_data_total_' + data_year)

# How Did We Get Here?
Below will be a step-by-step of how the data is being sorted and filtered. This example will only apply to one team, and uses different dataframe names in order to keep the example separate from our data we are working with above.

## Sort and Filter Data

### Import Game Data and Create Game Logs with Desired Information

First, I'll create a list of every team we will be working with.

In [6]:
#team_list = ['Anaheim Ducks', 'Arizona Coyotes', 'Boston Bruins', 'Buffalo Sabres', 'Calgary Flames', 'Carolina Hurricanes',
#'Chicago Blackhawks', 'Colorado Avalanche', 'Columbus Blue Jackets', 'Dallas Stars', 'Detroit Red Wings', 
#'Edmonton Oilers', 'Florida Panthers', 'Los Angeles Kings', 'Minnesota Wild', 'Montreal Canadiens', 
#'Nashville Predators', 'New Jersey Devils', 'New York Islanders', 'New York Rangers', 'Ottawa Senators', 
#'Philadelphia Flyers', 'Pittsburgh Penguins', 'San Jose Sharks', 'St. Louis Blues', 'Tampa Bay Lightning', 
#'Toronto Maple Leafs', 'Vancouver Canucks', 'Washington Capitals', 'Winnipeg Jets']

The data we are importing will have two sheets for each team's games played per year. One is the game_data_year sheet, and the other is the game_data_year_log sheet. Unedited, they will look like what is shown below.

In [103]:
game_data_file = pd.ExcelFile("2015_game_data.xlsx")
game_data_15_team_ex = pd.read_excel(game_data_file, team_list[0] + "_15")
game_data_15_team_ex

Unnamed: 0,GP,Date,Unnamed: 2,Opponent,GF,GA,Unnamed: 6,Unnamed: 7,W,L,OL,Streak
0,1,2014-10-09,@,Pittsburgh Penguins,4,6,L,,0,1,0,L 1
1,2,2014-10-11,@,Detroit Red Wings,3,2,W,,1,1,0,W 1
2,3,2014-10-13,@,Buffalo Sabres,5,1,W,,2,1,0,W 2
3,4,2014-10-14,@,Philadelphia Flyers,4,3,W,SO,3,1,0,W 3
4,5,2014-10-17,,Minnesota Wild,2,1,W,,4,1,0,W 4
...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,@,New Jersey Devils,2,1,W,,49,22,7,W 3
78,79,2015-04-01,,Edmonton Oilers,5,1,W,,50,22,7,W 4
79,80,2015-04-03,,Colorado Avalanche,2,4,L,,50,23,7,L 1
80,81,2015-04-08,,Dallas Stars,0,4,L,,50,24,7,L 2


In [104]:
game_data_15_log_ex = pd.read_excel(game_data_file, team_list[0] + "_15_log")
game_data_15_log_ex

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Team,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
0,GP,Date,,Opponent,GF,GA,,,,S,...,CA,CF%,FF,FA,FF%,FOW,FOL,FO%,oZS%,PDO
1,1,2014-10-09 00:00:00,@,Pittsburgh Penguins,4,6,L,,,29,...,47,42,26,38,40.6,32,25,56.1,34.4,92.9
2,2,2014-10-11 00:00:00,@,Detroit Red Wings,3,2,W,,,26,...,42,46.8,26,27,49.1,25,17,59.5,51.7,104.3
3,3,2014-10-13 00:00:00,@,Buffalo Sabres,5,1,W,,,44,...,29,65.5,46,15,75.4,35,11,76.1,75.3,98
4,4,2014-10-14 00:00:00,@,Philadelphia Flyers,4,3,W,SO,,28,...,63,37,28,40,41.2,24,25,49,54.5,108.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,78,2015-03-29 00:00:00,@,New Jersey Devils,2,1,W,,,25,...,39,47.3,27,33,45,32,22,59.3,50,105.5
79,79,2015-04-01 00:00:00,,Edmonton Oilers,5,1,W,,,27,...,46,49.5,39,32,54.9,24,25,49,46.4,112.3
80,80,2015-04-03 00:00:00,,Colorado Avalanche,2,4,L,,,37,...,37,58.9,43,31,58.1,21,21,50,53.1,91.4
81,81,2015-04-08 00:00:00,,Dallas Stars,0,4,L,,,19,...,56,39.1,28,46,37.8,22,32,40.7,42.1,89.7


I'm going to start by manipulating the data to get all of the parameters I'm going to want to use for a single team, sorted and filtered as I would like.

In [131]:
game_data_15_team_ex = pd.read_excel(game_data_file, team_list[0] + "_15")

#This bit will add the name of the team whose game data we are importing to each game.
team_name = []
for i in range(0, len(game_data_15_team_ex)):{
    team_name.append(team_list[0])
}
game_data_15_team_ex.insert(2, 'Team_Name', team_name)

print(game_data_15_team_ex.columns)
game_data_15_team_ex

Index(['GP', 'Date', 'Team_Name', 'Unnamed: 2', 'Opponent', 'GF', 'GA',
       'Unnamed: 6', 'Unnamed: 7', 'W', 'L', 'OL', 'Streak'],
      dtype='object')


Unnamed: 0,GP,Date,Team_Name,Unnamed: 2,Opponent,GF,GA,Unnamed: 6,Unnamed: 7,W,L,OL,Streak
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,L,,0,1,0,L 1
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,W,,1,1,0,W 1
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,W,,2,1,0,W 2
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,W,SO,3,1,0,W 3
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,W,,4,1,0,W 4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,W,,49,22,7,W 3
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,W,,50,22,7,W 4
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,L,,50,23,7,L 1
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,L,,50,24,7,L 2


In [132]:
gd15_column_rename = {'Unnamed: 2': 'Location', 'Unnamed: 6': 'Result', 'Unnamed: 7': 'OT_Result', 
                      'OL' : 'OTL'}
game_data_15_team_ex = game_data_15_team_ex.rename(columns = gd15_column_rename)
game_data_15_team_ex

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Result,OT_Result,W,L,OTL,Streak
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,L,,0,1,0,L 1
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,W,,1,1,0,W 1
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,W,,2,1,0,W 2
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,W,SO,3,1,0,W 3
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,W,,4,1,0,W 4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,W,,49,22,7,W 3
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,W,,50,22,7,W 4
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,L,,50,23,7,L 1
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,L,,50,24,7,L 2


Let's convert streak to numeric values.

In [133]:
print(int(game_data_15_team_ex.Streak[2][game_data_15_team_ex.Streak[2].rindex(" ") + 1:]) * -1)

-2


In [134]:
for i in range(0, len(game_data_15_team_ex)):
    if(game_data_15_team_ex.Streak[i].startswith('L')):
        game_data_15_team_ex.Streak[i] = int(game_data_15_team_ex.Streak[i][game_data_15_team_ex.Streak[i].rindex(" ") + 1:]) * -1
    elif(game_data_15_team_ex.Streak[i].startswith('W')):
        game_data_15_team_ex.Streak[i] = int(game_data_15_team_ex.Streak[i][game_data_15_team_ex.Streak[i].rindex(" ") + 1:])
    else:
        game_data_15_team_ex.Streak[i] = 'ERROR'
        
game_data_15_team_ex        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_data_15_team_ex.Streak[i] = int(game_data_15_team_ex.Streak[i][game_data_15_team_ex.Streak[i].rindex(" ") + 1:]) * -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_data_15_team_ex.Streak[i] = int(game_data_15_team_ex.Streak[i][game_data_15_team_ex.Streak[i].rindex(" ") + 1:])


Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Result,OT_Result,W,L,OTL,Streak
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,L,,0,1,0,-1
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,W,,1,1,0,1
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,W,,2,1,0,2
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,W,SO,3,1,0,3
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,W,,4,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,W,,49,22,7,3
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,W,,50,22,7,4
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,L,,50,23,7,-1
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,L,,50,24,7,-2


We'll create an empty list for all of the columns we will eventually be adding values to.

In [135]:
ss_W = []
ss_L = []
ss_OTL = []
Points = []
Point_perc = []
Goal_Diff = []
ss_Points = []
ss_Point_perc = []
ss_GP = []
ss_Goal_Diff = []

for i in range(0, len(game_data_15_team_ex)):
    ss_W.append(0)
    ss_L.append(0)
    ss_OTL.append(0)
    Points.append(0)
    Point_perc.append(0)
    Goal_Diff.append(0)
    ss_Points.append(0)
    ss_Point_perc.append(0)
    ss_GP.append(0)
    ss_Goal_Diff.append(0)

Next we'll add points and point percentage.

In [136]:
game_data_15_team_ex.insert(12, 'Points', Points)
game_data_15_team_ex.insert(13, 'Point_perc', Point_perc)

game_data_15_team_ex.Points = (game_data_15_team_ex.W * 2) + (game_data_15_team_ex.OTL)
game_data_15_team_ex.Point_perc = game_data_15_team_ex.Points / (game_data_15_team_ex.GP * 2)

game_data_15_team_ex

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Result,OT_Result,W,L,OTL,Points,Point_perc,Streak
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,L,,0,1,0,0,0.000000,-1
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,W,,1,1,0,2,0.500000,1
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,W,,2,1,0,4,0.666667,2
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,W,SO,3,1,0,6,0.750000,3
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,W,,4,1,0,8,0.800000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,W,,49,22,7,105,0.673077,3
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,W,,50,22,7,107,0.677215,4
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,L,,50,23,7,107,0.668750,-1
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,L,,50,24,7,107,0.660494,-2


Goal Differential.

In [137]:
game_data_15_team_ex.insert(7, 'Goal_Diff', Goal_Diff)

for i in range(0, len(game_data_15_team_ex)):
    if i == 0:
        game_data_15_team_ex.at[i, 'Goal_Diff'] = (game_data_15_team_ex.at[(i), 'GF'] - game_data_15_team_ex.at[(i), 'GA'])
    else:
        game_data_15_team_ex.at[i, 'Goal_Diff'] = (game_data_15_team_ex.at[(i-1), 'Goal_Diff'] + game_data_15_team_ex.at[(i), 'GF'] - game_data_15_team_ex.at[(i), 'GA'])
        
game_data_15_team_ex

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,Result,OT_Result,W,L,OTL,Points,Point_perc,Streak
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,L,,0,1,0,0,0.000000,-1
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,W,,1,1,0,2,0.500000,1
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,W,,2,1,0,4,0.666667,2
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,W,SO,3,1,0,6,0.750000,3
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,W,,4,1,0,8,0.800000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,W,,49,22,7,105,0.673077,3
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,W,,50,22,7,107,0.677215,4
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,L,,50,23,7,107,0.668750,-1
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,L,,50,24,7,107,0.660494,-2


Now we'll calculate the season series against each team.

In [138]:
game_data_15_team_ex['ss_GP'] = ss_GP
game_data_15_team_ex['ss_W'] = ss_W
game_data_15_team_ex['ss_L'] = ss_L
game_data_15_team_ex['ss_OTL'] = ss_OTL
game_data_15_team_ex['ss_Goal_Diff'] = ss_Goal_Diff

game_data_15_team_ex

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,Result,OT_Result,...,L,OTL,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,L,,...,1,0,0,0.000000,-1,0,0,0,0,0
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,W,,...,1,0,2,0.500000,1,0,0,0,0,0
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,W,,...,1,0,4,0.666667,2,0,0,0,0,0
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,W,SO,...,1,0,6,0.750000,3,0,0,0,0,0
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,W,,...,1,0,8,0.800000,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,W,,...,22,7,105,0.673077,3,0,0,0,0,0
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,W,,...,22,7,107,0.677215,4,0,0,0,0,0
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,L,,...,23,7,107,0.668750,-1,0,0,0,0,0
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,L,,...,24,7,107,0.660494,-2,0,0,0,0,0


In [233]:
#season_series_calc = game_data_15_team_ex.loc[game_data_15_team_ex['Opponent'] == team_list[1]]

#indexing will be completely off if I attempt to select as is, so I make the index 0-4, and change it back.
#index_list = list(season_series_calc.index.values)
#season_series_calc.index = range(len(season_series_calc))

#for i in range(0, len(season_series_calc)):
#    season_series_calc.at[i, 'ss_GP'] = (i+1)
#    if(i>0):
#        season_series_calc.at[i, 'ss_OTL'] = (season_series_calc.at[(i-1), 'ss_OTL'])
#        season_series_calc.at[i, 'ss_W'] = (season_series_calc.at[(i-1), 'ss_W'])
#        season_series_calc.at[i, 'ss_L'] = (season_series_calc.at[(i-1), 'ss_L'])
       
#    if((season_series_calc.loc[i].OT_Result == 'SO' or season_series_calc.loc[i].OT_Result == 'OT') & (season_series_calc.loc[i].Result == 'L')):
#        season_series_calc.at[i, 'ss_OTL'] = (season_series_calc.loc[i, 'ss_OTL'] + 1)
#    elif(season_series_calc.loc[i].Result == 'L'):
#        season_series_calc.at[i, 'ss_L'] = (season_series_calc.loc[i, 'ss_L'] + 1)
#    else:
#        season_series_calc.at[i, 'ss_W'] = (season_series_calc.loc[i, 'ss_W'] + 1)
    
#season_series_calc.index = index_list
#season_series_calc

In [167]:
#for i in range(0, len(index_list)):
#    game_data_15_team_ex.at[index_list[i], 'ss_OTL'] = season_series_calc.loc[index_list[i], 'ss_OTL']
#    game_data_15_team_ex.at[index_list[i], 'ss_W'] = season_series_calc.loc[index_list[i], 'ss_W']
#    game_data_15_team_ex.at[index_list[i], 'ss_L'] = season_series_calc.loc[index_list[i], 'ss_L']
#game_data_15_team_ex

Now I am going to create a new data set for the pregame stats. This will be helpful later when making predictions. I'm going to continue fleshing out both tables side by side, with the original containing all data including results as it currently does, while also building this new data set that will only have pregame stats. Calculations will be slightly different from here on. 

In [139]:
game_data_15_team_ex_pregame = game_data_15_team_ex.copy()

In [140]:
for j in range(0, len(team_list)):
    season_series_calc = game_data_15_team_ex.loc[game_data_15_team_ex['Opponent'] == team_list[j]]

#indexing will be completely off if I attempt to select as is, so I make the index 0-4, and change it back.
    index_list = list(season_series_calc.index.values)
    season_series_calc.index = range(len(season_series_calc))

    for i in range(0, len(season_series_calc)): 
        season_series_calc.at[i, 'ss_GP'] = (i+1)
        if(i==0):
            season_series_calc.at[i, 'ss_Goal_Diff'] = (season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
        elif(i>0):
            season_series_calc.at[i, 'ss_Goal_Diff'] = (season_series_calc.at[(i-1), 'ss_Goal_Diff'] + season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
            season_series_calc.at[i, 'ss_OTL'] = (season_series_calc.at[(i-1), 'ss_OTL'])
            season_series_calc.at[i, 'ss_W'] = (season_series_calc.at[(i-1), 'ss_W'])
            season_series_calc.at[i, 'ss_L'] = (season_series_calc.at[(i-1), 'ss_L'])
        
        if((season_series_calc.loc[i].OT_Result == 'SO' or season_series_calc.loc[i].OT_Result == 'OT') & (season_series_calc.loc[i].Result == 'L')):
            season_series_calc.at[i, 'ss_OTL'] = (season_series_calc.loc[i, 'ss_OTL'] + 1)
        elif(season_series_calc.loc[i].Result == 'L'):
            season_series_calc.at[i, 'ss_L'] = (season_series_calc.loc[i, 'ss_L'] + 1)
        else:
            season_series_calc.at[i, 'ss_W'] = (season_series_calc.loc[i, 'ss_W'] + 1)
        
    season_series_calc.index = index_list

    for i in range(0, len(index_list)):
        game_data_15_team_ex.at[index_list[i], 'ss_Goal_Diff'] = season_series_calc.loc[index_list[i], 'ss_Goal_Diff']
        game_data_15_team_ex.at[index_list[i], 'ss_GP'] = season_series_calc.loc[index_list[i], 'ss_GP']
        game_data_15_team_ex.at[index_list[i], 'ss_OTL'] = season_series_calc.loc[index_list[i], 'ss_OTL']
        game_data_15_team_ex.at[index_list[i], 'ss_W'] = season_series_calc.loc[index_list[i], 'ss_W']
        game_data_15_team_ex.at[index_list[i], 'ss_L'] = season_series_calc.loc[index_list[i], 'ss_L']

game_data_15_team_ex

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,Result,OT_Result,...,L,OTL,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,L,,...,1,0,0,0.000000,-1,1,0,1,0,-2
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,W,,...,1,0,2,0.500000,1,1,1,0,0,1
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,W,,...,1,0,4,0.666667,2,1,1,0,0,4
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,W,SO,...,1,0,6,0.750000,3,1,1,0,0,1
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,W,,...,1,0,8,0.800000,4,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,W,,...,22,7,105,0.673077,3,2,2,0,0,5
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,W,,...,22,7,107,0.677215,4,4,4,0,0,8
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,L,,...,23,7,107,0.668750,-1,3,2,1,0,0
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,L,,...,24,7,107,0.660494,-2,3,2,1,0,-1


Here we calculate the season series results to line up with the next time they play the specific opponent. For example, if Anaheim plays Pittsburgh, it will show 0 for all stats the first matchup. Their second matchup, they will see 1GP between the two teams, and it will reflect the record and goal differential only up to the last game they played.

In [141]:
for j in range(0, len(team_list)):
    season_series_calc = game_data_15_team_ex_pregame.loc[game_data_15_team_ex_pregame['Opponent'] == team_list[j]]

#indexing will be completely off if I attempt to select as is, so I make the index 0-4, and change it back.
    index_list = list(season_series_calc.index.values)
    season_series_calc.index = range(len(season_series_calc))

    for i in range(0, len(season_series_calc) - 1): 
        season_series_calc.at[i+1, 'ss_GP'] = (i+1)
        if(i==0):
            season_series_calc.at[i+1, 'ss_Goal_Diff'] = (season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
        elif(i>0):
            season_series_calc.at[i+1, 'ss_Goal_Diff'] = (season_series_calc.at[(i), 'ss_Goal_Diff'] + season_series_calc.at[(i), 'GF'] - season_series_calc.at[(i), 'GA'])
            season_series_calc.at[i+1, 'ss_OTL'] = (season_series_calc.at[(i), 'ss_OTL'])
            season_series_calc.at[i+1, 'ss_W'] = (season_series_calc.at[(i), 'ss_W'])
            season_series_calc.at[i+1, 'ss_L'] = (season_series_calc.at[(i), 'ss_L'])
        
        if((season_series_calc.loc[i].OT_Result == 'SO' or season_series_calc.loc[i].OT_Result == 'OT') & (season_series_calc.loc[i].Result == 'L')):
            season_series_calc.at[i+1, 'ss_OTL'] = (season_series_calc.loc[i, 'ss_OTL'] + 1)
        elif(season_series_calc.loc[i].Result == 'L'):
            season_series_calc.at[i+1, 'ss_L'] = (season_series_calc.loc[i, 'ss_L'] + 1)
        else:
            season_series_calc.at[i+1, 'ss_W'] = (season_series_calc.loc[i, 'ss_W'] + 1)
        
    season_series_calc.index = index_list

    for i in range(0, len(index_list)):
        game_data_15_team_ex_pregame.at[index_list[i], 'ss_Goal_Diff'] = season_series_calc.loc[index_list[i], 'ss_Goal_Diff']
        game_data_15_team_ex_pregame.at[index_list[i], 'ss_GP'] = season_series_calc.loc[index_list[i], 'ss_GP']
        game_data_15_team_ex_pregame.at[index_list[i], 'ss_OTL'] = season_series_calc.loc[index_list[i], 'ss_OTL']
        game_data_15_team_ex_pregame.at[index_list[i], 'ss_W'] = season_series_calc.loc[index_list[i], 'ss_W']
        game_data_15_team_ex_pregame.at[index_list[i], 'ss_L'] = season_series_calc.loc[index_list[i], 'ss_L']
    
game_data_15_team_ex_pregame

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,Result,OT_Result,...,L,OTL,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,L,,...,1,0,0,0.000000,-1,0,0,0,0,0
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,W,,...,1,0,2,0.500000,1,0,0,0,0,0
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,W,,...,1,0,4,0.666667,2,0,0,0,0,0
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,W,SO,...,1,0,6,0.750000,3,0,0,0,0,0
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,W,,...,1,0,8,0.800000,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,W,,...,22,7,105,0.673077,3,1,1,0,0,4
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,W,,...,22,7,107,0.677215,4,3,3,0,0,4
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,L,,...,23,7,107,0.668750,-1,2,2,0,0,2
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,L,,...,24,7,107,0.660494,-2,2,2,0,0,3


If I want to see a season series example...

In [287]:
#season_series_calc

Now we will add points and point percentages.

In [142]:
game_data_15_team_ex['ss_Points'] = ss_Points
game_data_15_team_ex['ss_Point_perc'] = ss_Point_perc

game_data_15_team_ex.ss_Points = (game_data_15_team_ex.ss_W * 2) + (game_data_15_team_ex.ss_OTL)

game_data_15_team_ex.ss_Point_perc = game_data_15_team_ex.ss_Points / (game_data_15_team_ex.ss_GP * 2)

game_data_15_team_ex

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,Result,OT_Result,...,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,L,,...,0,0.000000,-1,1,0,1,0,-2,0,0.000000
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,W,,...,2,0.500000,1,1,1,0,0,1,2,1.000000
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,W,,...,4,0.666667,2,1,1,0,0,4,2,1.000000
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,W,SO,...,6,0.750000,3,1,1,0,0,1,2,1.000000
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,W,,...,8,0.800000,4,1,1,0,0,1,2,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,W,,...,105,0.673077,3,2,2,0,0,5,4,1.000000
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,W,,...,107,0.677215,4,4,4,0,0,8,8,1.000000
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,L,,...,107,0.668750,-1,3,2,1,0,0,4,0.666667
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,L,,...,107,0.660494,-2,3,2,1,0,-1,4,0.666667


In [143]:
game_data_15_team_ex_pregame['ss_Points'] = ss_Points
game_data_15_team_ex_pregame['ss_Point_perc'] = ss_Point_perc

game_data_15_team_ex_pregame.ss_Points = (game_data_15_team_ex_pregame.ss_W * 2) + (game_data_15_team_ex_pregame.ss_OTL)

game_data_15_team_ex_pregame.ss_Point_perc = game_data_15_team_ex_pregame.ss_Points / (game_data_15_team_ex_pregame.ss_GP * 2)

game_data_15_team_ex_pregame

Unnamed: 0,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,Result,OT_Result,...,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,L,,...,0,0.000000,-1,0,0,0,0,0,0,
1,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,W,,...,2,0.500000,1,0,0,0,0,0,0,
2,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,W,,...,4,0.666667,2,0,0,0,0,0,0,
3,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,W,SO,...,6,0.750000,3,0,0,0,0,0,0,
4,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,W,,...,8,0.800000,4,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,W,,...,105,0.673077,3,1,1,0,0,4,2,1.00
78,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,W,,...,107,0.677215,4,3,3,0,0,4,6,1.00
79,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,L,,...,107,0.668750,-1,2,2,0,0,2,4,1.00
80,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,L,,...,107,0.660494,-2,2,2,0,0,3,4,1.00


Here we move the results to the left hand column so we can easily see the game results.

In [144]:
result_list = list(game_data_15_team_ex.Result.values)
game_data_15_team_ex = game_data_15_team_ex.drop('Result', axis = 1)
game_data_15_team_ex.insert(0, 'Result', result_list)
game_data_15_team_ex

Unnamed: 0,Result,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,OT_Result,...,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,L,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,,...,0,0.000000,-1,1,0,1,0,-2,0,0.000000
1,W,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,,...,2,0.500000,1,1,1,0,0,1,2,1.000000
2,W,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,,...,4,0.666667,2,1,1,0,0,4,2,1.000000
3,W,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,SO,...,6,0.750000,3,1,1,0,0,1,2,1.000000
4,W,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,,...,8,0.800000,4,1,1,0,0,1,2,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,W,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,,...,105,0.673077,3,2,2,0,0,5,4,1.000000
78,W,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,,...,107,0.677215,4,4,4,0,0,8,8,1.000000
79,L,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,,...,107,0.668750,-1,3,2,1,0,0,4,0.666667
80,L,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,,...,107,0.660494,-2,3,2,1,0,-1,4,0.666667


In [145]:
result_list = list(game_data_15_team_ex_pregame.Result.values)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('Result', axis = 1)
game_data_15_team_ex_pregame.insert(0, 'Result', result_list)
game_data_15_team_ex_pregame

Unnamed: 0,Result,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,OT_Result,...,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,L,1,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,-2,,...,0,0.000000,-1,0,0,0,0,0,0,
1,W,2,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,-1,,...,2,0.500000,1,0,0,0,0,0,0,
2,W,3,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,3,,...,4,0.666667,2,0,0,0,0,0,0,
3,W,4,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,4,SO,...,6,0.750000,3,0,0,0,0,0,0,
4,W,5,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,5,,...,8,0.800000,4,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,W,78,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,11,,...,105,0.673077,3,1,1,0,0,4,2,1.00
78,W,79,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,15,,...,107,0.677215,4,3,3,0,0,4,6,1.00
79,L,80,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,13,,...,107,0.668750,-1,2,2,0,0,2,4,1.00
80,L,81,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,9,,...,107,0.660494,-2,2,2,0,0,3,4,1.00


In [24]:
print(game_data_15_team_ex_pregame.columns)

Index(['Result', 'GP', 'Date', 'Team_Name', 'Location', 'Opponent', 'GF', 'GA',
       'Goal_Diff', 'OT_Result', 'W', 'L', 'OTL', 'Points', 'Point_perc',
       'Streak', 'ss_GP', 'ss_W', 'ss_L', 'ss_OTL', 'ss_Goal_Diff',
       'ss_Points', 'ss_Point_perc'],
      dtype='object')


For our pregame dataset, se need to shift GP, Goal_Diff, W, L, OTL, points, Point_perc, and Streak down one row. Essentially we are making the stats we have access to only available for games that have been played already, as has already been done with the season series.

In [146]:
GP_list = list(game_data_15_team_ex_pregame.GP.values)
Goal_Diff_list = list(game_data_15_team_ex_pregame.Goal_Diff.values)
W_list = list(game_data_15_team_ex_pregame.W.values)
L_list = list(game_data_15_team_ex_pregame.L.values)
OTL_list = list(game_data_15_team_ex_pregame.OTL.values)
Points_list = list(game_data_15_team_ex_pregame.Points.values)
Point_perc_list = list(game_data_15_team_ex_pregame.Point_perc.values)
Streak_list = list(game_data_15_team_ex_pregame.Streak.values)

GP_list = [0] + GP_list
GP_list.pop()

Goal_Diff_list = [0] + GP_list
Goal_Diff_list.pop()

W_list = [0] + W_list
W_list.pop()

L_list = [0] + L_list
L_list.pop()

OTL_list = [0] + OTL_list
OTL_list.pop()

Points_list = [0] + Points_list
Points_list.pop()

Point_perc_list = [0] + Point_perc_list
Point_perc_list.pop()

Streak_list = [0] + Streak_list
Streak_list.pop()

print(GP_list)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]


In [147]:
game_data_15_team_ex_pregame.GP = GP_list
game_data_15_team_ex_pregame.Goal_Diff = Goal_Diff_list
game_data_15_team_ex_pregame.W = W_list
game_data_15_team_ex_pregame.L = L_list
game_data_15_team_ex_pregame.OTL = OTL_list
game_data_15_team_ex_pregame.Points = Points_list
game_data_15_team_ex_pregame.Point_perc = Point_perc_list
game_data_15_team_ex_pregame.Streak = Streak_list

game_data_15_team_ex_pregame

Unnamed: 0,Result,GP,Date,Team_Name,Location,Opponent,GF,GA,Goal_Diff,OT_Result,...,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,L,0,2014-10-09,Anaheim Ducks,@,Pittsburgh Penguins,4,6,0,,...,0,0.000000,0,0,0,0,0,0,0,
1,W,1,2014-10-11,Anaheim Ducks,@,Detroit Red Wings,3,2,0,,...,0,0.000000,-1,0,0,0,0,0,0,
2,W,2,2014-10-13,Anaheim Ducks,@,Buffalo Sabres,5,1,1,,...,2,0.500000,1,0,0,0,0,0,0,
3,W,3,2014-10-14,Anaheim Ducks,@,Philadelphia Flyers,4,3,2,SO,...,4,0.666667,2,0,0,0,0,0,0,
4,W,4,2014-10-17,Anaheim Ducks,,Minnesota Wild,2,1,3,,...,6,0.750000,3,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,W,77,2015-03-29,Anaheim Ducks,@,New Jersey Devils,2,1,76,,...,103,0.668831,2,1,1,0,0,4,2,1.00
78,W,78,2015-04-01,Anaheim Ducks,,Edmonton Oilers,5,1,77,,...,105,0.673077,3,3,3,0,0,4,6,1.00
79,L,79,2015-04-03,Anaheim Ducks,,Colorado Avalanche,2,4,78,,...,107,0.677215,4,2,2,0,0,2,4,1.00
80,L,80,2015-04-08,Anaheim Ducks,,Dallas Stars,0,4,79,,...,107,0.668750,-1,2,2,0,0,3,4,1.00


Now we will concatenate additional game log data to our dataframe.

In [290]:
#game_data_file = pd.ExcelFile("2015_game_data.xlsx")
#game_data_15_log = pd.read_excel(game_data_file, team_list[0] + "_15_log", header = 1)
#game_data_15_log = game_data_15_log.drop(['GP', 'Opponent', 'GF', 'GA', 'Unnamed: 2', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 14', 'Unnamed: 20'], axis = 1)
#print(game_data_15_log.columns)
#game_data_15_log

# Modelling

Going to try to create a basic Logistic Regression model now with what I have.

To create the model, I'm going to need to adjust my data to make sure I don't use any information I shouldn't already know about a game.

In [400]:
#game_data_15_team_ex = game_data_15_team_ex.drop('Date', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('Location', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('OT_Result', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('Team_Name', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('Opponent', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_GP', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_W', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_L', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_OTL', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_Goal_Diff', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_Points', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_Point_perc', axis = 1)

In [148]:
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('Date', axis = 1)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('Location', axis = 1)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('OT_Result', axis = 1)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('Team_Name', axis = 1)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('Opponent', axis = 1)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('GF', axis = 1)
game_data_15_team_ex_pregame = game_data_15_team_ex_pregame.drop('GA', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_GP', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_W', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_L', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_OTL', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_Goal_Diff', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_Points', axis = 1)
#game_data_15_team_ex = game_data_15_team_ex.drop('ss_Point_perc', axis = 1)
game_data_15_team_ex_pregame

Unnamed: 0,Result,GP,Goal_Diff,W,L,OTL,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,L,0,0,0,0,0,0,0.000000,0,0,0,0,0,0,0,
1,W,1,0,0,1,0,0,0.000000,-1,0,0,0,0,0,0,
2,W,2,1,1,1,0,2,0.500000,1,0,0,0,0,0,0,
3,W,3,2,2,1,0,4,0.666667,2,0,0,0,0,0,0,
4,W,4,3,3,1,0,6,0.750000,3,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,W,77,76,48,22,7,103,0.668831,2,1,1,0,0,4,2,1.00
78,W,78,77,49,22,7,105,0.673077,3,3,3,0,0,4,6,1.00
79,L,79,78,50,22,7,107,0.677215,4,2,2,0,0,2,4,1.00
80,L,80,79,50,23,7,107,0.668750,-1,2,2,0,0,3,4,1.00


In [83]:
result_list = list(game_data_15_team_ex.Result.values)
first_game = [0,0,0,0,0,0,0,0,0,0,0]


#top_row = pd.DataFrame({'Result':[0],'GP':[0],'GF':[0],'GA':[0],'Goal_Diff':[0],'W':[0],'L':[0],'OTL':[0],'Points':[0],'Point_perc':[0],'Streak':[0]})
#game_data_15_team_ex = pd.concat([top_row, game_data_15_team_ex]).reset_index(drop = True)


#game_data_15_team_ex = game_data_15_team_ex.drop([82], axis = 0)
#game_data_15_team_ex.Result = result_list
print(result_list)

game_data_15_team_ex

[0, 'L', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'L', 'W', 'L', 'W', 'W', 'L', 'L', 'L', 'W', 'L', 'L', 'L', 'W', 'W', 'W', 'L', 'L', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'L', 'W', 'L', 'W', 'L', 'W', 'L', 'W', 'W', 'L', 'W', 'W', 'W', 'W', 'W', 'W', 'L', 'L', 'W', 'W', 'L', 'L', 'L', 'W', 'L', 'L', 'W', 'W', 'W', 'L', 'W', 'W', 'W', 'W', 'L', 'L', 'L', 'W', 'W', 'W', 'W', 'L', 'L', 'W', 'W', 'W', 'W', 'L', 'L']


Unnamed: 0,Result,GP,GF,GA,Goal_Diff,W,L,OTL,Points,Point_perc,Streak
0,0,0,0,0,0,0,0,0,0,0.000000,0
1,L,1,4,6,-2,0,1,0,0,0.000000,L 1
2,W,2,3,2,-1,1,1,0,2,0.500000,W 1
3,W,3,5,1,3,2,1,0,4,0.666667,W 2
4,W,4,4,3,4,3,1,0,6,0.750000,W 3
...,...,...,...,...,...,...,...,...,...,...,...
77,W,77,3,2,10,48,22,7,103,0.668831,W 2
78,W,78,2,1,11,49,22,7,105,0.673077,W 3
79,W,79,5,1,15,50,22,7,107,0.677215,W 4
80,L,80,2,4,13,50,23,7,107,0.668750,L 1


In [325]:
game_data_15_team_ex = game_data_15_team_ex.drop('Streak', axis = 1)

Now we'll start getting things ready for our predictions

In [150]:
game_data_15_team_ex_pregame['ss_Point_perc'] = game_data_15_team_ex_pregame['ss_Point_perc'].fillna(0)
game_data_15_team_ex_pregame

Unnamed: 0,Result,GP,Goal_Diff,W,L,OTL,Points,Point_perc,Streak,ss_GP,ss_W,ss_L,ss_OTL,ss_Goal_Diff,ss_Points,ss_Point_perc
0,L,0,0,0,0,0,0,0.000000,0,0,0,0,0,0,0,0.00
1,W,1,0,0,1,0,0,0.000000,-1,0,0,0,0,0,0,0.00
2,W,2,1,1,1,0,2,0.500000,1,0,0,0,0,0,0,0.00
3,W,3,2,2,1,0,4,0.666667,2,0,0,0,0,0,0,0.00
4,W,4,3,3,1,0,6,0.750000,3,0,0,0,0,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,W,77,76,48,22,7,103,0.668831,2,1,1,0,0,4,2,1.00
78,W,78,77,49,22,7,105,0.673077,3,3,3,0,0,4,6,1.00
79,L,79,78,50,22,7,107,0.677215,4,2,2,0,0,2,4,1.00
80,L,80,79,50,23,7,107,0.668750,-1,2,2,0,0,3,4,1.00


In [330]:
X1 = game_data_15_team_ex.iloc[:,1:].values
y1 = game_data_15_team_ex.iloc[:,:1].values

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, random_state = 1)

print('Training Features Shape:', np.shape(X_train_1))
print('Test Features Shape:    ', np.shape(X_test_1))



Training Features Shape: (61, 16)
Test Features Shape:     (21, 16)


In [151]:
X1 = game_data_15_team_ex_pregame.iloc[:,1:].values
y1 = game_data_15_team_ex_pregame.iloc[:,:1].values

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, random_state = 1)

print('Training Features Shape:', np.shape(X_train_1))
print('Test Features Shape:    ', np.shape(X_test_1))

Training Features Shape: (61, 15)
Test Features Shape:     (21, 15)


In [152]:
logreg = LogisticRegression()

logreg.fit(X_train_1, y_train_1)

y_pred = logreg.predict(X_test_1)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [153]:
from sklearn import metrics

cnf_matrix = metrics.confusion_matrix(y_test_1, y_pred)
cnf_matrix

array([[ 3,  2],
       [ 6, 10]], dtype=int64)

In [154]:
print((y1))

[['L']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['L']
 ['W']
 ['L']
 ['W']
 ['W']
 ['L']
 ['L']
 ['L']
 ['W']
 ['L']
 ['L']
 ['L']
 ['W']
 ['W']
 ['W']
 ['L']
 ['L']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['L']
 ['W']
 ['L']
 ['W']
 ['L']
 ['W']
 ['L']
 ['W']
 ['W']
 ['L']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['W']
 ['L']
 ['L']
 ['W']
 ['W']
 ['L']
 ['L']
 ['L']
 ['W']
 ['L']
 ['L']
 ['W']
 ['W']
 ['W']
 ['L']
 ['W']
 ['W']
 ['W']
 ['W']
 ['L']
 ['L']
 ['L']
 ['W']
 ['W']
 ['W']
 ['W']
 ['L']
 ['L']
 ['W']
 ['W']
 ['W']
 ['W']
 ['L']
 ['L']
 ['W']]


Thank you for making it this far! Bear in mind, some things have changed or been adjusted when building out the functions we are actually using due to scalability. I hope you have enjoyed this project as much as I have been so far.