In [11]:
import pandas as pd
import numpy as np
#import utils
import io
import itertools
import matplotlib.pyplot as plt
import time
import math
import unidecode
import requests
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats
from scipy.stats import norm, skew, pearsonr #for some statistics
from bs4 import BeautifulSoup
import re
import sys, getopt
import csv
from collections import Counter
import warnings

# `IPython.display` is the supported public location of these helpers;
# `IPython.core.display` is a deprecated import path for them.
from IPython.display import display, HTML

# Show (practically) every column of wide frames. The original cell set this
# option twice (None, then 999); only the last assignment took effect, so that
# effective value is the one kept here.
pd.options.display.max_columns = 999

# NOTE(review): this silences *all* warnings for the whole notebook, including
# pandas and scikit-learn deprecation notices.
warnings.filterwarnings('ignore')

# Widen the classic-notebook container so wide tables are readable.
display(HTML("<style>.container { width:95% !important; }</style>"))

### Function to scrape FBRef for the last five games
### Functions to make finished spreadsheets for plain xG and p90 xG, respectively

In [12]:
def _cell_to_float(row, feature):
    """Return the number held in ``row``'s ``<td data-stat=feature>`` cell.

    A blank cell, or a cell that is absent from the row, counts as 0.0.
    """
    cell = row.find("td", {"data-stat": feature})
    text = "" if cell is None else cell.text.strip()
    return float(text) if text else 0.0


def scrapeURL(url, homeoraway):
    """Scrape one team's per-player stats from a match-report page.

    Parameters
    ----------
    url : str
        Address of the match-report page to download.
    homeoraway : str
        ``"home"`` or ``"away"``: which side's tables to read.

    Returns
    -------
    pandas.DataFrame
        One row per player, indexed by (ASCII-transliterated) player name and
        sorted by that name, with float columns ``shirtnumber, minutes, goals,
        assists, shots_total, shots_on_target, xg, xa, assisted_shots,
        passes_completed, crosses, fouled, fouls, tackles_won, interceptions,
        cards_yellow``.

    Raises
    ------
    ValueError
        If ``homeoraway`` is neither ``"home"`` nor ``"away"``. This is checked
        before any network traffic; previously an error string was returned
        after the download, which made callers fail later on ``.index``.
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has fewer ``<tbody>`` tables than expected, or the passing
        or misc table has fewer rows than the summary table.
    """
    # Positions of the (summary, passing, misc) <tbody> elements for each side.
    # These offsets are tied to the page layout at the time of writing.
    table_positions = {"home": (0, 1, 5), "away": (7, 8, 12)}
    if homeoraway not in table_positions:
        raise ValueError("Have to supply home or away")

    res = requests.get(url, timeout=30)
    res.raise_for_status()

    ## The next two lines get around the issue with comments breaking the parsing
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    all_tables = soup.find_all("tbody")

    summary_pos, passing_pos, misc_pos = table_positions[homeoraway]
    needed = max(summary_pos, passing_pos, misc_pos) + 1
    if len(all_tables) < needed:
        raise IndexError("expected at least %d <tbody> tables at %s, found %d"
                         % (needed, url, len(all_tables)))

    features_from_summary = ["shirtnumber", "minutes", "goals", "assists", "shots_total", "shots_on_target", "xg", "xa"]
    features_from_passing = ["assisted_shots", "passes_completed"]
    features_from_misc = ["crosses", "fouled", "fouls", "tackles_won", "interceptions", "cards_yellow"]

    rows_summary = all_tables[summary_pos].find_all('tr')
    rows_passing = all_tables[passing_pos].find_all('tr')
    rows_misc = all_tables[misc_pos].find_all('tr')

    records = []
    for row_number, summary_row in enumerate(rows_summary):
        name_cell = summary_row.find("th", {"data-stat": "player"})
        if name_cell is None:
            # Not a player row (no name header cell): nothing to record.
            continue

        record = {"player": unidecode.unidecode(name_cell.text.strip())}
        # The three tables are read in lock-step: row k of each table is taken
        # to describe the same player, exactly as the original loop assumed.
        sources = (
            (summary_row, features_from_summary),
            (rows_passing[row_number], features_from_passing),
            (rows_misc[row_number], features_from_misc),
        )
        for source_row, features in sources:
            for feature in features:
                record[feature] = _cell_to_float(source_row, feature)
        records.append(record)

    # Explicit columns keep the frame well-formed even when no rows were found.
    columns = ["player"] + features_from_summary + features_from_passing + features_from_misc
    df_player = pd.DataFrame(records, columns=columns)
    return df_player.set_index("player").sort_index()
        
        

def make_finished_spreadsheet(starters, fav_abbrev, notfav_abbrev, fav_odds, notfav_odds, fav_goal_odds, notfav_goal_odds, fav_team_shot_proj, notfav_team_shot_proj, fav_team_SoT_proj, notfav_team_SoT_proj):
    starters["90s"] = starters["90s"].astype('float')
    for i in range(len(starters)):
        if starters["90s"][i] > 67.5:
            starters["90s"][i] = starters["90s"][i]/90
        else:
            starters["90s"][i] = 0.75
    starters["Floor"] = (starters["Sh"] + starters["SoT"] + starters["Crs"]*0.7 + starters["KP"] + starters["Pass_Cmp"]*0.02 + starters["Fld"] - starters["Fls"]*0.5 + starters["TklW"] + starters["Int"]*0.5) / starters["90s"]
    starters["FPTS"] = (starters["Gls"]*10 + starters["Ast"]*6 + starters["Sh"] + starters["SoT"] + starters["Crs"]*0.7 + starters["KP"] + starters["Pass_Cmp"]*0.02 + starters["Fld"] - starters["Fls"]*0.5 + starters["TklW"] + starters["Int"]*0.5 - starters["CrdY"]*1.5) / starters["90s"]

    # xG Old
    fav_xG = sum(starters[starters["Team"] == fav_abbrev]["xG"])
    notfav_xG = sum(starters[starters["Team"] == notfav_abbrev]["xG"])
    starters["Team_xG"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_xG"][i] = fav_xG
        else:
            starters["Team_xG"][i] = notfav_xG
    starters["xG_Share"] = starters["xG"] / starters["Team_xG"]
    starters["Proj_Gls"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Proj_Gls"][i] = starters["xG_Share"][i] * fav_goal_odds
        else:
            starters["Proj_Gls"][i] = starters["xG_Share"][i] * notfav_goal_odds
    
    # xA
    fav_GA_ratio = sum(starters[starters["Team"] == fav_abbrev]["xA"]) / sum(starters[starters["Team"] == fav_abbrev]["xG"])
    notfav_GA_ratio = sum(starters[starters["Team"] == notfav_abbrev]["xA"]) / sum(starters[starters["Team"] == notfav_abbrev]["xG"])
    fav_ast_odds = fav_goal_odds * fav_GA_ratio
    notfav_ast_odds = notfav_goal_odds * notfav_GA_ratio
    fav_xA = sum(starters[starters["Team"] == fav_abbrev]["xA"])
    notfav_xA = sum(starters[starters["Team"] == notfav_abbrev]["xA"])
    starters["Team_xA"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_xA"][i] = fav_xA
        else:
            starters["Team_xA"][i] = notfav_xA
    starters["xA_Share"] = starters["xA"] / starters["Team_xA"]
    starters["Proj_Ast"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Proj_Ast"][i] = starters["xA_Share"][i] * fav_ast_odds
        else:
            starters["Proj_Ast"][i] = starters["xA_Share"][i] * notfav_ast_odds

    # Shot Shares
    starters["Team_Shot_Proj"] = 0.1
    starters["Team_SoT_Proj"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_Shot_Proj"][i] = fav_team_shot_proj
            starters["Team_SoT_Proj"][i] = fav_team_SoT_proj
        else:
            starters["Team_Shot_Proj"][i] = notfav_team_shot_proj
            starters["Team_SoT_Proj"][i] = notfav_team_SoT_proj
    fav_shots = sum(starters[starters["Team"] == fav_abbrev]["Sh"])
    notfav_shots = sum(starters[starters["Team"] == notfav_abbrev]["Sh"])
    starters["Team_Shots"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_Shots"][i] = fav_shots
        else:
            starters["Team_Shots"][i] = notfav_shots
    starters["Shot_Share"] = starters["Sh"] / starters["Team_Shots"]
    starters["Proj_Shots"] = starters["Shot_Share"] * starters["Team_Shot_Proj"]
    starters["SoT%"] = starters["SoT"] / starters["Sh"]
    starters["Proj_SoT"] = starters["Proj_Shots"] * starters["SoT%"]

    # Key Passes
    fav_KP_ratio = sum(starters[starters["Team"] == fav_abbrev]["KP"]) / sum(starters[starters["Team"] == fav_abbrev]["Sh"])
    notfav_KP_ratio = sum(starters[starters["Team"] == notfav_abbrev]["KP"]) / sum(starters[starters["Team"] == notfav_abbrev]["Sh"])
    fav_KP = sum(starters[starters["Team"] == fav_abbrev]["KP"])
    notfav_KP = sum(starters[starters["Team"] == notfav_abbrev]["KP"])
    starters["Team_KP"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_KP"][i] = fav_KP
        else:
            starters["Team_KP"][i] = notfav_KP
    starters["KP_Share"] = starters["KP"] / starters["Team_KP"]
    starters["Proj_KP"] = starters["KP_Share"]*(starters["Team_Shot_Proj"]*(starters["Team_KP"]/starters["Team_Shots"]))

    # Fill NA's with zeros for later arithmetic
    starters = starters.fillna(0)

    # Pts_w_StartMins
    odds_avg = (fav_odds + notfav_odds)/2
    starters["Pts_w_StartMins"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Starting"][i] == "y"):
            starters["Pts_w_StartMins"][i] = starters["Proj_Gls"][i]*10 +                             \
                                             starters["Proj_Ast"][i]*6 +                              \
                                             starters["Proj_Shots"][i] + starters["Proj_SoT"][i] +    \
                                             starters["Proj_KP"][i] +                                 \
                                             (starters["Crs"][i]*0.7 + starters["Pass_Cmp"][i]*0.02 + \
                                              starters["Fld"][i] - starters["Fls"][i]*0.5 +           \
                                              starters["TklW"][i] + starters["Int"][i]*0.5 -          \
                                              starters["CrdY"][i]*1.5)/starters["90s"][i] *           \
                                             (starters["Start_Mins"][i]/90)
        else:
            starters["Pts_w_StartMins"][i] = starters["Proj_Gls"][i]*10 +                             \
                                             starters["Proj_Ast"][i]*6 +                              \
                                             starters["Proj_Shots"][i] + starters["Proj_SoT"][i] +    \
                                             starters["Proj_KP"][i] +                                 \
                                             (starters["Crs"][i]*0.7 + starters["Pass_Cmp"][i]*0.02 + \
                                              starters["Fld"][i] - starters["Fls"][i]*0.5 +           \
                                              starters["TklW"][i] + starters["Int"][i]*0.5 -          \
                                              starters["CrdY"][i]*1.5)/starters["90s"][i] *           \
                                             (starters["Sub_Mins"][i]/90)

    # Add Odds and Goal Odds Columns
    starters["Team_Odds"] = 0.1
    starters["Team_Goal_Odds"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_Odds"][i] = fav_odds
            starters["Team_Goal_Odds"][i] = fav_goal_odds
        else:
            starters["Team_Odds"][i] = notfav_odds
            starters["Team_Goal_Odds"][i] = notfav_goal_odds

    starters["Player_"] = starters["Player"]
    starters = starters.sort_values(by="Pts_w_StartMins", ascending=False).reset_index().drop(columns=["index"])

    cols = ["Player", "Team", "Pos", "Salary", "90s", "Gls", "Ast", "Sh", "SoT", "Crs", "KP", "Pass_Cmp",
            "Fld", "Fls", "TklW", "Int", "CrdY", "Floor", "FPTS", "xG", "Team_xG", "xG_Share",
            "Proj_Gls", "xA", "Team_xA", "xA_Share", "Proj_Ast", "Team_Shot_Proj", "Team_SoT_Proj", "Team_Shots",
            "Shot_Share", "Proj_Shots", "SoT%", "Proj_SoT", "Team_KP", "KP_Share", "Proj_KP", 
            "Team_Odds", "Team_Goal_Odds", "Pts_w_StartMins", "Start_Mins", "Sub_Mins", "Starting", "Player_"]
    starters = starters[cols]

    starters.to_csv("Matchup_Spreadsheets/"+home_abbrev+"_"+away_abbrev+"_"+date+"_spreadsheet_finished.csv")

    return(starters)



def make_finished_spreadsheet_p90(starters, fav_abbrev, notfav_abbrev, fav_odds, notfav_odds, fav_goal_odds, notfav_goal_odds, fav_team_shot_proj, notfav_team_shot_proj, fav_team_SoT_proj, notfav_team_SoT_proj):
    starters["90s"] = starters["90s"].astype('float')
    for i in range(len(starters)):
        if starters["90s"][i] > 67.5:
            starters["90s"][i] = starters["90s"][i]/90
        else:
            starters["90s"][i] = 0.75
    starters["Floor"] = (starters["Sh"] + starters["SoT"] + starters["Crs"]*0.7 + starters["KP"] + starters["Pass_Cmp"]*0.02 + starters["Fld"] - starters["Fls"]*0.5 + starters["TklW"] + starters["Int"]*0.5) / starters["90s"]
    starters["FPTS"] = (starters["Gls"]*10 + starters["Ast"]*6 + starters["Sh"] + starters["SoT"] + starters["Crs"]*0.7 + starters["KP"] + starters["Pass_Cmp"]*0.02 + starters["Fld"] - starters["Fls"]*0.5 + starters["TklW"] + starters["Int"]*0.5 - starters["CrdY"]*1.5) / starters["90s"]

    
    # xG New - p90
    starters["xG"] = (starters["xG"]/starters["90s"])*(starters["Start_Mins"]/90)
    fav_xG = sum(starters[starters["Team"] == fav_abbrev]["xG"])
    notfav_xG = sum(starters[starters["Team"] == notfav_abbrev]["xG"])
    starters["Team_xG"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_xG"][i] = fav_xG
        else:
            starters["Team_xG"][i] = notfav_xG
    starters["xG_Share"] = starters["xG"] / starters["Team_xG"]
    starters["Proj_Gls"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Proj_Gls"][i] = starters["xG_Share"][i] * fav_goal_odds
        else:
            starters["Proj_Gls"][i] = starters["xG_Share"][i] * notfav_goal_odds

    # xA
    starters["xA"] = (starters["xA"]/starters["90s"])*(starters["Start_Mins"]/90)
    fav_GA_ratio = sum(starters[starters["Team"] == fav_abbrev]["xA"]) / sum(starters[starters["Team"] == fav_abbrev]["xG"])
    notfav_GA_ratio = sum(starters[starters["Team"] == notfav_abbrev]["xA"]) / sum(starters[starters["Team"] == notfav_abbrev]["xG"])
    fav_ast_odds = fav_goal_odds * fav_GA_ratio
    notfav_ast_odds = notfav_goal_odds * notfav_GA_ratio
    fav_xA = sum(starters[starters["Team"] == fav_abbrev]["xA"])
    notfav_xA = sum(starters[starters["Team"] == notfav_abbrev]["xA"])
    starters["Team_xA"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_xA"][i] = fav_xA
        else:
            starters["Team_xA"][i] = notfav_xA
    starters["xA_Share"] = starters["xA"] / starters["Team_xA"]
    starters["Proj_Ast"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Proj_Ast"][i] = starters["xA_Share"][i] * fav_ast_odds
        else:
            starters["Proj_Ast"][i] = starters["xA_Share"][i] * notfav_ast_odds

    # Shot Shares
    starters["Team_Shot_Proj"] = 0.1
    starters["Team_SoT_Proj"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_Shot_Proj"][i] = fav_team_shot_proj
            starters["Team_SoT_Proj"][i] = fav_team_SoT_proj
        else:
            starters["Team_Shot_Proj"][i] = notfav_team_shot_proj
            starters["Team_SoT_Proj"][i] = notfav_team_SoT_proj
    fav_shots = sum(starters[starters["Team"] == fav_abbrev]["Sh"])
    notfav_shots = sum(starters[starters["Team"] == notfav_abbrev]["Sh"])
    starters["Team_Shots"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_Shots"][i] = fav_shots
        else:
            starters["Team_Shots"][i] = notfav_shots
    starters["Shot_Share"] = starters["Sh"] / starters["Team_Shots"]
    starters["Proj_Shots"] = starters["Shot_Share"] * starters["Team_Shot_Proj"]
    starters["SoT%"] = starters["SoT"] / starters["Sh"]
    starters["Proj_SoT"] = starters["Proj_Shots"] * starters["SoT%"]

    # Key Passes
    fav_KP_ratio = sum(starters[starters["Team"] == fav_abbrev]["KP"]) / sum(starters[starters["Team"] == fav_abbrev]["Sh"])
    notfav_KP_ratio = sum(starters[starters["Team"] == notfav_abbrev]["KP"]) / sum(starters[starters["Team"] == notfav_abbrev]["Sh"])
    fav_KP = sum(starters[starters["Team"] == fav_abbrev]["KP"])
    notfav_KP = sum(starters[starters["Team"] == notfav_abbrev]["KP"])
    starters["Team_KP"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_KP"][i] = fav_KP
        else:
            starters["Team_KP"][i] = notfav_KP
    starters["KP_Share"] = starters["KP"] / starters["Team_KP"]
    starters["Proj_KP"] = starters["KP_Share"]*(starters["Team_Shot_Proj"]*(starters["Team_KP"]/starters["Team_Shots"]))

    # Fill NA's with zeros for later arithmetic
    starters = starters.fillna(0)

    # Pts_w_StartMins
    odds_avg = (fav_odds + notfav_odds)/2
    starters["Pts_w_StartMins"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Starting"][i] == "y"):
            starters["Pts_w_StartMins"][i] = starters["Proj_Gls"][i]*10 +                             \
                                             starters["Proj_Ast"][i]*6 +                              \
                                             starters["Proj_Shots"][i] + starters["Proj_SoT"][i] +    \
                                             starters["Proj_KP"][i] +                                 \
                                             (starters["Crs"][i]*0.7 + starters["Pass_Cmp"][i]*0.02 + \
                                              starters["Fld"][i] - starters["Fls"][i]*0.5 +           \
                                              starters["TklW"][i] + starters["Int"][i]*0.5 -          \
                                              starters["CrdY"][i]*1.5)/starters["90s"][i] *           \
                                             (starters["Start_Mins"][i]/90)
        else:
            starters["Pts_w_StartMins"][i] = starters["Proj_Gls"][i]*10 +                             \
                                             starters["Proj_Ast"][i]*6 +                              \
                                             starters["Proj_Shots"][i] + starters["Proj_SoT"][i] +    \
                                             starters["Proj_KP"][i] +                                 \
                                             (starters["Crs"][i]*0.7 + starters["Pass_Cmp"][i]*0.02 + \
                                              starters["Fld"][i] - starters["Fls"][i]*0.5 +           \
                                              starters["TklW"][i] + starters["Int"][i]*0.5 -          \
                                              starters["CrdY"][i]*1.5)/starters["90s"][i] *           \
                                             (starters["Sub_Mins"][i]/90)

    # Add Odds and Goal Odds Columns
    starters["Team_Odds"] = 0.1
    starters["Team_Goal_Odds"] = 0.1
    for i in range(len(starters["Player"])):
        if (starters["Team"][i] == fav_abbrev):
            starters["Team_Odds"][i] = fav_odds
            starters["Team_Goal_Odds"][i] = fav_goal_odds
        else:
            starters["Team_Odds"][i] = notfav_odds
            starters["Team_Goal_Odds"][i] = notfav_goal_odds

    starters["Player_"] = starters["Player"]
    starters = starters.sort_values(by="Pts_w_StartMins", ascending=False).reset_index().drop(columns=["index"])

    cols = ["Player", "Team", "Pos", "Salary", "90s", "Gls", "Ast", "Sh", "SoT", "Crs", "KP", "Pass_Cmp",
            "Fld", "Fls", "TklW", "Int", "CrdY", "Floor", "FPTS", "xG", "Team_xG", "xG_Share",
            "Proj_Gls", "xA", "Team_xA", "xA_Share", "Proj_Ast", "Team_Shot_Proj", "Team_SoT_Proj", "Team_Shots",
            "Shot_Share", "Proj_Shots", "SoT%", "Proj_SoT", "Team_KP", "KP_Share", "Proj_KP", 
            "Team_Odds", "Team_Goal_Odds", "Pts_w_StartMins", "Start_Mins", "Sub_Mins", "Starting", "Player_"]
    starters = starters[cols]

    starters.to_csv("Matchup_Spreadsheets/"+home_abbrev+"_"+away_abbrev+"_"+date+"_spreadsheet_finished.csv")

    return(starters)

### Scrape the last five games and write out the basic stats

In [13]:
home_full = "Seattle" # Enter the full name of the teams, as they appear in the shotlog.csv files
away_full = "Dallas"

home_abbrev = "SEA"
away_abbrev = "DAL"
date = "12_1_20"

# put in the url of the last 5 matches played (in the league) for the home and away teams, and indicate if they were home or away
home_urls = ["https://fbref.com/en/matches/2495f3d0/Vancouver-Whitecaps-FC-Seattle-Sounders-FC-October-27-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/ec219387/Colorado-Rapids-Seattle-Sounders-FC-November-1-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/382f0485/LA-Galaxy-Seattle-Sounders-FC-November-4-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/05e6cbca/Seattle-Sounders-FC-San-Jose-Earthquakes-November-8-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/1815aed8/Seattle-Sounders-FC-Los-Angeles-FC-November-24-2020-Major-League-Soccer"]

home_homeoraways = ["away", "away", "away", "home", "home"]

away_urls = ["https://fbref.com/en/matches/5b7f1d1c/FC-Dallas-Inter-Miami-October-28-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/3f432543/FC-Dallas-Houston-Dynamo-October-31-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/64a3cd9d/Nashville-SC-FC-Dallas-November-4-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/5768477b/Minnesota-United-FC-Dallas-November-8-2020-Major-League-Soccer",
             "https://fbref.com/en/matches/777307bd/Portland-Timbers-FC-Dallas-November-22-2020-Major-League-Soccer"]

away_homeoraways = ["home", "home", "away", "away", "away"]

# alright now you're good to run the cell
##################################################################################################################

def collect_team_stats(urls, homeoraways, team_abbrev):
    """Scrape every listed match and add up each player's stats across them.

    ``urls`` and ``homeoraways`` are parallel lists: the match page and the
    side ("home"/"away") the team played on in that match. Returns one row per
    player (in order of first appearance) holding the column-wise sum over all
    matches, plus a ``Team`` column set to ``team_abbrev``.
    """
    if len(urls) != len(homeoraways):
        raise ValueError("need exactly one home/away flag per match url")
    per_match = [scrapeURL(url, side) for url, side in zip(urls, homeoraways)]
    # concat + groupby replaces the cell-by-cell accumulation loop, which used
    # chained assignment and DataFrame.append (removed in pandas 2.0).
    totals = pd.concat(per_match).groupby(level=0, sort=False).sum()
    totals["Team"] = team_abbrev
    return totals


home_stats = collect_team_stats(home_urls, home_homeoraways, home_abbrev)
away_stats = collect_team_stats(away_urls, away_homeoraways, away_abbrev)

full_stats = pd.concat([home_stats, away_stats])
# Placeholder columns to be filled in by hand in the exported spreadsheet.
for manual_column in ["Pos", "Salary", "Start_Mins", "Sub_Mins", "Starting"]:
    full_stats[manual_column] = "DELETE"
full_stats = full_stats.drop(labels="shirtnumber", axis=1)
full_stats = full_stats.rename(columns={"minutes":"90s",
                                        "goals":"Gls",
                                        "assists":"Ast",
                                        "shots_total":"Sh",
                                        "shots_on_target":"SoT",
                                        "crosses":"Crs",
                                        "assisted_shots":"KP",
                                        "passes_completed":"Pass_Cmp",
                                        "fouled":"Fld",
                                        "fouls":"Fls",
                                        "tackles_won":"TklW",
                                        "interceptions":"Int",
                                        "cards_yellow":"CrdY",
                                        "xg":"xG",
                                        "xa":"xA"})

cols = ["Team", "Pos", "Salary", "90s", "Gls", "Ast", "Sh", "SoT", "Crs", "KP", "Pass_Cmp", "Fld", "Fls",
        "TklW", "Int", "CrdY", "xG", "xA", "Start_Mins", "Sub_Mins", "Starting"]
full_stats = full_stats[cols]

full_stats.to_csv("Matchup_Spreadsheets/"+home_abbrev+"_"+away_abbrev+"_"+date+"_spreadsheet_unfilled.csv")



### Get the shot and SoT projections using linear regression

In [14]:
## Run this cell to get the shot and SoT projections
## Fill in the league to get the right path to the shotlog file
#################################################################################################################

league = "MLS"  # MLS, PremierLeague, Bundesliga, LaLiga, Ligue1, SerieA

#################################################################################################################
filename = "Shotlogs/" + league + "_shotlog.csv"

FEATURE_COLUMNS = ["Win_Odds", "FiveThirtyEight_Odds",
                   "Odds_GF", "OddsShark_GF",
                   "Opp_CS_Odds_INV", "Opp_Odds_INV", "Opp_FiveThirtyEight_Odds_INV"]


def fit_projection(filename, target, played_col=None):
    """Fit a linear regression of ``target`` on the odds features.

    Rows with ``played_col > 0`` form the training set (``played_col`` defaults
    to ``target``); rows where ``played_col`` is missing but ``Win_Odds`` is
    present are the upcoming fixtures. Training rows whose ``target`` lies above
    Q3 + 1.5*IQR are dropped before fitting.

    Returns ``(model, in_sample_r2, training_frame, upcoming_frame)``.
    """
    shotlog = pd.read_csv(filename)
    upcoming = shotlog[shotlog[played_col if played_col else target].isna() & shotlog["Win_Odds"].notna()]
    history = shotlog[shotlog[played_col if played_col else target] > 0]

    # Only high outliers are removed. A single mask drops the same rows as the
    # previous repeated-idxmax loop.
    cutoff_high = np.quantile(np.array(history[target]), .75) + stats.iqr(history[target])*1.5
    history = history[~(history[target] > cutoff_high)].reset_index(drop=True)

    X = np.array(history[FEATURE_COLUMNS])
    y = np.array(history[target])
    # `normalize=True` was removed in scikit-learn 1.2. For ordinary least
    # squares it only rescaled features internally and does not change the
    # fitted predictions or R^2, so it is simply dropped.
    model = LinearRegression()
    model.fit(X, y)
    return model, model.score(X, y), history, upcoming


def team_rows(upcoming, team_full):
    """Return the upcoming-fixture rows for ``team_full``, failing loudly if there are none."""
    rows = upcoming[upcoming["Team"] == team_full]
    if rows.empty:
        raise ValueError("No upcoming fixture with odds found for " + repr(team_full) + " in " + filename)
    return rows


# Named for MLS shot projections, but works for any league
shot_model, shot_score, _, futures = fit_projection(filename, "Real_Shots")
print("Shot Regression Score:", shot_score)

# Scalars (not one-row Series) so they can be used directly as team-level odds.
home_odds = team_rows(futures, home_full)["Win_Odds"].iloc[0]
away_odds = team_rows(futures, away_full)["Win_Odds"].iloc[0]

# Get the shot predictions for both teams
home_shot_pred = shot_model.predict(np.array(team_rows(futures, home_full)[FEATURE_COLUMNS]))
away_shot_pred = shot_model.predict(np.array(team_rows(futures, away_full)[FEATURE_COLUMNS]))

print(home_full, " Shots: ", home_shot_pred[0])
print(away_full, " Shots: ", away_shot_pred[0])

## LETS DO IT AGAIN FOR SoT !!!!!
sot_model, sot_score, _, futures = fit_projection(filename, "Real_SoT")
print("\n")
print("SoT Regression Score:", sot_score)

home_sot_pred = sot_model.predict(np.array(team_rows(futures, home_full)[FEATURE_COLUMNS]))
away_sot_pred = sot_model.predict(np.array(team_rows(futures, away_full)[FEATURE_COLUMNS]))

print(home_full, " SoT: ", home_sot_pred[0])
print(away_full, " SoT: ", away_sot_pred[0])

## ONE MORE TIME FOR GOALS !!!!!
# NOTE(review): played/upcoming rows are still selected on Real_SoT here, as in
# the original cell -- presumably so that scoreless matches stay in the
# training set. Confirm this is intended rather than a copy/paste leftover.
linear_regression, linear_regression_score, mls_data, futures = fit_projection(
    filename, "Real_Goals", played_col="Real_SoT")

# Now see if the average of Odds_GF and OddsShark_GF regresses better
# NOTE(review): GoalAvg is a linear combination of two features of the full
# model, so the full model's in-sample R^2 can never be lower; the else-branch
# below is effectively only reachable through rounding. A held-out comparison
# would be needed for this check to be informative.
mls_data["GoalAvg"] = (mls_data["Odds_GF"] + mls_data["OddsShark_GF"])/2
X = np.array(mls_data[["GoalAvg"]])
y = np.array(mls_data["Real_Goals"])
linear_regression_goalavg = LinearRegression()
linear_regression_goalavg.fit(X,y)
linear_regression_goalavg_score = linear_regression_goalavg.score(X,y)

home_future = team_rows(futures, home_full)
away_future = team_rows(futures, away_full)

if (linear_regression_score > linear_regression_goalavg_score):
    # Get the goal predictions for both teams
    home_goals = linear_regression.predict(np.array(home_future[FEATURE_COLUMNS]))[0]
    away_goals = linear_regression.predict(np.array(away_future[FEATURE_COLUMNS]))[0]
    print("\n")
    print("Full regression performed better")
    print("Goals Regression Score:", linear_regression_score)
    print(home_full, " Goals: ", home_goals)
    print(away_full, "Goals: ", away_goals)
else:
    # Take scalars here too, so both branches hand the same type downstream.
    home_goals = ((home_future["Odds_GF"] + home_future["OddsShark_GF"])/2).iloc[0]
    away_goals = ((away_future["Odds_GF"] + away_future["OddsShark_GF"])/2).iloc[0]
    print("\n")
    print("Goal Odds Avg performed better")
    print("Goals Average Score:", linear_regression_goalavg_score)
    print(home_full, " Goals: ", home_goals)
    print(away_full, "Goals: ", away_goals)

Shot Regression Score: 0.15868321480914993
Seattle  Shots:  17.31608552390901
Dallas  Shots:  12.687469755680777


SoT Regression Score: 0.10467942797722296
Seattle  SoT:  6.327951094624556
Dallas  SoT:  3.9603722331453066


Full regression performed better
Goals Regression Score: 0.06285149543637591
Seattle  Goals:  1.8510940266611446
Dallas Goals:  1.11967505044709


### Put the spreadsheet w/ start minutes back in, and get FPTS projections 

In [15]:
## Fill out the resulting .csv to plug back in
## Fill out Pos and Salary from the DraftKings Contest, as well as Start_Mins and Sub_Mins from FBRef, and Starting from SofaScore or the team's Twitter
## rename file by taking off the "_unfilled"
## so it is not overwritten if an earlier cell is accidentally run
#################################################################################################################

spreadsheet_path = "Matchup_Spreadsheets/"+home_abbrev+"_"+away_abbrev+"_"+date+"_spreadsheet.csv"
try:
    starters = pd.read_csv(spreadsheet_path)
except FileNotFoundError as err:
    # Same exception type as before, but tell the user which manual step is missing.
    raise FileNotFoundError(
        spreadsheet_path + " does not exist: fill out the \"_unfilled\" spreadsheet and "
        "rename it without the \"_unfilled\" suffix before running this cell"
    ) from err

# Keep only rows with a value in "Starting", and drop goalkeepers:
# these projections really only work for field player, GKs are a different beast
has_starting_value = starters["Starting"].notna()
is_field_player = starters["Pos"] != "GK"
# One filter + one renumbering to 0..n-1 (replaces two filter/reset_index/drop passes)
starters = starters[has_starting_value & is_field_player].reset_index(drop=True)

#################################################################################################################

starters = make_finished_spreadsheet_p90(starters, home_abbrev, away_abbrev, home_odds, away_odds, home_goals, away_goals,
                                         home_shot_pred[0], away_shot_pred[0], home_sot_pred[0], away_sot_pred[0])
starters

FileNotFoundError: [Errno 2] File Matchup_Spreadsheets/SEA_DAL_12_1_20_spreadsheet.csv does not exist: 'Matchup_Spreadsheets/SEA_DAL_12_1_20_spreadsheet.csv'

### Optimizer to get best possible lineups

In [None]:
# Brute-force lineup search over `starters`: one captain (salary and points both
# multiplied by 1.5) plus five FLEX players. Every squad that fits under the salary
# cap, uses players from more than one team and beats the points threshold is printed.

SALARY_CAP = 50000      # squads priced above this are skipped
FPPG_THRESHOLD = 82     # User change this number in order to get top 1, 3, 5, etc. lineups outputted
NUM_FLEX = 5            # non-captain slots per squad
# Row positions of `starters` tried as captain. range(1) keeps the existing behaviour
# (only the first row is ever captain); use range(len(starters)) to try every player.
captain_candidates = range(1)

# Placeholders that are never read in this cell; kept so the cell defines the same names as before.
layer_names = ["x", "x", "x", "x", "x", "x"]
player_indices = [0, 0, 0, 0, 0]
squad_FPTS = [0, 0, 0, 0, 0]
max_salary = 0

max_fppg = 0         # best projected points among the printed squads
ctr = 0              # number of (captain, flex combination) pairs examined
num_squads_ctr = 0   # number of squads printed

# Pull the columns out once as plain lists and address players by row position.
# This avoids a pandas lookup per player per squad, and avoids using float labels
# (the indices used to come from np.linspace) to index the Series.
salaries = starters["Salary"].tolist()
points = starters["Pts_w_StartMins"].tolist()
teams = starters["Team"].tolist()
players = starters["Player"].tolist()
num_players = len(starters)

for cpt_idx in captain_candidates:
    # combinations() is lazy, so the full list of squads is never held in memory
    for flex in itertools.combinations(range(num_players), NUM_FLEX):
        ctr += 1
        if cpt_idx in flex:  # the captain cannot also fill a FLEX slot
            continue

        # Accumulate left to right: captain first, then FLEX slots in order
        squad_price = salaries[cpt_idx] * 1.5
        for idx in flex:
            squad_price += salaries[idx]
        if squad_price > SALARY_CAP:  # just move on if it's too expensive
            continue

        # can't all be from the same team (checked for either team, not only the favorite)
        squad_teams = {teams[cpt_idx]} | {teams[idx] for idx in flex}
        if len(squad_teams) < 2:
            continue

        squad_fppg = points[cpt_idx] * 1.5
        for idx in flex:
            squad_fppg += points[idx]

        if squad_fppg > FPPG_THRESHOLD:
            if squad_fppg > max_fppg:
                max_fppg = squad_fppg
            print("Captain: ", players[cpt_idx])
            for slot, idx in enumerate(flex, start=1):
                print(f"FLEX{slot}:   ", players[idx])
            print("Squad Price: ", squad_price)
            print("Squad FPPG:  ", squad_fppg)
            print("\n\n")
            num_squads_ctr += 1

print(num_squads_ctr, "possible squads")

