# World Cup Prediction Model
### By: David Hoffman and Kyle Kolodziej

## Problem Definition

For this project we would like to take the most recent world cup data (including each team that is playing in the tournament and the different statistical evaluations for these teams) and use this data to predict a winner for the world cup.  The user will be able to change the teams and different statistical ratings for each team and the model will take this new data to predict a winenr.  

## Data Gathering

In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

# Code on how to get data extracted from all the teams for all years on both pages of FIFA's International rankings
data = pd.DataFrame(columns=['Year', 'Team', 'att', 'mid', 'def', 'ovr'])
version = 0
for fifa in range(5, 24):
    toFind = True
#     print("On fifa ", fifa)
    while toFind:
        version += 1
        for page in range(2):
            # https://www.fifaindex.com/teams/fifa22_527/?page=1&type=1
            webAddress = ""
            toCheck = "" # check matching version in responses title
            if fifa < 10:
                toCheck = '0' + str(fifa)
                webAddress = 'https://www.fifaindex.com/teams/fifa0' + str(fifa) + '_' + str(version) + '/?page=' + str(page+1)+'&type=1'
            elif fifa < 23:
                toCheck = str(fifa)
                webAddress = 'https://www.fifaindex.com/teams/fifa' + str(fifa) + '_' + str(version) + '/?page=' + str(page+1)+'&type=1'
            else:
                # Get current stats
                toCheck = str(fifa - 1)
                webAddress = 'https://www.fifaindex.com/teams/?page=' + str(page+1) + '&type=1'
            r = requests.get(webAddress)

            if r.status_code != 200:
                #print("Error getting web address")
                version += 1
                break
                
            soup = BeautifulSoup(r.content, 'html.parser')
            
            if toCheck not in str(soup.title):
#                 print("not in title!")
#                 print("Title: " + str(soup.title))
#                 print("Fifa: " + toCheck + "\n")
                break
            else:
                toFind = False
            s = soup.find('table', class_='table table-striped table-teams')
            content = s.find_all('td')
            
            year = 1999 + fifa # Need to update
            team, attRate, midRate, defRate, ovrRate  = '', None, None, None, None
            #textLines = []
            i = 0
            for line in content:
                
                if line.text != '' and line.text != 'International' and line.text != '\n\n' and line.text != "Men's National":
                    # textLines was keeping track of all lines parsed, useful for seeing which index corresponds with an attribute
                    # textLines.append(line)
                    #print("Line: \n" + str(line) + "\n")
                    
                    
                    # Assign value to respective variable
                    if i == 0:
                        team = line.text
                    elif i == 1:
                        attRate = line.text
                    elif i == 2:
                        midRate = line.text
                    elif i == 3:
                        defRate = line.text
                    elif i == 4:
                        ovrRate = line.text
                        df2 = pd.DataFrame({'Year': [year],
                                'Team': [team],
                                'att': [attRate],
                                'mid': [midRate],
                                'def': [defRate],
                                'ovr': [ovrRate]})
                        data = data.append(df2, ignore_index = True)

                    # Update i
                    # If i is at 5, reset to 0 as at the start of a new team in the content
                    i += 1
                    if i == 5:
                        i = 0

In [2]:
data.head(1)

Unnamed: 0,Year,Team,att,mid,def,ovr
0,2004,France,94,89,84,88


Nice got all the of the stats on each team from 2004

Fifa is missing some of the world cup teams so will need to impute them based off of world cup rankings
* impute rankings from https://www.fifa.com/fifa-world-ranking

In [3]:
def parseFifaRankData(points, page_source, year):
    # Function to parse a page on a Fifa Ranking page
    # Inserts the Team Name along with their Points and Overall Rank for the respective year into the data frame points
    # Points and Overall Rank will be normalized
    
    soup = BeautifulSoup(page_source, 'lxml')
    s = soup.find_all('tr', class_='fc-ranking-item-full_rankingTableFullRow__1nbp7')
    pointArr = []
    rankArr = []
    nameArr = []
    yearArr = []
    for line in s:
        # Each line contains the container for a team
        teamName = line.find('span', class_='d-none d-lg-block').text
        nameArr.append(teamName)
        teamPoints = line.find('div', class_="d-flex ff-mr-16").text
        pointArr.append(float(teamPoints))
        teamRank = line.find('h6', class_="ff-m-0").text
        rankArr.append(float(teamRank))
        yearArr.append(year)

    # Normalize points and rank then add to the data frame
    pointNorm = np.linalg.norm(pointArr)
    normPointArr = pointArr/pointNorm
    
    rankNorm = np.linalg.norm(rankArr)
    normRankArr = rankArr/rankNorm
    
    df2 = pd.DataFrame({'Year': yearArr,
                    'Team': nameArr,
                    'Points': normPointArr,
                    'Rank': normRankArr})
    points = points.append(df2, ignore_index = True)
    return points
        

In [4]:
# Need to use the driver: https://stackoverflow.com/questions/52687372/beautifulsoup-not-returning-complete-html-of-the-page
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import numpy as np

points = pd.DataFrame(columns=['Year', 'Team', 'Points', 'Rank'])

url = "https://www.fifa.com/fifa-world-ranking/men?dateId=id13603"
driver = webdriver.Chrome(executable_path=r"C:/Users/Kyle/Downloads/chromedriver_win32/chromedriver.exe")
driver.get(url)
time.sleep(3) #if you want to wait 3 seconds for the page to load
driver.find_element_by_xpath("//button[@id='onetrust-accept-btn-handler']").click()


for year in range(2004, 2023):
    # Get data of top 100 teams from 2004 to 2022 in February
    date = 'Feb ' + str(year)
    xPath = "//button[@class='ff-dropdown_dropupContentButton__3WmBL' and contains(.,'" + date + "')]"
    driver.find_element_by_xpath("//div[@class='ff-dropdown_dropup__3DoLH null ']").click()
    driver.find_element_by_xpath(xPath).click() # click to this year
    time.sleep(1) # wait for the page to load for 1 sec
    points = parseFifaRankData(points, driver.page_source, year)
    
    driver.find_element_by_xpath("//div[@aria-label='Go to Page 2']").click() # page 2
    time.sleep(1) # wait for the page to load for 1 sec
    points = parseFifaRankData(points, driver.page_source, year)


ModuleNotFoundError: No module named 'selenium'

In [5]:
points.head(1)

NameError: name 'points' is not defined

In [6]:
points.tail(1)

NameError: name 'points' is not defined

Nice got all the points and rank in there with the teams (with points and rank normalized by year)

## Align the Team Name's Syntax Between Points and Data

In [7]:
pointsTeams = points["Team"].unique()
dataTeams = data["Team"].unique()
pointsTeamsNotInData = np.setdiff1d(pointsTeams, dataTeams)
dataTeamsNotInPoints = np.setdiff1d(dataTeams, pointsTeams)

NameError: name 'points' is not defined

In [8]:
pointsTeamsNotInData

NameError: name 'pointsTeamsNotInData' is not defined

In [9]:
dataTeamsNotInPoints

NameError: name 'dataTeamsNotInPoints' is not defined

In [10]:
pointsTeams.sort()
pointsTeams

NameError: name 'pointsTeams' is not defined

In [11]:
dataTeams.sort()
dataTeams

NameError: name 'dataTeams' is not defined

Team names to change in data...
* Austria (National team) --> Austria
* Holland --> Netherlands
* Rep. Of Korea --> Korea Republic
* United States --> USA
* Republic Of Ireland --> Republic of Ireland

Team names to change in points...
* Korea DPR --> Korea Republic

In [12]:
data.loc[data["Team"] == "Austria (National team)", "Team"] = "Austria"
data.loc[data["Team"] == "Holland", "Team"] = "Netherlands"
data.loc[data["Team"] == "Rep. Of Korea", "Team"] = "Korea Republic"
data.loc[data["Team"] == "United States", "Team"] = "USA"
data.loc[data["Team"] == "Republic Of Ireland", "Team"] = "Republic of Ireland"
points.loc[points["Team"] == "Korea DPR", "Team"] = "Korea Republic"

NameError: name 'points' is not defined

## Impute Each Team's OVR

* Merge the data frames points and data on Year and Team
* Drop the NaN's
* Drop the columns Team, att, mid, def
* OHE the years

Now would be able to train/test split for imputing...
* y = ovr
* x = everything else (year OHE, points, rank)


In [13]:
dataToImpute = pd.merge(points, data, how='left', on=['Year','Team'])
dataToImpute = dataToImpute.dropna()
dataToImpute.head(5)

NameError: name 'points' is not defined

In [14]:
# Drop the columns
dataToImpute = dataToImpute.drop(columns=['Team', 'att', 'mid', 'def'])
dataToImpute.head(1)

NameError: name 'dataToImpute' is not defined

In [15]:
len(dataToImpute)

NameError: name 'dataToImpute' is not defined

In [16]:
ohe_data = pd.get_dummies(dataToImpute, columns = ['Year'])
ohe_data["Year"] = dataToImpute["Year"]
ohe_data.head(3)

NameError: name 'dataToImpute' is not defined

In [17]:
ohe_data.tail(3)

NameError: name 'ohe_data' is not defined

## Predict the Overall

Start with a train test split, stratifying on the years

In [18]:
from sklearn.model_selection import train_test_split
X = ohe_data.drop(columns=['ovr', 'Year'])
y = ohe_data['ovr']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=ohe_data.Year)

NameError: name 'ohe_data' is not defined

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lm = LinearRegression()
lm.fit(X_train,y_train)
lm_predictions = lm.predict(X_test)

lin_mse = mean_squared_error(y_test, lm_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

Not too bad, overall is off by a little over 3 on average with the linear regression model

Let's do a regression tree next

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
tree_predictions = tree_reg.predict(X_test)
tree_mse = mean_squared_error(y_test, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

Decision tree regressor performs a little worse than the linear regression

Now let's try a random forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
forest_predictions = forest_reg.predict(X_test)
forest_mse = mean_squared_error(y_test, forest_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

Random forest regressor performs even better...nice!

## Using Random Forest Regressor Model to Predict the Overall's

In [19]:
points.head(5)

NameError: name 'points' is not defined

In [20]:
imputedOveralls = points.copy()
imputedOveralls = pd.get_dummies(imputedOveralls, columns = ['Year'])

xImpute = imputedOveralls.drop(columns=['Team'])

overall_predictions = forest_reg.predict(xImpute)
overall_predictions = np.round(overall_predictions)
overall_predictions

NameError: name 'points' is not defined

In [21]:
points['ovr'] = overall_predictions.astype(int)
points.head(2)

NameError: name 'overall_predictions' is not defined

## Using LM To Predict Points Overall

In [22]:
imputedOveralls = points.copy()
imputedOveralls = pd.get_dummies(imputedOveralls, columns = ['Year'])

xImpute = imputedOveralls.drop(columns=['Team'])

overall_predictions = lm.predict(xImpute)
overall_predictions = np.round(overall_predictions)
overall_predictions

NameError: name 'points' is not defined

In [23]:
points['ovr'] = overall_predictions.astype(int)
points.head(2)

NameError: name 'overall_predictions' is not defined

## Get Previous International Soccer Game Data

Get previous game data

In [None]:
game = pd.read_csv("results.csv")
game["Year"] = game.date.str[:4].astype(str) # string splice first 4 letters of entire date column

game.head(5)

In [None]:
game = game.drop(columns=['date', 'city', 'country', 'neutral'])
game = game.where(game['Year'] >= '2004')
game = game.dropna()
game.head(5)

In [None]:
len(game)

In [None]:
game["tournament"].value_counts()

Friendlies take up almost a third of the data...going to try building models with and without friendlies included and see which one performs the best

In [None]:
game.head(1)

In [None]:
game['score'] = game['home_score'] - game['away_score']
game['outcome'] = None
game.loc[game["score"] == 0, "outcome"] = 0
game.loc[game["score"] > 0, "outcome"] = 1
game.loc[game["score"] < 0, "outcome"] = -1
game.head(5)

Predict if home team wins
outcome:
* 1 if home team wins
* 0 if draw
* -1 if away team wins




## Match Syntax of the Teams in Game Data with the Points and Data Teams

follow process of above

In [None]:
home = game["home_team"].unique()
away = game["away_team"].unique()
allGameTeams = list(set(home).union(set(away)))
allGameTeams.sort()
allGameTeams

look at teams in game that arent in data
of these teams, look at what teams aren't in points too

change up whatever possible to match them up
look into scraping another page or two?

In [None]:
pointsTeams = points["Team"].unique()
dataTeams = data["Team"].unique()
allGameTeamsNotInData = np.setdiff1d(allGameTeams, dataTeams)
gameTeamsNotInDataAndNotInPoints = np.setdiff1d(allGameTeamsNotInData, pointsTeams)

In [None]:
len(allGameTeamsNotInData)

In [None]:
dataTeams.sort()
dataTeams

In [None]:
allGameTeamsNotInData

In [None]:
pointsTeams.sort()
pointsTeams

In [None]:
gameTeamsNotInDataAndNotInPoints

Teams that need to be changed in game data
* Ivory Coast --> Côte d'Ivoire
* South Korea --> Korea Republic
* United States --> USA

Points Teams name changes
* IR Iran --> Iran

In [None]:
game.head(1)

In [None]:
game.loc[game["home_team"] == "Ivory Coast", "home_team"] = "Côte d'Ivoire"
game.loc[game["away_team"] == "Ivory Coast", "away_team"] = "Côte d'Ivoire"
game.loc[game["home_team"] == "South Korea", "home_team"] = "Korea Republic"
game.loc[game["away_team"] == "South Korea", "away_team"] = "Korea Republic"
game.loc[game["home_team"] == "United States", "home_team"] = "USA"
game.loc[game["away_team"] == "United States", "away_team"] = "USA"

In [None]:
game.loc[game["home_team"] == "Ivory Coast", "home_team"] = "Côte d'Ivoire"
game.loc[game["away_team"] == "Ivory Coast", "away_team"] = "Côte d'Ivoire"
game.loc[game["home_team"] == "South Korea", "home_team"] = "Korea Republic"
game.loc[game["away_team"] == "South Korea", "away_team"] = "Korea Republic"
game.loc[game["home_team"] == "United States", "home_team"] = "USA"
game.loc[game["away_team"] == "United States", "away_team"] = "USA"
points.loc[points["Team"] == "IR Iran", "Team"] = "Iran"

## Add in Home Team and Away Team Overall

In [None]:
game.head(1)

loop through game
for each team:
    check to see if there is a match in data for the team
    if not:
        check to see if there is a match in points
    if not in points and data:
        remove the row from game

In [None]:
data.head(1)

In [None]:
game = game.reset_index() # Need to reset after getting rid of games before 2004
game['home_att'] = None
game['home_mid'] = None
game['home_def'] = None
game['home_ovr'] = None
game['away_att'] = None
game['away_mid'] = None
game['away_def'] = None
game['away_ovr'] = None
indexesToRemove = [] # keep track of rows that need to be removed
for idx in range(len(game)):
    year = int(game.at[idx, 'Year'])
    homeTeam = game.at[idx, 'home_team']
    awayTeam = game.at[idx, 'away_team']
    dataHomeMatchIdx = np.where((data['Team'] == homeTeam) & (data['Year'] == year))
    if len(dataHomeMatchIdx[0]) == 0:
        # home team is not in data
        # check if home team is in points
        pointsHomeMatchIdx = np.where((points['Team'] == homeTeam) & (points['Year'] == year))
        if len(pointsHomeMatchIdx[0]) != 0:
            # home team is in points
            # check if away team exists in data first
            dataAwayMatchIdx = np.where((data['Team'] == awayTeam) & (data['Year'] == year))
            if len(dataAwayMatchIdx[0]) == 0:
                # home team in points
                # away team is not in data
                # check if away team in points
                pointsAwayMatchIdx = np.where((points['Team'] == awayTeam) & (points['Year'] == year))
                if len(pointsAwayMatchIdx[0]) == 0:
                    # away team not in either so remove
                    #print('remove 1')
                    # REMOVE HERE
                    game = game.drop(index=idx)
                else:
                    # away team is in points
                    # home team in points
                    attHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']
                    midHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']
                    defHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']
                    ovrHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']

                    attAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                    midAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                    defAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                    ovrAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                    
                    # UPDATE DATA FRAME
                    game.at[idx, 'home_att'] = attHome
                    game.at[idx, 'home_mid'] = midHome
                    game.at[idx, 'home_def'] = defHome
                    game.at[idx, 'home_ovr'] = ovrHome
                    game.at[idx, 'away_att'] = attAway
                    game.at[idx, 'away_mid'] = midAway
                    game.at[idx, 'away_def'] = defAway
                    game.at[idx, 'away_ovr'] = ovrAway
            else:
                # away team is in data
                # home team is in points
                
                attHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']
                midHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']
                defHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']
                ovrHome = points.at[pointsHomeMatchIdx[0][0], 'ovr']

                attAway = data.at[dataAwayMatchIdx[0][0], 'att']
                midAway = data.at[dataAwayMatchIdx[0][0], 'mid']
                defAway = data.at[dataAwayMatchIdx[0][0], 'def']
                ovrAway = data.at[dataAwayMatchIdx[0][0], 'ovr']

                # update the columns in the data frame
                game.at[idx, 'home_att'] = attHome
                game.at[idx, 'home_mid'] = midHome
                game.at[idx, 'home_def'] = defHome
                game.at[idx, 'home_ovr'] = ovrHome
                game.at[idx, 'away_att'] = attAway
                game.at[idx, 'away_mid'] = midAway
                game.at[idx, 'away_def'] = defAway
                game.at[idx, 'away_ovr'] = ovrAway
        else:
            # no home team match
            #print('remove 2')
            #print(homeTeam + ", " + awayTeam)
            game = game.drop(index=idx)
         
    else:
        # home team is in data
        # now see if away team is in there too
        dataAwayMatchIdx = np.where((data['Team'] == awayTeam) & (data['Year'] == year))
        if len(dataAwayMatchIdx[0]) == 0:
            # away team not in data
            # check if away team is in points
            pointsAwayMatchIdx = np.where((points['Team'] == awayTeam) & (points['Year'] == year))
            if len(pointsAwayMatchIdx[0]) == 0:
                # away team not in either so remove
                #print('remove 3')
                game = game.drop(index=idx)
            else:
                # home team in data
                # away team in points
                attHome = data.at[dataHomeMatchIdx[0][0], 'att']
                midHome = data.at[dataHomeMatchIdx[0][0], 'mid']
                defHome = data.at[dataHomeMatchIdx[0][0], 'def']
                ovrHome = data.at[dataHomeMatchIdx[0][0], 'ovr']
            
                attAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                midAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                defAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                ovrAway = points.at[pointsAwayMatchIdx[0][0], 'ovr']
                
                # Update data frame
                game.at[idx, 'home_att'] = attHome
                game.at[idx, 'home_mid'] = midHome
                game.at[idx, 'home_def'] = defHome
                game.at[idx, 'home_ovr'] = ovrHome
                game.at[idx, 'away_att'] = attAway
                game.at[idx, 'away_mid'] = midAway
                game.at[idx, 'away_def'] = defAway
                game.at[idx, 'away_ovr'] = ovrAway
                
                
            
        else:
            # both home and away in data
            attHome = data.at[dataHomeMatchIdx[0][0], 'att']
            midHome = data.at[dataHomeMatchIdx[0][0], 'mid']
            defHome = data.at[dataHomeMatchIdx[0][0], 'def']
            ovrHome = data.at[dataHomeMatchIdx[0][0], 'ovr']
            
            attAway = data.at[dataAwayMatchIdx[0][0], 'att']
            midAway = data.at[dataAwayMatchIdx[0][0], 'mid']
            defAway = data.at[dataAwayMatchIdx[0][0], 'def']
            ovrAway = data.at[dataAwayMatchIdx[0][0], 'ovr']
            
            # update the columns in the data frame
            game.at[idx, 'home_att'] = attHome
            game.at[idx, 'home_mid'] = midHome
            game.at[idx, 'home_def'] = defHome
            game.at[idx, 'home_ovr'] = ovrHome
            game.at[idx, 'away_att'] = attAway
            game.at[idx, 'away_mid'] = midAway
            game.at[idx, 'away_def'] = defAway
            game.at[idx, 'away_ovr'] = ovrAway
    

In [None]:
game.head(5)

In [None]:
game["home_att"] = pd.to_numeric(game["home_att"])
game["home_mid"] = pd.to_numeric(game["home_mid"])
game["home_def"] = pd.to_numeric(game["home_def"])
game["home_ovr"] = pd.to_numeric(game["home_ovr"])
game["away_att"] = pd.to_numeric(game["away_att"])
game["away_mid"] = pd.to_numeric(game["away_mid"])
game["away_def"] = pd.to_numeric(game["away_def"])
game["away_ovr"] = pd.to_numeric(game["away_ovr"])

In [None]:
# Now let's combine the home and away attributes to be home - away
game['att'] = game['home_att'] - game['away_att']
game['mid'] = game['home_mid'] - game['away_mid']
game['def'] = game['home_def'] - game['away_def']
game['ovr'] = game['home_ovr'] - game['away_ovr']
game.head(2)

In [None]:
game = game.drop(columns=['index','home_team','away_team','home_score'])

In [None]:
gameDrop = game.copy()
gameDrop = gameDrop.drop(columns=['index','home_team','away_team','home_score', 'away_score','score','home_att',
                                 'home_mid','home_def','home_ovr','away_att','away_mid','away_def','away_ovr' ])
gameDrop.head(2)

## Modeling with All Game Data (Includes Friendlies)

In [None]:
gameWithFriendly = gameDrop.copy()
gameWithFriendly = gameWithFriendly.drop(columns=['tournament'])
gameWithFriendly.head(2)

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split
X = gameWithFriendly.drop(columns=['outcome', 'Year'])
y = gameWithFriendly['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=gameWithFriendly.Year)

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lm = LinearRegression()
lm.fit(X_train,y_train)
lm_predictions = lm.predict(X_test)
lm_predictions[lm_predictions <= -.1] = -1
lm_predictions[lm_predictions >= .1] = 1
lm_predictions[(lm_predictions < .1) & (lm_predictions > -.1)] = 0
lm_predictions

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, lm_predictions)

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
tree_predictions = tree_reg.predict(X_test)
tree_predictions[tree_predictions <= -.1] = -1
tree_predictions[tree_predictions >= .1] = 1
tree_predictions[(tree_predictions < .1) & (tree_predictions > -.1)] = 0
accuracy_score(y_test, tree_predictions)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
forest_predictions = forest_reg.predict(X_test)
forest_predictions[forest_predictions <= -.1] = -1
forest_predictions[forest_predictions >= .1] = 1
forest_predictions[(forest_predictions < .1) & (forest_predictions > -.1)] = 0
accuracy_score(y_test, forest_predictions)

## Modeling without Friendlies

In [None]:
# do work here

## Simulating the 2022 World Cup

doing this one with friendly model

In [None]:
groupA = ['Qatar', 'Ecuador', 'Netherlands', 'Senegal']
groupB = ['England', 'Iran', 'USA', 'Wales'] # not sure w Wales
groupC = ['Argentina', 'Poland', 'Mexico', 'Saudi Arabia']
groupD = ['France', 'Denmark', 'Tunisia', 'Peru'] # Peru not sure
groupE = ['Spain', 'Germany', 'Japan', 'Costa Rica'] # Costa rica not sure
groupF = ['Belgium', 'Canada', 'Morocco', 'Croatia']
groupG = ['Brazil', 'Serbia', 'Switzerland', 'Cameroon']
groupH = ['Portugal', 'Ghana', 'Uruguay', 'Korea Republic']

worldCupTeams = groupA + groupB + groupC + groupD + groupE + groupF + groupG + groupH

# Left side
# A1 v B2 : C2 v D2
# E1 v F2 : G1 v H2

# Right side
# D1 v C2 : B1 v A2
# F1 v E2 : G2 v H1

worldCup = pd.DataFrame(columns = ['Team', 'Points', 'Group', 'att', 'mid', 'def', 'ovr'])
idx = 0
teamsNotFound = []
for team in groupA:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'A'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    idx += 1
    
for team in groupB:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'B'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    
    idx += 1
for team in groupC:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'C'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    
    idx += 1
    
for team in groupD:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'D'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    
    idx += 1
    
for team in groupE:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'E'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    idx += 1
    
for team in groupF:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'F'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    idx += 1
    
for team in groupG:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'G'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    idx += 1
    
for team in groupH:
    worldCup.at[idx, 'Team'] = team
    worldCup.at[idx, 'Points'] = 0
    worldCup.at[idx, 'Group'] = 'H'
    dataIdx = np.where((data['Team'] == team) & (data['Year'] == 2022))
    if len(dataIdx[0]) == 0:
        # team not in data, check points
        pointsIdx = np.where((points['Team'] == team) & (points['Year'] == 2022))
        if len(pointsIdx[0]) == 0:
            teamsNotFound.append(team)
            att = 80
            mid = 80
            defRate = 80
            ovr = 80
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
        else:
            # found in points
            att = points.at[pointsIdx[0][0], 'ovr']
            mid = points.at[pointsIdx[0][0], 'ovr']
            defRate = points.at[pointsIdx[0][0], 'ovr']
            ovr = points.at[pointsIdx[0][0], 'ovr']
            worldCup.at[idx, 'att'] = att
            worldCup.at[idx, 'mid'] = mid
            worldCup.at[idx, 'def'] = defRate
            worldCup.at[idx, 'ovr'] = ovr
    else:
        # found in data
        att = data.at[dataIdx[0][0], 'att']
        mid = data.at[dataIdx[0][0], 'mid']
        defRate = data.at[dataIdx[0][0], 'def']
        ovr = data.at[dataIdx[0][0], 'ovr']
        
        worldCup.at[idx, 'att'] = att
        worldCup.at[idx, 'mid'] = mid
        worldCup.at[idx, 'def'] = defRate
        worldCup.at[idx, 'ovr'] = ovr
    idx += 1

In [None]:
teamsNotFound

In [None]:
worldCup.head(5)

In [None]:
tester = worldCup[worldCup['Team'] == 'Qatar']
tester

In [None]:
type(tester['att'])

In [None]:
worldCup["att"] = pd.to_numeric(worldCup["att"])
worldCup["mid"] = pd.to_numeric(worldCup["mid"])
worldCup["def"] = pd.to_numeric(worldCup["def"])
worldCup["ovr"] = pd.to_numeric(worldCup["ovr"])

In [None]:
worldCup[worldCup['Team'] == 'Qatar']['Points'] = 3 + worldCup[worldCup['Team'] == 'Qatar']['Points']
worldCup.head(5)

In [None]:
worldCup.at[team1WC.index[0], 'Points'] = 0
worldCup.head(5)

In [None]:
groupsToRun = [groupA, groupB, groupC, groupD, groupE, groupF, groupG, groupH]
for group in groupsToRun:
    for i in range(len(group)-1):
        team1 = group[i]
        team1WC = worldCup[worldCup['Team'] == team1]
        predictor = pd.DataFrame(columns = ['att', 'mid', 'def', 'ovr'])
        for j in range(i + 1, len(group)):
            team2 = group[j]
            team2WC = worldCup[worldCup['Team'] == team2]

            predictor.at[0,'att'] = team1WC['att'].get_values()[0] - team2WC['att'].get_values()[0]
            predictor.at[0,'mid'] = team1WC['mid'].get_values()[0] - team2WC['mid'].get_values()[0]
            predictor.at[0,'def'] = team1WC['def'].get_values()[0] - team2WC['def'].get_values()[0]
            predictor.at[0,'ovr'] = team1WC['ovr'].get_values()[0] - team2WC['ovr'].get_values()[0]
            predictedVal = lm.predict(predictor)
            predictedVal[predictedVal <= -.1] = -1
            predictedVal[predictedVal >= .1] = 1
            predictedVal[(predictedVal < .1) & (predictedVal > -.1)] = 0
            if predictedVal == 1:
                # team 1 wins
                worldCup.at[team1WC.index[0], 'Points'] += 3
               # worldCup.at[team1WC.index[0], 'Points'] = 0
            elif predictedVal == 0:
                # draw
                worldCup.at[team1WC.index[0], 'Points'] += 1
                worldCup.at[team2WC.index[0], 'Points'] += 1
                #worldCup.at[team1WC.index[0], 'Points'] = 0
                #worldCup.at[team2WC.index[0], 'Points'] = 0
            else:
                # team 2 wins
                worldCup.at[team2WC.index[0], 'Points'] += 3
                #worldCup.at[team2WC.index[0], 'Points'] = 0

In [None]:
worldCup

In [None]:
# maybe just use points

In [None]:
points.tail(100)

## Data Preperation and EDA

In [None]:
# Drop data from before 2004 as don't have any metrics
import numpy as np
game = game.where(game['Year'] >= '2004')
game = game.dropna()
game.head(5)

In [None]:
game.info()

In [None]:
game["Year"].value_counts()

In [None]:
game["tournament"].value_counts()

## World Cup Bracket Breakdown

In [None]:
groupA = ['Qatar', 'Ecuador', 'Netherlands', 'Senegal']
groupB = ['England', 'Iran', 'USA', 'Wales'] # not sure w Wales
groupC = ['Argentina', 'Poland', 'Mexico', 'Saudi Arabia']
groupD = ['France', 'Denmark', 'Tunisia', 'Peru'] # Peru not sure
groupE = ['Spain', 'Germany', 'Japan', 'Costa Rica'] # Costa rica not sure
groupF = ['Belgium', 'Canada', 'Morocco', 'Croatia']
groupG = ['Brazil', 'Serbia', 'Switzerland', 'Cameroon']
groupH = ['Portugal', 'Ghana', 'Uruguay', 'Korea Republic']

worldCupTeams = groupA + groupB + groupC + groupD + groupE + groupF + groupG + groupH

# Left side
# A1 v B2 : C2 v D2
# E1 v F2 : G1 v H2

# Right side
# D1 v C2 : B1 v A2
# F1 v E2 : G2 v H1

In [None]:
worldCupTeams

## THIS ONE WORKS

In [None]:
# Need to use the driver: https://stackoverflow.com/questions/52687372/beautifulsoup-not-returning-complete-html-of-the-page
import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"C:/Users/Kyle/Downloads/chromedriver_win32/chromedriver.exe")
driver.get(url)
time.sleep(3) #if you want to wait 3 seconds for the page to load
#driver.find_element_by_link_text("I'm OK with that").click()
#driver.find_element_by_xpath('//button[normalize-space()=I'm OK with that]').click()
driver.find_element_by_xpath("//button[@id='onetrust-accept-btn-handler']").click()
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
#print(soup.prettify())

s = soup.find_all('tr', class_='fc-ranking-item-full_rankingTableFullRow__1nbp7')
# print(s)
rankedTeams = []

# for link in soup.findAll('tr', class_='fc-ranking-item-full_rankingTableFullRow__1nbp7'):
#         print(link.get('onclick'))
for line in s:
    # Each line contains the container for a team
    teamName = line.find('span', class_='d-none d-lg-block')
    teamName = teamName.text
    rankedTeams.append(teamName)
    teamPoints = line.find('div', class_="d-flex ff-mr-16")

#driver.find_elements_by_xpath('//*[@id="content"]/main/section[2]/div/div/div[2]/div/div/div/div/div[2]/div[2]').click()
# driver.find_element_by_css_selector()
#python_button.click()
driver.find_element_by_xpath("//div[@aria-label='Go to Page 2']").click()
page_source = driver.page_source
soup2 = BeautifulSoup(page_source, 'lxml')
#print(soup2.prettify())
s = soup2.find_all('tr', class_='fc-ranking-item-full_rankingTableFullRow__1nbp7')
for line in s:
    # Each line contains the container for a team
    teamName = line.find('span', class_='d-none d-lg-block')
    teamName = teamName.text
    rankedTeams.append(teamName)
    teamPoints = line.find('div', class_="d-flex ff-mr-16")

In [None]:
rankedTeams

## Current Work

In [None]:
game.head(5)

In [None]:
home = game["home_team"].unique()
away = game["away_team"].unique()
allTeams = list(set(home).union(set(away)))
allTeams.sort()
allTeams

In [None]:
len(allTeams)

In [None]:
missingTeams = set(worldCupTeams).difference(set(rankedTeams))
missingTeams

## Save the Model

In [None]:
import pickle
# open a file, where you ant to store the data
file = open('world_cup_model.pkl', 'wb')

# dump information to that file
pickle.dump(lm, file)

## References

[1] Deploying Machine Learning Model Using Heroku. https://datamahadev.com/deploying-machine-learning-model-using-heroku/

[2] Using Machine Learning to Simulate World Cup Matches. https://towardsdatascience.com/using-machine-learning-to-simulate-world-cup-matches-959e24d0731

[3] World Cup 2018 Prediction. https://www.kaggle.com/code/agostontorok/soccer-world-cup-2018-winner