# Creating Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
import asyncio
import json
import aiohttp
from understat import Understat
import nest_asyncio
nest_asyncio.apply()
#__import__('IPython').embed()

In [3]:
# Load the match table, discarding every row that has a missing value.
matches = pd.read_csv('soccer-spi/soccer-spi/spi_matches_2021_top5.csv').dropna()

In [4]:
# Load the global rankings table, discarding every row that has a missing value.
spi = pd.read_csv('soccer-spi/soccer-spi/spi_global_rankings.csv').dropna()

In [5]:
def getVal(team, val):
    """Return column `val` of the first `spi` row whose 'name' equals `team`.

    Raises IndexError when no row of `spi` has that name.
    """
    teamRows = spi[spi['name'] == team]
    columnValues = teamRows[val].values
    return columnValues[0]

In [6]:
def getAll(team):
    """Return the team's "spi", "off" and "def" values, in that order, as a list."""
    return [getVal(team, column) for column in ("spi", "off", "def")]

In [7]:
def getXGandG(team, season=2021):
    """Fetch a team's expected-goal and goal totals from Understat.

    Each total is the sum of one statistic over the five shot situations
    'OpenPlay', 'FromCorner', 'SetPiece', 'DirectFreekick' and 'Penalty'
    of the dictionary returned by ``Understat.get_team_stats``.

    Args:
        team: Team name, passed unchanged to ``get_team_stats``.
        season: Season passed to ``get_team_stats``. Defaults to 2021, the
            value that was previously hard-coded.

    Returns:
        A four-element list: [xG for, xG against, goals for, goals against].

    Raises:
        KeyError: If one of the five situations (or one of its 'xG', 'goals'
            or 'against' entries) is absent from the response.
    """
    situations = ('OpenPlay', 'FromCorner', 'SetPiece', 'DirectFreekick', 'Penalty')

    def addUp(entries, key):
        # Plain left-to-right addition seeded with the first value, so the
        # result is exactly what the chained `a + b + c + d + e` produced.
        values = [entry[key] for entry in entries]
        total = values[0]
        for value in values[1:]:
            total = total + value
        return total

    async def main():
        async with aiohttp.ClientSession() as session:
            understat = Understat(session)
            teamStats = await understat.get_team_stats(team, season)
            bySituation = teamStats['situation']
            # Figures for the team itself, one entry per situation ...
            forEntries = [bySituation[name] for name in situations]
            # ... and the nested 'against' figures for the same situations.
            againstEntries = [entry['against'] for entry in forEntries]
            return [
                addUp(forEntries, 'xG'),
                addUp(againstEntries, 'xG'),
                addUp(forEntries, 'goals'),
                addUp(againstEntries, 'goals'),
            ]

    # Drive the coroutine to completion on the current event loop
    # (nest_asyncio is applied at the top of the file).
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(main())
            

In [8]:
# Alphabetical list of every distinct name in the 'team1' column.
teamList = sorted(matches['team1'].unique().tolist())

# Per-team season totals keyed by team name: goals for / against and
# expected goals for / against.
gfDict, gaDict = {}, {}
xgfDict, xgaDict = {}, {}
for team in teamList:
    # getXGandG returns [xG for, xG against, goals for, goals against].
    xgFor, xgAgainst, goalsFor, goalsAgainst = getXGandG(team)
    xgfDict[team] = xgFor
    xgaDict[team] = xgAgainst
    gfDict[team] = goalsFor
    gaDict[team] = goalsAgainst

In [9]:
def getGamesPlayed(league):
    """Return the hard-coded number of games associated with `league`.

    Args:
        league: League name; must be one of the five names listed below.

    Returns:
        The integer stored for that league (34 for 'German Bundesliga',
        38 for the other four).

    Raises:
        KeyError: If `league` is not one of the listed names.
    """
    # Deliberately not called `dict`, which would shadow the builtin.
    gamesByLeague = {
        'French Ligue 1': 38,
        'Barclays Premier League': 38,
        'Italy Serie A': 38,
        'German Bundesliga': 34,
        'Spanish Primera Division': 38,
    }
    return gamesByLeague[league]

In [10]:
def createChromosomeList(matches):
    """Turn every row of `matches` into a 16-value feature row and a label.

    Columns are read by position: index 2 is handed to getGamesPlayed,
    indices 3 and 4 are the two team names, and indices 7 and 8 are compared
    to decide the label (presumably the two scores -- confirm against the
    CSV header).

    Each feature row is the first team's block followed by the second's:
        Name, SPI, Off, Def, avg xGF, avg xGA, avg GF, avg GA
    where every "avg" is a season total divided by getGamesPlayed(...).

    Returns:
        (feature rows, labels). A label is "H" when the index-7 value is
        larger, "A" when the index-8 value is larger, and "D" otherwise.
    """
    def teamBlock(name, ratings, gamesPlayed):
        # Eight values describing one team.
        return [
            name,
            ratings[0],
            ratings[1],
            ratings[2],
            xgfDict[name] / gamesPlayed,
            xgaDict[name] / gamesPlayed,
            gfDict[name] / gamesPlayed,
            gaDict[name] / gamesPlayed,
        ]

    featureRows = []
    outcomes = []
    for row in matches.values.tolist():
        firstTeam, secondTeam = row[3], row[4]
        firstRatings = getAll(firstTeam)
        secondRatings = getAll(secondTeam)
        gamesPlayed = getGamesPlayed(row[2])
        featureRows.append(
            teamBlock(firstTeam, firstRatings, gamesPlayed)
            + teamBlock(secondTeam, secondRatings, gamesPlayed)
        )

        # Label from the two values at indices 7 and 8.
        firstScore, secondScore = row[7], row[8]
        if firstScore > secondScore:
            outcome = "H"
        elif firstScore < secondScore:
            outcome = "A"
        else:
            outcome = "D"
        outcomes.append(outcome)
    return featureRows, outcomes

# Build one feature row and one result label for every row of `matches`.
chromosomeList, labels = createChromosomeList(matches)

In [11]:
import pickle
chromosomeFile = "matchesList.txt"
labelsFile = "labelList.txt"

# Pickle each object into its own file. The files are binary pickles
# despite the .txt extension.
for targetPath, payload in ((chromosomeFile, chromosomeList), (labelsFile, labels)):
    with open(targetPath, 'wb') as f:
        pickle.dump(payload, f)

In [14]:
# Write a human-readable, comma-separated copy of the feature rows,
# preceded by a header line naming the sixteen columns.
with open("matchListReadable.txt", 'w') as f:
    f.write("Team1, Team1 SPI, Team1 Off, Team1 Def, Team1 avg xGF, Team1 avg xGA, Team1 avg GF, Team1 avg GA, Team2, Team2 SPI, Team2 Off, Team2 Def, Team2 avg xGF, Team2 avg xGA, Team2 avg GF, Team2 avg GA\n")
    for row in chromosomeList:
        # Format each value with str() rather than stripping "[", "]" and "'"
        # from the list's repr: the repr of NumPy scalars can include a
        # type wrapper such as "np.float64(...)", and the stripping also
        # altered team names containing those characters.
        f.write(", ".join(str(value) for value in row) + "\n")