In [1]:
import pandas as pd
import numpy as np
from lxml import etree
import xml.etree.ElementTree as ET
import time
import math
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns # improves plot aesthetics

# Functions

In [2]:
class DataReader:
  """
  Reader for xml files provided by the DFB.

  The document is parsed once on construction; the root element is kept in
  ``self.xml_root`` and queried by :meth:`create_dataframe`.

  Args:
      xml_file: Path or file object accepted by ``xml.etree.ElementTree.parse``.
  """

  def __init__(self,xml_file):
    self.xml_root = self._load_data(xml_file)

  def _load_data(self,xml_file):
    """Parse ``xml_file`` and return the root element of the document."""
    return ET.parse(xml_file).getroot()

  def create_dataframe(self,df_cols=None,findall_string=".//Object",time_search=None):
    """
    Create a dataframe from the attributes of the matching xml nodes.

    Args:
        df_cols (list): Optional, the column names. If no names are given, the
          attribute names of the first matching node that has attributes are used.
        findall_string (str): The xpath string to iterate trees (https://www.w3schools.com/xml/xml_xpath.asp)
        time_search (float): Optional. When given, only nodes whose parent is an
          ``Event`` element are kept, and only if the parent's ``EventTime``
          (converted to seconds) is not larger than ``time_search``. I.e. this
          is a means of getting all passes before the frame time.

    Returns:
        pd.DataFrame: one row per selected node, one column per entry of
        ``df_cols``; attributes missing on a node are left empty.
    """
    populate_column_names = df_cols is None
    # Work on a copy so a caller-supplied list is never modified.
    df_cols = [] if df_cols is None else list(df_cols)

    parse_time = None
    if time_search is not None:
        # The original code called ciso8601 without importing it. Use it when
        # installed, otherwise fall back to the standard library parser.
        try:
            import ciso8601
            parse_time = ciso8601.parse_datetime
        except ImportError:
            from datetime import datetime
            parse_time = datetime.fromisoformat

    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    parent_map = {child: parent for parent in self.xml_root.iter() for child in parent}

    rows = []
    for node in self.xml_root.findall(findall_string):
        if time_search is not None:
            parent = parent_map.get(node)
            # The root has no parent; such nodes cannot belong to an Event.
            if parent is None or parent.tag != "Event":
                continue
            ts = parse_time(parent.attrib["EventTime"])
            # to get time in seconds:
            # NOTE(review): mktime interprets the wall-clock fields in the
            # machine's local time zone - confirm time_search is built the same way.
            seconds = time.mktime(ts.timetuple())
            # If timestamp is larger than the search, skip this node
            if seconds > time_search:
                continue

        # If column names are not provided, take them from the first node
        # that actually carries attributes.
        if populate_column_names and node.attrib:
            df_cols.extend(node.attrib.keys())
            populate_column_names = False

        rows.append({c: node.attrib.get(c) for c in df_cols})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row is quadratic.
    return pd.DataFrame(rows, columns=df_cols)


In [3]:
def get_tracking_data(xml_positions, half='firstHalf'):
    """Returns a dataframe with the tracking data of all players + the ball.

    One ``FrameSet`` element per person (or the ball) is turned into a block of
    columns; all blocks are joined side by side on the (N, T) row index.
    No rows are dropped here: persons without a frame for a given (N, T)
    simply have missing values in that row.

    +-------+-----------+-----------+
    |       |   BALL    |  TeamId   |
    +       +-----------+-----------+
    |       |  BallId   | PlayerId  |
    + N | T +---+---+---+---+---+---+
    |       | X | Y | Z | X | Y | A |
    +=======+===+===+===+===+===+===+
    | 0 | 0 |123|456|789|012|456|789|
    +-------+-----------+-----------+

    :param xml_positions: the parsed xml tracking data (lxml object)
    :param half: the description of the half: either "firstHalf" or "secondHalf", etc.
    :return: a dataframe containing the tracking data: position of all players and the ball
    :raises ValueError: if no FrameSet exists for the requested ``half``
    """
    # Column types shared by ball and player frames.
    # 'T' is deliberately not cast here: 'datetime64[ns]' breaks the timezone.
    common_dtypes = {
        'A': float,
        'D': float,
        'M': int,
        'N': int,
        'S': float,
        'X': float,
        'Y': float,
    }
    # Columns that only exist on the ball's frames.
    ball_only_dtypes = {
        'Z': float,
        'BallPossession': int,
        'BallStatus': int,
    }

    def change_type(df, frameset):
        """
        change type of columns
        different column names and types for the ball and for the rest of the players
        """
        dtypes = dict(common_dtypes)
        if frameset.attrib.get('TeamId') == 'BALL':
            dtypes.update(ball_only_dtypes)
        return df.astype(dtype=dtypes, errors='raise')

    def create_columns(df, frameset):
        """A utility function to create a multiindex columns"""
        person_id = frameset.attrib.get('PersonId')
        team_id = frameset.attrib.get('TeamId')
        df.columns = pd.MultiIndex.from_product(
            [[team_id], [person_id], df.columns],
            names=['TeamId', 'PersonId', 'Position']
        )
        return df

    framesets = xml_positions.xpath(F'//Positions/FrameSet[@GameSection = "{half}"]')
    if not framesets:
        raise ValueError(F'No FrameSet found for GameSection "{half}"')

    tracking_players = pd.concat([
        pd.DataFrame.from_records(
            # Iterating an element yields its children (getchildren() is deprecated).
            [dict(frame.attrib) for frame in frameset]
        ).pipe(change_type, frameset=frameset)
         .set_index(['N', 'T'])
         .pipe(create_columns, frameset=frameset)
        for frameset in framesets
    ], axis=1, sort=False)

    # Cast the 'T' level of the index to datetime with the correct time zone.
    # Convert the level values themselves (not the per-row values) so the
    # existing codes keep pointing at the right timestamps, and assign the
    # result back because set_levels(inplace=True) was removed in pandas 2.0.
    tracking_players.index = tracking_players.index.set_levels(
        pd.to_datetime(tracking_players.index.levels[1]), level=1
    )

    return tracking_players

In [4]:
def calculateDistance(x1,y1,x2,y2):
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2).

    If any of the four coordinates is NaN the result is NaN.
    """
    if any(np.isnan(coord) for coord in (x1, y1, x2, y2)):
        return float('NaN')
    delta_x = x2 - x1
    delta_y = y2 - y1
    return math.sqrt(delta_x**2 + delta_y**2)

In [15]:
def getdirectionofplay(teamid,half):
    """Return the direction of play of a team in the given half.

    Reads the tracking dataframe stored in the module-level ``matchinfo`` under
    ``matchinfo[half]["Dataframe"]`` and averages the X values of all persons
    of ``teamid`` in the first frame of that dataframe.

    Args:
        teamid: first-level column key of the team in the tracking dataframe.
        half: key of the half in ``matchinfo``.

    Returns:
        'left to right' if the mean X is negative, 'right to left' if positive.

    Raises:
        ValueError: if the mean X is exactly 0 or NaN, so no direction can be
            derived (previously this surfaced as an UnboundLocalError).
    """
    tracking = matchinfo.get(half)["Dataframe"]
    first_frame = tracking.index.get_level_values(0)[0]
    # Compute the mean once instead of repeating the whole lookup per branch.
    mean_x = tracking.loc[first_frame,(teamid,slice(None),"X")].mean(axis=1).iat[0]
    if mean_x < 0:
        return 'left to right'
    if mean_x > 0:
        return 'right to left'
    raise ValueError(
        "Cannot determine direction of play for team {!r} in half {!r}: "
        "mean X in the first frame is {}".format(teamid, half, mean_x)
    )

In [16]:
def getteamsheet(teamid,half):
    """Return the unique second-level column keys (person ids) of a team.

    Looks at the X columns of ``teamid`` in the first frame of the tracking
    dataframe stored under ``matchinfo[half]["Dataframe"]``.
    """
    tracking = matchinfo.get(half)["Dataframe"]
    first_frame = tracking.index.get_level_values(0)[0]
    team_x = tracking.loc[first_frame,(teamid,slice(None),"X")]
    return team_x.columns.get_level_values(level=1).unique()

# GER EST

In [5]:
# Directory that holds the match data files; file names are appended to it below.
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
path='C:/Users/Tim/Desktop/QIndex/data/19-06-12_Mainz_GER_EST Sportec Data/'    # private

In [6]:
# Load the raw positions XML (parsed with lxml) and the merged KPI spreadsheet.
# Both files live in the data directory `path`, so their locations are built
# from it instead of repeating the machine-specific directory literal.
xml_positions = etree.parse(path + 'DFL_04_03_positions_raw_observed_DFL-COM-000001_DFL-MAT-003BEU.xml')  # private
kpimerged = pd.read_excel(path + 'DFB_KPI_Merged.xlsx')  # private

In [7]:
# Rename the KPI frame-number column to 'N'; kpimerged itself is left untouched.
kpimergednew = kpimerged.rename(columns={'FRAME_NUMBER': 'N'})

In [8]:
# Create the tracking dataframe of the first half from the parsed xml input.
positions = get_tracking_data(xml_positions, half='firstHalf')

In [13]:
# Create the tracking dataframe of the second half from the same parsed xml input.
positions2 = get_tracking_data(xml_positions, half="secondHalf")

In [9]:
# First club: read its player masterdata file from the data directory and
# collect the ObjectId attribute of every <Object> node.
CUID1 = 'DFL-CLU-000N8Y'
cuid1_player_info = "DFL_01_05_masterdata_{}_DFL-SEA-0001K3_player (1).xml".format(CUID1)
cuid1_team_data = DataReader(path + cuid1_player_info)
cuid1_team_meta_df = cuid1_team_data.create_dataframe()
cuid1_player_IDs = cuid1_team_meta_df["ObjectId"].tolist()

In [10]:
# Second club: read its player masterdata file from the data directory and
# collect the ObjectId attribute of every <Object> node.
CUID2 = 'DFL-CLU-000N8Z'
cuid2_player_info = "DFL_01_05_masterdata_{}_DFL-SEA-0001K3_player (1).xml".format(CUID2)
cuid2_team_data = DataReader(path + cuid2_player_info)
cuid2_team_meta_df = cuid2_team_data.create_dataframe()
cuid2_player_IDs = cuid2_team_meta_df["ObjectId"].tolist()

In [14]:
# Minimal match lookup: per-team constants keyed by club id, and the tracking
# dataframe of each half keyed by 1 / 2. getdirectionofplay and getteamsheet
# read the "Dataframe" entries from this dict.
_team1_info = {"BallPoss": 1, "TeamID": CUID1}
_team2_info = {"BallPoss": 2, "TeamID": CUID2}
_first_half_info = {"Dataframe": positions}
_second_half_info = {"Dataframe": positions2}

matchinfo = {
    CUID1: _team1_info,
    CUID2: _team2_info,
    1: _first_half_info,
    2: _second_half_info,
}

In [17]:
# Extend the match lookup with direction of play, team sheets and frame ranges.
# The whole literal is evaluated before `matchinfo` is rebound, so the helper
# calls below still read the previously defined matchinfo[half]["Dataframe"].
matchinfo = {
    CUID1: {
        1:getdirectionofplay(CUID1,1),
        2:getdirectionofplay(CUID1,2),
        "BallPoss":1,
        "TeamID":CUID1,
    },
    CUID2: {
        1:getdirectionofplay(CUID2,1),
        2:getdirectionofplay(CUID2,2),
        "BallPoss":2,
        "TeamID":CUID2,
    },
    1: {
        "Dataframe":positions,
        CUID1:getteamsheet(CUID1,1),
        CUID2:getteamsheet(CUID2,1),
        "Startframe":positions.index.get_level_values(0)[0],
        "Endframe":positions.index.get_level_values(0)[-1],
    },
    2: {
        "Dataframe":positions2,
        CUID1:getteamsheet(CUID1,2),
        CUID2:getteamsheet(CUID2,2),
        "Startframe":positions2.index.get_level_values(0)[0],
        # Last frame of the SECOND half: the position used to be derived from
        # the first-half frame count (positions.shape[0]), which picks the wrong
        # frame or raises IndexError when the halves differ in length.
        "Endframe":positions2.index.get_level_values(0)[-1],
    },
    "left to right":{
        "Goalline":52.5
    },
    "right to left":{
        "Goalline":-52.5
    }
}

In [18]:
# All person ids that appear in either half, without the ball object.
players=positions.columns.get_level_values(level=1).unique()
players=players.append(positions2.columns.get_level_values(level=1).unique())
players=players.unique()
players=players.drop("DFL-OBJ-0000XT")

# Distance to ball ('DtB') for every first-half player, computed column-wise.
# This yields the same values as calling calculateDistance row by row (NaN in
# any coordinate propagates to NaN) but avoids one Python-level pass over all
# rows per player.
ball_x = positions['BALL','DFL-OBJ-0000XT','X']
ball_y = positions['BALL','DFL-OBJ-0000XT','Y']
for player in players:
    for team in (CUID1, CUID2):
        if player in matchinfo.get(1)[team]:
            delta_x = ball_x - positions[team,player,'X']
            delta_y = ball_y - positions[team,player,'Y']
            positions[team,player,'DtB'] = np.sqrt(delta_x**2 + delta_y**2)

In [19]:
# Distance to ball ('DtB') for every second-half player, computed column-wise.
# This yields the same values as calling calculateDistance row by row (NaN in
# any coordinate propagates to NaN) but avoids one Python-level pass over all
# rows per player.
ball_x2 = positions2['BALL','DFL-OBJ-0000XT','X']
ball_y2 = positions2['BALL','DFL-OBJ-0000XT','Y']
for player in players:
    for team in (CUID1, CUID2):
        if player in matchinfo.get(2)[team]:
            delta_x = ball_x2 - positions2[team,player,'X']
            delta_y = ball_y2 - positions2[team,player,'Y']
            positions2[team,player,'DtB'] = np.sqrt(delta_x**2 + delta_y**2)

In [20]:
# --- First half -------------------------------------------------------------
# Ball columns of the first half.
balldf = positions.loc[:, ("BALL", "DFL-OBJ-0000XT", slice(None))]

# CUID1: its columns, the persons found in them, and team + ball joined on the row index.
CUID1df = positions.loc[:, (CUID1, slice(None), slice(None))]
cuid1players = CUID1df.columns.get_level_values(level=1).unique()
cuid1balldf = CUID1df.merge(balldf, left_index=True, right_index=True)

# CUID2: its columns, the persons found in them, and team + ball joined on the row index.
CUID2df = positions.loc[:, (CUID2, slice(None), slice(None))]
cuid2players = CUID2df.columns.get_level_values(level=1).unique()
cuid2balldf = CUID2df.merge(balldf, left_index=True, right_index=True)

# --- Second half ------------------------------------------------------------
# Ball columns of the second half.
balldf2 = positions2.loc[:, ("BALL", "DFL-OBJ-0000XT", slice(None))]

# CUID1: its columns, the persons found in them, and team + ball joined on the row index.
CUID1df2 = positions2.loc[:, (CUID1, slice(None), slice(None))]
cuid1players2 = CUID1df2.columns.get_level_values(level=1).unique()
cuid1balldf2 = CUID1df2.merge(balldf2, left_index=True, right_index=True)

# CUID2: its columns, the persons found in them, and team + ball joined on the row index.
CUID2df2 = positions2.loc[:, (CUID2, slice(None), slice(None))]
cuid2players2 = CUID2df2.columns.get_level_values(level=1).unique()
cuid2balldf2 = CUID2df2.merge(balldf2, left_index=True, right_index=True)

In [21]:
# Final match lookup: adds the per-team team+ball frames and the per-half ball
# frames. The whole literal is evaluated before `matchinfo` is rebound, so the
# helper calls below still read the previously defined matchinfo[half]["Dataframe"].
matchinfo = {
    CUID1: {
        1:getdirectionofplay(CUID1,1),
        2:getdirectionofplay(CUID1,2),
        "BallPoss":1,
        "TeamID":CUID1,
        "TeamBallDf1":cuid1balldf,
        "TeamBallDf2":cuid1balldf2,
    },
    CUID2: {
        1:getdirectionofplay(CUID2,1),
        2:getdirectionofplay(CUID2,2),
        "BallPoss":2,
        "TeamID":CUID2,
        "TeamBallDf1":cuid2balldf,
        "TeamBallDf2":cuid2balldf2,
    },
    1: {
        "BallDf":balldf,
        "Dataframe":positions,
        CUID1:getteamsheet(CUID1,1),
        CUID2:getteamsheet(CUID2,1),
        "Startframe":positions.index.get_level_values(0)[0],
        "Endframe":positions.index.get_level_values(0)[-1],
    },
    2: {
        "BallDf":balldf2,
        "Dataframe":positions2,
        CUID1:getteamsheet(CUID1,2),
        CUID2:getteamsheet(CUID2,2),
        "Startframe":positions2.index.get_level_values(0)[0],
        # Last frame of the SECOND half: the position used to be derived from
        # the first-half frame count (positions.shape[0]), which picks the wrong
        # frame or raises IndexError when the halves differ in length.
        "Endframe":positions2.index.get_level_values(0)[-1],
    },
    "left to right":{
        "Goalline":52.5
    },
    "right to left":{
        "Goalline":-52.5
    }
}

In [22]:
# Display the KPI table in which FRAME_NUMBER has been renamed to N.
kpimergednew

Unnamed: 0,EVENT_ID,MUID,MATCH_DAY,PUID1,PUID2,SHIRT_NUMBER1,SHIRT_NUMBER2,NOMATCH,GDCP_EVENT_TIME,TRACKING_TIME,...,AssistTyp,DISTANCE,ANGLE,FREEKICK,FOOT,SPEED,PRESSURE_ON_SHOT,DEFENDERS,DISTANCE_GOALKEEPER,GK_INGOAL
0,10429827,DFL-MAT-003BEU,1,DFL-OBJ-0027G6,,20,0,0,2019-06-11 20:47:00.637,2019-06-11 20:47:01.920,...,,,,,,,,,,
1,10429828,DFL-MAT-003BEU,1,DFL-OBJ-0027G6,DFL-OBJ-000191,20,18,0,2019-06-11 20:47:00.637,2019-06-11 20:47:01.920,...,,,,,,,,,,
2,10429825,DFL-MAT-003BEU,1,DFL-OBJ-000191,DFL-OBJ-0002F5,18,6,0,2019-06-11 20:47:03.797,2019-06-11 20:47:03.280,...,,,,,,,,,,
3,10429826,DFL-MAT-003BEU,1,DFL-OBJ-0002F5,DFL-OBJ-0000OJ,6,21,0,2019-06-11 20:47:05.220,2019-06-11 20:47:04.680,...,,,,,,,,,,
4,10429819,DFL-MAT-003BEU,1,DFL-OBJ-0000OJ,DFL-OBJ-0002F5,21,6,0,2019-06-11 20:47:06.357,2019-06-11 20:47:05.640,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1718,10431248,DFL-MAT-003BEU,1,DFL-OBJ-0002F5,DFL-OBJ-0001F4,6,16,0,2019-06-11 22:34:08.070,2019-06-11 22:34:09.760,...,,,,,,,,,,
1719,10431249,DFL-MAT-003BEU,1,DFL-OBJ-0001F4,DFL-OBJ-0002F5,16,6,0,2019-06-11 22:34:11.663,2019-06-11 22:34:12.720,...,,,,,,,,,,
1720,10431246,DFL-MAT-003BEU,1,DFL-OBJ-0002F5,DFL-OBJ-0001F4,6,16,0,2019-06-11 22:34:13.343,2019-06-11 22:34:14.360,...,AssistShotAtGoal,23.5,17.3,0.0,1.0,11.4,0.711,3.0,1.9,1.0
1721,10429611,DFL-MAT-003BEU,1,DFL-OBJ-002FZX,,12,0,0,2019-06-11 22:34:13.343,2019-06-11 22:34:15.320,...,,,,,,,,,,
