### Importing libraries and reading the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from IPython.display import display

pd.options.display.max_columns = None

In [2]:
df = pd.read_csv('dataset/Features Dataset.csv')
df.head()

Unnamed: 0,Match,ID,Sport,Interviewee,Opponent,Sex,Date,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,Prev. Match,Confidence,Result,Final Result
0,BG_Box_AleksandarPovetkin_vs_AnthonyDjoshua_NO,0,Box,Aleksandar Povetkin,Anthony Djoshua,Male,22.09.2018,34-1,21-0,39,28,H,5,N,3,TKO 7 (12),NO
1,BG_Box_AndiRuiz_vs_AnthonyDjoshua_YES,1,Box,Andy Ruiz,Anthony Djoshua,Male,01.06.2019,32-1,22-0,29,29,H,5,N,4,TKO 7 (12),YES
2,BG_Box_AnthonyDjoshua_vs_AleksandarPovetkin_YES,2,Box,Anthony Djoshua,Aleksandar Povetkin,Male,22.09.2018,21-0,34-1,28,39,H,5,N,4,TKO 7 (12),YES
3,BG_Box_AnthonyDjoshua_vs_AndiRuiz_NO,3,Box,Anthony Djoshua,Andy Ruiz,Male,01.06.2019,22-0,32-1,29,29,H,5,N,4,TKO 7 (12),NO
4,BG_Box_AntonyDjoshua_vs_VladimirKlichko_YES,4,Box,Anthony Djoshua,Wladimir Klitschko,Male,29.04.2017,18-0,64-4,27,41,H,5,N,4,TKO 11 (12),YES


In [3]:
df.index = list(df.ID)

drop_list = ['Match', 'ID', 'Result']
df.drop(drop_list, axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Date,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,Prev. Match,Confidence,Final Result
0,Box,Aleksandar Povetkin,Anthony Djoshua,Male,22.09.2018,34-1,21-0,39,28,H,5,N,3,NO
1,Box,Andy Ruiz,Anthony Djoshua,Male,01.06.2019,32-1,22-0,29,29,H,5,N,4,YES
2,Box,Anthony Djoshua,Aleksandar Povetkin,Male,22.09.2018,21-0,34-1,28,39,H,5,N,4,YES
3,Box,Anthony Djoshua,Andy Ruiz,Male,01.06.2019,22-0,32-1,29,29,H,5,N,4,NO
4,Box,Anthony Djoshua,Wladimir Klitschko,Male,29.04.2017,18-0,64-4,27,41,H,5,N,4,YES


### Transforming and removing attributes

In [5]:
df["Date"] = pd.to_datetime(df["Date"] )
df.head()

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Date,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,Prev. Match,Confidence,Final Result
0,Box,Aleksandar Povetkin,Anthony Djoshua,Male,2018-09-22,34-1,21-0,39,28,H,5,N,3,NO
1,Box,Andy Ruiz,Anthony Djoshua,Male,2019-01-06,32-1,22-0,29,29,H,5,N,4,YES
2,Box,Anthony Djoshua,Aleksandar Povetkin,Male,2018-09-22,21-0,34-1,28,39,H,5,N,4,YES
3,Box,Anthony Djoshua,Andy Ruiz,Male,2019-01-06,22-0,32-1,29,29,H,5,N,4,NO
4,Box,Anthony Djoshua,Wladimir Klitschko,Male,2017-04-29,18-0,64-4,27,41,H,5,N,4,YES


In [6]:
column_date = df['Date']

df_date = pd.DataFrame({"year": column_date.dt.year,
                        "month": column_date.dt.month,
                        "day": column_date.dt.day,
                        "hour": column_date.dt.hour,
                        "dayofyear": column_date.dt.dayofyear,
                        "week": column_date.dt.week,
                        "weekofyear": column_date.dt.weekofyear,
                        "dayofweek": column_date.dt.dayofweek,
                        "weekday": column_date.dt.weekday,
                        "quarter": column_date.dt.quarter,
                       })

In [7]:
df_date.head()

Unnamed: 0,day,dayofweek,dayofyear,hour,month,quarter,week,weekday,weekofyear,year
0,22,5,265,0,9,3,38,5,38,2018
1,6,6,6,0,1,1,1,6,1,2019
2,22,5,265,0,9,3,38,5,38,2018
3,6,6,6,0,1,1,1,6,1,2019
4,29,5,119,0,4,2,17,5,17,2017


### Filling NaN values

In [8]:
def health_transform(v):
    if v == 'H':
        return 4
    if v == 'S':
        return 3
    if v == 'A':
        return 2
    if v == 'I':
        return 1
    return 4

In [9]:
def prev_match_transform(v):
    if v == 'W':
        return 2
    if v == 'L':
        return 1
    return 0

In [10]:
def rank_transform(v):
    v = v.replace('-', ' ')
    split = v.split(' ')
    wins = float(split[0])
    loses = float(split[1])
    loses = loses if loses > 0 else 0.1
    rank = wins / loses
    return rank

In [11]:
df['Sex'] = df['Sex'].map(lambda s: 1 if s == 'Male' else 0)
df['Health'] = df['Health'].apply(health_transform)
df['Prev. Match'] = df['Prev. Match'].apply(prev_match_transform)

In [12]:
df['Rank_T'] = 0

In [13]:
for index, row in df.iterrows():
    if row['Sport'] != 'Tennis':
        df.loc[(df.index==index), 'Rank_T'] = rank_transform(row['Rank'])
    else:
        df.loc[(df.index==index), 'Rank_T'] = float(row['Rank'])

In [14]:
df['Rank'] = df['Rank_T']

In [15]:
df['Rank_T'] = 0

In [16]:
for index, row in df.iterrows():
    if row['Sport'] != 'Tennis':
        df.loc[(df.index==index), 'Rank_T'] = rank_transform(row['Rank Opp.'])
    else:
        df.loc[(df.index==index), 'Rank_T'] = float(row['Rank Opp.'])

In [17]:
df['Rank Opp.'] = df['Rank_T']

In [18]:
def sport_transform(v):
    if v == 'Box':
        return 1
    if v == 'MMA':
        return 2
    if v == 'Tennis':
        return 3
    return None

In [19]:
df['Sport'] = df['Sport'].apply(sport_transform)

In [20]:
df.drop('Rank_T', axis=1, inplace=True)

In [21]:
def label_transform(v):
    if v == 'YES':
        return 1
    if v == 'NO':
        return 0
    return None

In [22]:
df['Final Result'] = df['Final Result'].apply(label_transform)

In [23]:
interviewee = set(df['Interviewee'].unique())
opponent = set(df['Opponent'].unique())

In [24]:
players = interviewee.union(opponent)
players

{'Aleksandar Povetkin',
 'Aleksandra Toncheva',
 'Andy Mury',
 'Andy Ruiz',
 'Anita Doganova',
 'Anthony Djoshua',
 'Atanas Mihaylov',
 'Ben Rothwell',
 'Blagoi Ivanov',
 'Bogdan Dinu',
 'Caroline Wozniacki',
 'Cvetana Pironkova',
 'Daniel Zlatkov',
 'David Gofen',
 'Dereck Chisora',
 'Deyan Topalski',
 'Dominic Thiem',
 'Fabio Fonini',
 'Francesco Pianeta',
 'Gael Montfis',
 'Grigor Dimitrov',
 'Grigor Saruhanian',
 'Hughie Fury',
 'Jerzy Janowicz',
 'Kevin Johnson',
 'Kubrat Pulev',
 'Leonardo Bruzzese',
 'Marcos Bagdatis',
 'Maria Sharapova',
 'Mario Cilic',
 'Maurice Harris',
 'Muslim Salihov',
 'Nikolas Basilashvili',
 'Novak Djokovic',
 'Piotr Strus',
 'Rafael Nadal',
 'Roger Federer',
 'Stan Wawrinka',
 'Stefanos Cicipas',
 'Svetlozar Savov',
 'Tervel Pulev',
 'Tommy Robredo',
 'Tony Thompson',
 'Tsvetozar Iliev',
 'Tyson Fury',
 'Viktor Troicky',
 'Wladimir Klitschko'}

In [25]:
d = dict.fromkeys(players, 0)

In [26]:
counter = 0
for k, v in d.items():
    d[k] = counter
    counter += 1

In [27]:
def set_id(v):
    return d[v]

In [28]:
df['Interviewee'] = df['Interviewee'].apply(set_id)
df['Opponent'] = df['Opponent'].apply(set_id)

In [29]:
df.drop('Date', axis=1, inplace=True)
df = df.join(df_date)

In [30]:
df['label'] = 0
df['label'] = df['Final Result']
df.drop('Final Result', axis=1, inplace=True)

In [31]:
display(df)

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,Prev. Match,Confidence,day,dayofweek,dayofyear,hour,month,quarter,week,weekday,weekofyear,year,label
0,1,5,18,1,34.0,210.0,39,28,4,5,0,3,22,5,265,0,9,3,38,5,38,2018,0
1,1,34,18,1,32.0,220.0,29,29,4,5,0,4,6,6,6,0,1,1,1,6,1,2019,1
2,1,18,5,1,210.0,34.0,28,39,4,5,0,4,22,5,265,0,9,3,38,5,38,2018,1
3,1,18,34,1,220.0,32.0,29,29,4,5,0,4,6,6,6,0,1,1,1,6,1,2019,0
4,1,18,6,1,180.0,16.0,27,41,4,5,0,4,29,5,119,0,4,2,17,5,17,2017,1
5,1,43,39,1,18.0,26.0,32,37,4,5,1,4,23,5,82,0,3,1,12,5,12,2019,0
6,1,21,3,1,0.26087,1.0,34,32,4,5,0,3,5,1,64,0,3,1,10,1,10,2019,0
7,1,22,39,1,5.0,22.0,32,35,4,5,0,5,5,1,187,0,7,3,27,1,27,2016,0
8,1,4,39,1,21.0,25.0,24,37,4,5,0,5,27,5,300,0,10,4,43,5,43,2018,0
9,1,1,39,1,4.285714,24.0,37,35,4,5,0,5,28,4,118,0,4,2,17,4,17,2017,0


### Save Dataframe as csv file

In [32]:
# file_name = 'Dataset/pool_matches_numerical_classified.csv'
# df.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)

### Normalization

In [33]:
x = df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_norm = pd.DataFrame(x_scaled, index=df.index, columns=df.columns)

In [34]:
df_norm.head()

Unnamed: 0,Sport,Interviewee,Opponent,Sex,Rank,Rank Opp.,Int. Age,Opp. Age,Health,Psychics,Prev. Match,Confidence,day,dayofweek,dayofyear,hour,month,quarter,week,weekday,weekofyear,year,label
0,0.0,0.088889,0.391304,1.0,0.125461,0.774908,0.948718,0.409091,1.0,1.0,0.0,0.5,0.689655,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,0.0
1,0.0,0.733333,0.391304,1.0,0.118081,0.811808,0.692308,0.454545,1.0,1.0,0.0,0.75,0.137931,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.377778,0.108696,1.0,0.774908,0.125461,0.666667,0.909091,1.0,1.0,0.0,0.75,0.689655,0.833333,0.794479,0.0,0.8,0.666667,0.787234,0.833333,0.787234,0.888889,1.0
3,0.0,0.377778,0.73913,1.0,0.811808,0.118081,0.692308,0.454545,1.0,1.0,0.0,0.75,0.137931,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.377778,0.130435,1.0,0.664207,0.059041,0.641026,1.0,1.0,1.0,0.0,0.75,0.931034,0.833333,0.346626,0.0,0.3,0.333333,0.340426,0.833333,0.340426,0.777778,1.0


### Save normalized Dataframe as csv file

In [35]:
file_name = 'dataset/features_norm.csv'
df_norm.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=True)