In [18]:
import seaborn as sns
sns.set_theme()
import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Folder (relative to the working directory) that holds the raw spreadsheet.
data_dir = "data"
# Build the path first so the read call stays short and the file name is easy to spot.
spreadsheet_path = os.path.join(data_dir, "Updated spreadsheet for Ellen Rushe.xlsx")
df = pd.read_excel(spreadsheet_path)

In [19]:
# Tally how often each value occurs in the defender column before it is recoded.
defender_col = "6. Defender being coded"
df[defender_col].value_counts()

6. Defender being coded
0    1148
2     352
Name: count, dtype: int64

In [20]:
# Recode every 2 in the defender column as 1, then show the tally again to
# confirm the replacement took effect.
defender_col = "6. Defender being coded"
df[defender_col] = df[defender_col].replace({2: 1})
df[defender_col].value_counts()

6. Defender being coded
0    1148
1     352
Name: count, dtype: int64

In [21]:
# All other columns carry a numeric prefix; give 'Team' one too so the naming
# convention is uniform.
df = df.rename({"Team": "0. Team"}, axis="columns")
# One boolean per column: True if that column contains at least one NaN.
df.isna().any()

0. Team                                   False
1. Previous phase                         False
2. Pass number                            False
3. Match period                           False
4. Defensive direction                    False
5. Number of defenders                    False
6. Defender being coded                   False
7. Tackle sequence                        False
8. Positional grouping of tackler         False
9. Positional grouping of ball-carrier    False
10. Distance from ball-carrier            False
11. Defensive team performance            False
12. Anticipation of ball-carrier          False
13. Come to balance                       False
14. Body position of tackler              False
15. Body position of ball-carrier         False
16. Drop height                           False
17. Dominant contact angle                False
18. Tackler head position                 False
19. Tackler arm position                  False
20. Tackler foot placement close to BC  

In [22]:
# Build a lookup from each space-free column name back to its original spelling
# (will be used later on to recover the readable names).
# NOTE: this cell renames df's columns at the end, so re-running it on the
# already-renamed frame produces an identity map (the names no longer contain spaces).
compact_names = df.columns.str.replace(' ', '')

# Two headers that differ only in spacing would collapse onto the same key:
# the dict would silently lose an entry and the renamed frame would end up
# with duplicate column labels. Stop here rather than continue with a lossy map.
assert compact_names.is_unique, (
    "Removing spaces makes these column names collide: {}".format(
        list(compact_names[compact_names.duplicated(keep=False)])
    )
)

column_name_map = dict(zip(compact_names, df.columns))
# Sanity check: every key is exactly its original name with the spaces removed.
assert all(k == v.replace(" ", "") for k, v in column_name_map.items())
assert len(column_name_map) == len(df.columns)

# with open(os.path.join(data_dir, 'column_name_map.json'), 'w') as f:
#     json.dump(column_name_map , f)

# Remove spaces from column names for convenience after saving original names above.
df.columns = compact_names


In [23]:
# One-hot encode every column of df into a separate, all-float DataFrame.
df_encoded = pd.get_dummies(df, columns=list(df.columns), dtype=float)
# Keep the encoded column names around; later cells use them as feature names.
X_feature_names = df_encoded.columns.tolist()

# Report the column count and names before and after encoding.
# NOTE(review): the first message mentions a "target feature drop", but no
# column is dropped in the code shown here - confirm the label is still accurate.
for message, frame in (
    ("{} features after target feature drop: {}", df),
    ("{} features after dummy columns added: {}", df_encoded),
):
    print(message.format(frame.shape[1], frame.columns))


31 features after target feature drop: Index(['0.Team', '1.Previousphase', '2.Passnumber', '3.Matchperiod',
       '4.Defensivedirection', '5.Numberofdefenders', '6.Defenderbeingcoded',
       '7.Tacklesequence', '8.Positionalgroupingoftackler',
       '9.Positionalgroupingofball-carrier', '10.Distancefromball-carrier',
       '11.Defensiveteamperformance', '12.Anticipationofball-carrier',
       '13.Cometobalance', '14.Bodypositionoftackler',
       '15.Bodypositionofball-carrier', '16.Dropheight',
       '17.Dominantcontactangle', '18.Tacklerheadposition',
       '19.Tacklerarmposition', '20.TacklerfootplacementclosetoBC',
       '21.Tacklersshouldersinfrontofhips', '22.Speedoftackler',
       '23.SpeedofBC', '24.BCdirectionofmovement', '25.Orientationoftackler',
       '26.Tackletype', '27.Directionoftackle', '28.Tacklerheadplacement',
       '29.Bodyregionstruckontackler', '30.BodyregionstruckonBC'],
      dtype='object')
80 features after dummy columns added: Index(['0.Team_0', '0

In [9]:
# Output folder for the split files (same value as the folder the spreadsheet was read from).
data_dir = "data"
# NOTE(review): this cell's execution count is out of sequence with the cells
# above it - re-run it after them so the CSVs reflect the current df_encoded.

# Work on the raw NumPy matrix of the encoded frame; 80/20 split with a fixed
# seed so the partition is reproducible.
feature_matrix = df_encoded.values
train, test = train_test_split(feature_matrix, test_size=0.2, random_state=42)

# Re-attach the encoded column names to each partition.
train_df = pd.DataFrame(data=train, columns=X_feature_names)
test_df = pd.DataFrame(data=test, columns=X_feature_names)

# Write both partitions to CSV without the index column.
for frame, file_name in (
    (train_df, "train_31_vars.csv"),
    (test_df, "test_31_vars.csv"),
):
    frame.to_csv(os.path.join(data_dir, file_name), index=False)
