In [2]:
import pandas as pd
import numpy as np

In [3]:
football_df = pd.read_csv('data.csv')

In [4]:
football_df

Unnamed: 0.1,Unnamed: 0,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,...,lat/lng,type_of_shot,type_of_combined_shot,match_id,team_id,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1
0,0,10.0,167.0,72.0,10.0,1.0,0.0,2000-01,27.0,38.0,...,"45.539131, -122.651648",shot - 30,,20000012,1610612747,10.00,1.00,50.608,54.2000,38.000
1,1,12.0,-157.0,0.0,10.0,1.0,0.0,2000-01,22.0,35.0,...,"45.539131, -122.651648",shot - 45,,20000012,1610612747,10.00,1.00,28.800,22.0000,35.000
2,2,35.0,-101.0,135.0,7.0,1.0,0.0,2000-01,45.0,36.0,...,"45.539131, -122.651648",shot - 25,,20000012,1610612747,92.64,1.00,0.000,63.7216,54.400
3,3,43.0,138.0,175.0,6.0,1.0,0.0,2000-01,52.0,42.0,...,"45.539131, -122.651648",,shot - 3,20000012,1610612747,,1.00,122.608,52.0000,42.000
4,4,155.0,0.0,0.0,,2.0,0.0,2000-01,19.0,20.0,...,"45.539131, -122.651648",,shot - 1,20000012,1610612747,42.64,2.00,0.000,19.0000,20.000
5,5,244.0,-145.0,-11.0,9.0,3.0,0.0,,32.0,34.0,...,"45.539131, -122.651648",shot - 17,,20000012,1610612747,9.00,3.00,0.000,,34.000
6,6,251.0,0.0,0.0,8.0,,0.0,2000-01,52.0,20.0,...,"45.539131, -122.651648",,shot - 4,20000012,1610612747,8.00,3.00,0.000,112.2000,89.400
7,7,254.0,1.0,28.0,8.0,3.0,0.0,2000-01,5.0,22.0,...,"45.539131, -122.651648",,shot - 3,20000012,1610612747,68.64,3.00,0.000,5.0000,22.000
8,8,265.0,-65.0,,6.0,3.0,0.0,2000-01,12.0,32.0,...,"45.539131, -122.651648",shot - 36,,20000012,1610612747,6.00,3.00,0.000,12.0000,32.000
9,9,294.0,-33.0,,3.0,3.0,0.0,2000-01,36.0,32.0,...,"45.539131, -122.651648",shot - 44,,20000012,1610612747,3.00,3.00,0.000,52.2000,


# Data Preprocessing

In [5]:
cols_to_drop = [
    "type_of_combined_shot",
    "team_id",
    "team_name"
]

In [6]:
football_df['type_of_shot'] = [ s1 if not pd.isna(s1) else s2 for s1, s2 in zip(football_df['type_of_shot'], football_df['type_of_combined_shot']) ]

In [7]:
football_df.drop(cols_to_drop, inplace = True, axis = 1)

In [8]:
home_latitude_map = { home: l for home, l in zip(football_df['home/away'], football_df['lat/lng']) if not pd.isna(home) and not pd.isna(l)}

In [9]:
match_latitude_map = { match: l for match, l in zip(football_df['match_id'], football_df['lat/lng']) if not pd.isna(match) and not pd.isna(l)}


In [10]:
football_df['lat/lng'] = football_df.apply(
    lambda x: 
        home_latitude_map[x['home/away']] 
        if not pd.isna(x['home/away']) 
        else (
            match_latitude_map[x['match_id']] 
            if not pd.isna(x['match_id']) 
            else x['lat/lng']
        ),
    axis=1
)

In [11]:
cols_to_drop2 = [
    "lat/lng",
    "home/away",
    "date_of_game",
]

In [12]:
football_df.drop(cols_to_drop2, inplace= True, axis =1)

In [13]:
football_df.columns

Index(['Unnamed: 0', 'match_event_id', 'location_x', 'location_y',
       'remaining_min', 'power_of_shot', 'knockout_match', 'game_season',
       'remaining_sec', 'distance_of_shot', 'is_goal', 'area_of_shot',
       'shot_basics', 'range_of_shot', 'shot_id_number', 'type_of_shot',
       'match_id', 'remaining_min.1', 'power_of_shot.1', 'knockout_match.1',
       'remaining_sec.1', 'distance_of_shot.1'],
      dtype='object')

In [14]:
# res = football_df[~football_df['power_of_shot.1'].isnull()][['power_of_shot', 'power_of_shot.1', 'distance_of_shot', 'distance_of_shot.1', 'type_of_shot', 'shot_basics', 'range_of_shot']]

In [15]:
range_of_shot_unique = list(set(football_df['range_of_shot']))

In [16]:
range_of_shot_unique.pop(0)

nan

In [17]:
#Required unique classes of shot ranges
range_of_shot_unique

['16-24 ft.', '24+ ft.', '8-16 ft.', 'Less Than 8 ft.', 'Back Court Shot']

In [18]:
range_map = {}

In [19]:
football_df_clean = football_df[football_df['distance_of_shot'] == football_df['distance_of_shot.1']]

In [20]:
for val in range_of_shot_unique:
    d1 = football_df_clean[football_df_clean['range_of_shot'] == val]['distance_of_shot.1']
    d2 = football_df_clean[football_df_clean['range_of_shot'] == val]['distance_of_shot']
    range_map[val] = range(int(min(d1.min(), d2.min())), int(max(d1.max(), d2.max())))

In [21]:
range_map

{'16-24 ft.': range(36, 43),
 '24+ ft.': range(42, 63),
 '8-16 ft.': range(28, 36),
 'Less Than 8 ft.': range(20, 28),
 'Back Court Shot': range(60, 99)}

In [22]:
def range_return(dist): 
    for l, r in range_map.items():
        if dist in r:
            return l

In [23]:
def range_update(x):
    if not pd.isna(x['range_of_shot']): return x['range_of_shot']
    if x['distance_of_shot'] != x['distance_of_shot.1']:
        if range_return(x['distance_of_shot']):
            return range_return(x['distance_of_shot'])
        else:
            return range_return(x['distance_of_shot.1'])
    else:
        return range_return(x['distance_of_shot'])

In [24]:
football_df['range_of_shot'] = football_df.apply(range_update, axis=1)

In [25]:
football_df.drop(["distance_of_shot","distance_of_shot.1"], axis = 1, inplace=True)

In [26]:
football_df['power_of_shot'] = football_df.apply(
    lambda x:
        x['power_of_shot']
        if not pd.isna(x['power_of_shot'])
        else (
            x['power_of_shot.1']
            if not pd.isna(x['power_of_shot.1'])
            else x['power_of_shot']
        ),
    axis = 1
)

In [27]:
football_df.columns

Index(['Unnamed: 0', 'match_event_id', 'location_x', 'location_y',
       'remaining_min', 'power_of_shot', 'knockout_match', 'game_season',
       'remaining_sec', 'is_goal', 'area_of_shot', 'shot_basics',
       'range_of_shot', 'shot_id_number', 'type_of_shot', 'match_id',
       'remaining_min.1', 'power_of_shot.1', 'knockout_match.1',
       'remaining_sec.1'],
      dtype='object')

In [28]:
football_df.drop("shot_basics", inplace=True, axis = 1)

In [29]:
#Mapping area of shot with location x and y since x and y represent the same data
area_map = {}
football_df_clean = football_df[~football_df['area_of_shot'].isnull()]
for val in set(football_df_clean['area_of_shot']):
    x = football_df_clean[football_df_clean['area_of_shot'] == val]
    area_map[val] = \
        range(int(x['location_x'].min()), int(x['location_x'].max())), \
        range(int(x['location_y'].min()), int(x['location_y'].max()))

In [30]:
area_map

{'Right Side(R)': (range(41, 248), range(-44, 138)),
 'Left Side(L)': (range(-250, -42), range(-44, 137)),
 'Left Side Center(LC)': (range(-246, -51), range(88, 396)),
 'Center(C)': (range(-98, 128), range(-35, 396)),
 'Mid Ground(MG)': (range(-223, 217), range(398, 791)),
 'Right Side Center(RC)': (range(51, 241), range(89, 394))}

In [31]:
def area_update(x):
    if not pd.isna(x['area_of_shot']): return x['area_of_shot']
    for key, value in area_map.items():
        if not pd.isna(x['location_x']) and not pd.isna(x['location_y']):
            if x['location_x'] in value[0] and x['location_y'] in value[1]:
                return key
        elif not pd.isna(x['location_x']):
            if x['location_x'] in value[0]:
                return key
        elif not pd.isna(x['location_y']):
            if x['location_y'] in value[1]:
                return key
    return x['area_of_shot']

In [32]:
football_df['area_of_shot'] = football_df.apply(area_update, axis=1)

In [33]:
football_df.drop(["location_x","location_y"], inplace = True, axis = 1)

In [34]:
football_df['remaining_min'] = football_df.apply(
    lambda x:
        x['remaining_min']
        if not pd.isna(x['remaining_min'])
        else (
            x['remaining_min.1']
            if x['remaining_min.1'] <= 11
            else x['remaining_min']
        ),
    axis = 1
)

In [35]:
football_df['remaining_sec'] = football_df.apply(
    lambda x:
        x['remaining_sec']
        if not pd.isna(x['remaining_sec'])
        else (
            x['remaining_sec.1']
            if x['remaining_sec.1'] <= 59
            else x['remaining_sec']
        ),
    axis = 1
)

In [36]:
football_df['remaining_min'] = football_df.apply(
    lambda x:
        x['remaining_min']
        if not pd.isna(x['remaining_min'])
        else x['remaining_min.1'] % 11,
    axis = 1
)

In [37]:
football_df['remaining_sec'] = football_df.apply(
    lambda x:
        x['remaining_sec']
        if not pd.isna(x['remaining_sec'])
        else x['remaining_sec.1'] % 60,
    axis = 1
)

In [38]:
football_df['remaining_min'] = football_df.apply(
    lambda x:
        x['remaining_min']
        if not pd.isna(x['remaining_min'])
        else 0,
    axis = 1
)

In [39]:
football_df['remaining_sec'] = football_df.apply(
    lambda x:
        x['remaining_sec']
        if not pd.isna(x['remaining_sec'])
        else 0,
    axis = 1
)

In [40]:
cols_to_drop3 = [
    "remaining_min.1",
    "remaining_sec.1",
    "Unnamed: 0",
    "power_of_shot.1"
]

In [41]:
football_df.drop(cols_to_drop3, axis = 1, inplace = True)

In [42]:
football_df.columns

Index(['match_event_id', 'remaining_min', 'power_of_shot', 'knockout_match',
       'game_season', 'remaining_sec', 'is_goal', 'area_of_shot',
       'range_of_shot', 'shot_id_number', 'type_of_shot', 'match_id',
       'knockout_match.1'],
      dtype='object')

In [43]:
#encoding categorical data using enumeration method

area_of_shot_mapping = {}
for i, val in enumerate(set(football_df['area_of_shot'])):
    area_of_shot_mapping[val] = i

type_of_shot_mapping = {}
for i, val in enumerate(set(football_df['type_of_shot'])):
    type_of_shot_mapping[val] = i
    
range_of_shot_mapping = {}
for i, val in enumerate(set(football_df['range_of_shot'])):
    range_of_shot_mapping[val] = i

game_season_mapping = {}
for i, val in enumerate(set(football_df['game_season'])):
    game_season_mapping[val] = i

In [44]:
def mapping_categorical_values(c, d):
    def func(x):
        return d[x[c]]
    return func

In [45]:
df_back = football_df

In [46]:
mapping_list = [
    ("game_season", game_season_mapping),
    ("area_of_shot", area_of_shot_mapping),
    ("range_of_shot", range_of_shot_mapping),
    ("type_of_shot", type_of_shot_mapping) 
]

In [47]:
for col, d in mapping_list:
    football_df[col] = football_df.apply(mapping_categorical_values(col, d), axis = 1)

In [48]:
unique_powers = football_df.power_of_shot.unique()
unique_powers = unique_powers[~np.isnan(unique_powers)]

In [49]:
count_powers = football_df.groupby("power_of_shot").size()
count_powers /= count_powers.sum()

In [50]:
football_df['power_of_shot'] = football_df.apply(
    lambda x:
        x['power_of_shot']
        if not pd.isna(x['power_of_shot'])
        else np.random.choice(unique_powers, p = count_powers),
    axis = 1
)

In [51]:
def knockout_match_update(x):
    if not pd.isna(x['knockout_match']): return x['knockout_match']
    if not pd.isna(x['knockout_match.1']) and int(x['knockout_match.1']) in [0, 1]:
        return x['knockout_match.1']
    return x['knockout_match']

In [52]:
football_df['knockout_match'] = football_df.apply(knockout_match_update, axis = 1)

In [53]:
count_knockout = football_df.groupby("knockout_match").size()
count_knockout /= count_knockout.sum()

In [54]:
football_df["knockout_match"] = football_df.knockout_match.map(
    lambda x:
        np.random.choice([0,1], p = count_knockout) 
        if np.isnan(x) 
        else x 
    )

In [55]:
intermediate = football_df[~football_df['game_season'].isnull()][['game_season', 'match_id']]

In [56]:
match_id_game_season_map = dict(intermediate.groupby('game_season')['match_id'].apply(list))

In [57]:
def game_season_mapping(x):
    if not pd.isna(x['game_season']): return x['game_season']
    for key, value in match_id_game_season_map.items():
        if x['match_id'] in value:
            return key
    return np.random.choice(list(match_id_season_map.keys()))

In [58]:
football_df['game_season'] = football_df.apply(game_season_mapping, axis = 1)

In [59]:
football_df.drop("knockout_match.1", axis = 1, inplace = True)

In [60]:
football_df.head()

Unnamed: 0,match_event_id,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,is_goal,area_of_shot,range_of_shot,shot_id_number,type_of_shot,match_id
0,10.0,10.0,1.0,0.0,4.0,27.0,,1,0,1.0,2,20000012
1,12.0,10.0,1.0,0.0,4.0,22.0,0.0,2,3,2.0,0,20000012
2,35.0,7.0,1.0,0.0,4.0,45.0,1.0,3,0,3.0,41,20000012
3,43.0,6.0,1.0,0.0,4.0,52.0,0.0,6,0,4.0,32,20000012
4,155.0,9.64,2.0,0.0,4.0,19.0,1.0,4,4,5.0,14,20000012


# Model Training and Prediction

In [61]:
from sklearn.svm import SVR

In [62]:
clf = SVR()

In [63]:
headers = list(football_df.columns)[1:]

In [64]:
headers.remove('is_goal')
headers.remove('shot_id_number')
headers.remove("match_id")

In [65]:
condition = football_df["is_goal"].isnull()

In [66]:
X = np.array(football_df[~condition][headers])
Y = np.array(football_df[~condition]['is_goal'])

In [None]:
clf.fit(X, Y)



In [303]:
condition = football_df['is_goal'].isnull() & ~football_df['shot_id_number'].isnull()

In [304]:
id_numbers = np.array(football_df[condition]['shot_id_number'])
X_pred = np.array(football_df[condition][headers])

In [305]:
Y_pred = clf.predict(X_pred)

In [307]:
Y_pred[1]

0.45646414473849134

# Writing to CSV File

In [308]:
import csv

In [309]:
with open("bipin_kalra_110597_code_7.csv", "w") as file:
    writer = csv.writer(file)
    headers = ['shot_id_number','is_goal']
    writer.writerow(headers)
    writer.writerows(zip(id_numbers, Y_pred))