## New things learnt in CR Data

1. Label encoding - converting categorical columns to integer datatypes
2. forward filling of nan values - using ffill() and fillna(method=ffill)
3. fillna will be done only for values having empty values in the dataframe selected
4. finding and replacing the values based on probability of occurance of an event
5. Using apply function rigorously
6. Using isnull(), value_counts(), unique() methods
7. finding a dataframe on particular condition - df["wanted dataframe columns"]["conditon"]
8. selecting a dataframe based on null values of a column - df[df["column name"].isnull()]

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [111]:
data = pd.read_csv("yds_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,...,lat/lng,type_of_shot,type_of_combined_shot,match_id,team_id,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1
0,0,10.0,167.0,72.0,10.0,1.0,0.0,2000-01,27.0,38.0,...,"45.539131, -122.651648",shot - 30,,20000012,1610612747,10.0,1.0,50.608,54.2,38.0
1,1,12.0,-157.0,0.0,10.0,1.0,0.0,2000-01,22.0,35.0,...,"45.539131, -122.651648",shot - 45,,20000012,1610612747,10.0,1.0,28.8,22.0,35.0
2,2,35.0,-101.0,135.0,7.0,1.0,0.0,2000-01,45.0,36.0,...,"45.539131, -122.651648",shot - 25,,20000012,1610612747,92.64,1.0,0.0,63.7216,54.4
3,3,43.0,138.0,175.0,6.0,1.0,0.0,2000-01,52.0,42.0,...,"45.539131, -122.651648",,shot - 3,20000012,1610612747,,1.0,122.608,52.0,42.0
4,4,155.0,0.0,0.0,,2.0,0.0,2000-01,19.0,20.0,...,"45.539131, -122.651648",,shot - 1,20000012,1610612747,42.64,2.0,0.0,19.0,20.0


In [85]:
## removing un wanted columns
print(data.columns)
data.drop(['Unnamed: 0','remaining_min.1','power_of_shot.1', 'knockout_match.1', 'remaining_sec.1','distance_of_shot.1'], axis=1, inplace=True)
print(data.columns)

Index(['Unnamed: 0', 'match_event_id', 'location_x', 'location_y',
       'remaining_min', 'power_of_shot', 'knockout_match', 'game_season',
       'remaining_sec', 'distance_of_shot', 'is_goal', 'area_of_shot',
       'shot_basics', 'range_of_shot', 'team_name', 'date_of_game',
       'home/away', 'shot_id_number', 'lat/lng', 'type_of_shot',
       'type_of_combined_shot', 'match_id', 'team_id', 'remaining_min.1',
       'power_of_shot.1', 'knockout_match.1', 'remaining_sec.1',
       'distance_of_shot.1'],
      dtype='object')
Index(['match_event_id', 'location_x', 'location_y', 'remaining_min',
       'power_of_shot', 'knockout_match', 'game_season', 'remaining_sec',
       'distance_of_shot', 'is_goal', 'area_of_shot', 'shot_basics',
       'range_of_shot', 'team_name', 'date_of_game', 'home/away',
       'shot_id_number', 'lat/lng', 'type_of_shot', 'type_of_combined_shot',
       'match_id', 'team_id'],
      dtype='object')


In [86]:
data.info()
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   match_event_id         29134 non-null  float64
 1   location_x             29236 non-null  float64
 2   location_y             29157 non-null  float64
 3   remaining_min          29135 non-null  float64
 4   power_of_shot          29211 non-null  float64
 5   knockout_match         29180 non-null  float64
 6   game_season            24835 non-null  object 
 7   remaining_sec          29103 non-null  float64
 8   distance_of_shot       29130 non-null  float64
 9   is_goal                24429 non-null  float64
 10  area_of_shot           29195 non-null  object 
 11  shot_basics            29122 non-null  object 
 12  range_of_shot          29133 non-null  object 
 13  team_name              29162 non-null  object 
 14  date_of_game           29147 non-null  object 
 15  ho

match_event_id            1563
location_x                1461
location_y                1540
remaining_min             1562
power_of_shot             1486
knockout_match            1517
game_season               5862
remaining_sec             1594
distance_of_shot          1567
is_goal                   6268
area_of_shot              1502
shot_basics               1575
range_of_shot             1564
team_name                 1535
date_of_game              1550
home/away                 1497
shot_id_number            1563
lat/lng                   1565
type_of_shot             15280
type_of_combined_shot    15417
match_id                     0
team_id                      0
dtype: int64

In [87]:
## converting dateofgame column to datetime value
data["date_of_game"] = pd.to_datetime(data["date_of_game"], errors="coerce")
data["date_of_game"].dtype

dtype('<M8[ns]')

In [88]:
data["game_season"] = data["game_season"].astype("str")
data["game_season"].dtype

dtype('O')

In [89]:
## Label encoding the game season

l_gs = data["game_season"].unique()
v_gs = np.arange(len(l_gs))

data["game_season"].replace(to_replace=l_gs,value = v_gs,inplace=True)
data["game_season"] = data["game_season"].astype("int32")
data["game_season"]  ## converting the data type from int64 to int32

0         0
1         0
2         0
3         0
4         0
         ..
30692    20
30693    20
30694    20
30695    20
30696    20
Name: game_season, Length: 30697, dtype: int32

## Handling missing values

In [90]:
data.isnull().sum()

match_event_id            1563
location_x                1461
location_y                1540
remaining_min             1562
power_of_shot             1486
knockout_match            1517
game_season                  0
remaining_sec             1594
distance_of_shot          1567
is_goal                   6268
area_of_shot              1502
shot_basics               1575
range_of_shot             1564
team_name                 1535
date_of_game              1550
home/away                 1497
shot_id_number            1563
lat/lng                   1565
type_of_shot             15280
type_of_combined_shot    15417
match_id                     0
team_id                      0
dtype: int64

In [91]:
data["power_of_shot"] = data["power_of_shot"].fillna(data["power_of_shot"].mean()) ## filling power of shot with mean
data["remaining_sec"] = data["remaining_sec"].fillna(data["remaining_sec"].median()) ## filling remaining time with median

## filling type of combined shot with mode
data["type_of_combined_shot"] = data["type_of_combined_shot"].fillna(data["type_of_combined_shot"].value_counts().keys()[0])

In [92]:
## filling shot_id_number with continuous values
val = np.arange(1, len(data["shot_id_number"])+1)  
data["shot_id_number"] = val

In [93]:
## filling location_x and location_y columns having null values

data["location_x"] = data["location_x"].fillna(0)
data["location_y"] = data["location_y"].fillna(0)

In [94]:
## filling data with forward filling 

cols = ["match_event_id","home/away","lat/lng","team_name","team_id","match_id","remaining_min","knockout_match","game_season"]

data[cols] = data[cols].ffill()

In [107]:
al = data["shot_basics"][data["range_of_shot"] == "Back Court Shot"].value_counts().keys().to_list()
asum = data["shot_basics"][data["range_of_shot"] == "Back Court Shot"].value_counts().sum()
av = (data["shot_basics"][data["range_of_shot"] == "Back Court Shot"].value_counts()/asum).to_list()
av

[0.8441558441558441, 0.15584415584415584]

In [50]:
# Filling Missing Values In "shot_basics" based on "range_of_short" column!

data.loc[(data["range_of_shot"]=="Less Than 8 ft.")&(data["shot_basics"].isnull()),"shot_basics"] = data.loc[(data["range_of_shot"]=="Less Than 8 ft.")&(data["shot_basics"].isnull()),"shot_basics"].apply(lambda x:x if type(x)==str else np.random.choice(["Goal Area","Goal Line"],1,p=[0.7590347263095939, 0.24096527369040613])[0])
data.loc[(data.range_of_shot == '16-24 ft.'), 'shot_basics'] = data.loc[(data.range_of_shot == '16-24 ft.'),"shot_basics"].fillna(value='Mid Range')
data.loc[(data["range_of_shot"]=="24+ ft.")&(data["shot_basics"].isnull()),"shot_basics"] = data.loc[(data["range_of_shot"]=="24+ ft.")&(data["shot_basics"].isnull()),"shot_basics"].apply(lambda x:x if type(x) == str else np.random.choice(["Penalty Spot","Right Corner","Left Corner"],1,p=[0.8932384341637011,0.06192170818505338,0.044839857651245554])[0])
data.loc[(data["range_of_shot"]=="8-16 ft.")&(data["shot_basics"].isnull()),"shot_basics"] = data.loc[(data["range_of_shot"]=="8-16 ft.")&(data["shot_basics"].isnull()),"shot_basics"].apply(lambda x:x if type(x)==str else np.random.choice(["Mid Range","Goal Line"],1,p=[0.6488754615642833, 0.35112453843571667])[0])
data.loc[(data["range_of_shot"]=="Back Court Shot")&(data["shot_basics"].isnull()),"shot_basics"] = data.loc[(data["range_of_shot"]=="Back Court Shot")&(data["shot_basics"].isnull()),"shot_basics"].apply(lambda x:x if type(x)==str else np.random.choice(["Mid Ground Line","Penalty Spot"],1,p=[0.8441558441558441, 0.15584415584415584])[0])
data.isnull().sum()

match_event_id               0
location_x                   0
location_y                   0
remaining_min                0
power_of_shot                0
knockout_match               0
game_season                  0
remaining_sec                0
distance_of_shot          1567
is_goal                   6268
area_of_shot              1502
shot_basics                 66
range_of_shot             1564
team_name                    0
date_of_game              1550
home/away                    0
shot_id_number               0
lat/lng                      0
type_of_shot             15280
type_of_combined_shot        0
match_id                     0
team_id                      0
dtype: int64

In [108]:
## calculations for less than 8 ft
df1 = data[data["shot_basics"]=="Goal Line"]
count1 = df1["shot_basics"][data["range_of_shot"]=="Less Than 8 ft."].count()
df2 = data[data["shot_basics"]=="Goal Area"]
count2 = df2["shot_basics"][data["range_of_shot"]=="Less Than 8 ft."].count()
findf = data[data["range_of_shot"]=="Less Than 8 ft."]
final = findf["shot_basics"][findf["shot_basics"].notnull()].count()
p1,p2 = (count1/final,count2/final)
#print(p1,p2,p1+p2)
## calculations for 16-24 feet
data["shot_basics"][data["range_of_shot"]=="16-24 ft."].value_counts()
## calculations for 24+ feet
data["shot_basics"][data["range_of_shot"]=="24+ ft."].value_counts()
df1 = data[data["shot_basics"]=="Penalty Spot"]
count1 = df1["shot_basics"][data["range_of_shot"]=="24+ ft."].count()
df2 = data[data["shot_basics"]=="Right Corner"]
count2 = df2["shot_basics"][data["range_of_shot"]=="24+ ft."].count()
df3 = data[data["shot_basics"]=="Left Corner"]
count3 = df3["shot_basics"][data["range_of_shot"]=="24+ ft."].count()
findf = data[data["range_of_shot"]=="24+ ft."]
final = findf["shot_basics"][findf["shot_basics"].notnull()].count()
p1,p2,p3 = (count1/final,count2/final,count3/final)
#print(p1,p2,p3,p1+p2+p3)
## calculations for 8-16 feet
data["shot_basics"][data["range_of_shot"]=="8-16 ft."].value_counts()
df1 = data[data["shot_basics"]=="Mid Range"]
count1 = df1["shot_basics"][data["range_of_shot"]=="8-16 ft."].count()
df2 = data[data["shot_basics"]=="Goal Line"]
count2 = df2["shot_basics"][data["range_of_shot"]=="8-16 ft."].count()
findf = data[data["range_of_shot"]=="8-16 ft."]
final = findf["shot_basics"][findf["shot_basics"].notnull()].count()
p1,p2 = (count1/final,count2/final)
#print(p1,p2,p1+p2)
## calculations for back court shot
print(data["shot_basics"][data["range_of_shot"]=="Back Court Shot"].value_counts())
df1 = data[data["shot_basics"]=="Mid Ground Line"]
count1 = df1["shot_basics"][data["range_of_shot"]=="Back Court Shot"].count()
df2 = data[data["shot_basics"]=="Penalty Spot"]
count2 = df2["shot_basics"][data["range_of_shot"]=="Back Court Shot"].count()
findf = data[data["range_of_shot"]=="Back Court Shot"]
final = findf["shot_basics"][findf["shot_basics"].notnull()].count()
p1,p2 = (count1/final,count2/final)
print(p1,p2,p1+p2)

Mid Ground Line    65
Penalty Spot       12
Name: shot_basics, dtype: int64
0.8441558441558441 0.15584415584415584 1.0


In [52]:
## same to be done for 
# Filling Missing Values In "range_of_short" based on "short_basics" column!

# if shot_basics is Goal Area, then range of shot is Less Than 8 ft
data.loc[(data.shot_basics == 'Goal Area'), 'range_of_shot']       = data[data.shot_basics == 'Goal Area'].range_of_shot.fillna(value='Less Than 8 ft.')
# if shot_basics is Penalty Spot, then range of shot is  24+ ft.
data.loc[(data.shot_basics == 'Penalty Spot'), 'range_of_shot']    = data[data.shot_basics == 'Penalty Spot'].range_of_shot.fillna(value= '24+ ft.')
# if shot_basics is Right Corner, then range of shot is  24+ ft.
data.loc[(data.shot_basics == 'Right Corner'), 'range_of_shot']    = data[data.shot_basics == 'Right Corner'].range_of_shot.fillna(value='24+ ft.')
# if shot_basics is Left Corner, then range of shot is  24+ ft.
data.loc[(data.shot_basics == 'Left Corner'), 'range_of_shot']     = data[data.shot_basics == 'Left Corner'].range_of_shot.fillna(value='24+ ft.')
# if shot_basics is Mid Ground Line , then range of shot is  Back Court Shot
data.loc[(data.shot_basics == 'Mid Ground Line'), 'range_of_shot'] = data[data.shot_basics == 'Mid Ground Line'].range_of_shot.fillna(value='Back Court Shot')
# if shot_basics is Mid Range then randomly assign '16-24 ft.' or  '8-16 ft.' to range of shot
data.loc[(data.shot_basics == 'Mid Range')&(data.range_of_shot.isnull()), 'range_of_shot']       = pd.Series(data[(data.shot_basics == 'Mid Range')&(data.range_of_shot.isnull())].range_of_shot.apply(lambda x: x if type(x)==str else np.random.choice(['16-24 ft.', '8-16 ft.'],1,p=[0.6527708850289495, 0.34722911497105047])[0]))
# if shot_basics is Goal Line then randomly assign ''8-16 ft.' or  'Less Than 8 ft.' to range of shot
data.loc[(data.shot_basics == 'Goal Line')&(data.range_of_shot.isnull()), 'range_of_shot']       = pd.Series(data[(data.shot_basics == 'Goal Line')&(data.range_of_shot.isnull())].range_of_shot.apply(lambda x: x if type(x)==str else np.random.choice(['8-16 ft.', 'Less Than 8 ft.'],1,p=[0.5054360956752839, 0.49456390432471614])[0]))

data.isnull().sum() # number of missing values for range_of_shot column should have been reduced

match_event_id               0
location_x                   0
location_y                   0
remaining_min                0
power_of_shot                0
knockout_match               0
game_season                  0
remaining_sec                0
distance_of_shot          1567
is_goal                   6268
area_of_shot              1502
shot_basics                 66
range_of_shot               66
team_name                    0
date_of_game              1550
home/away                    0
shot_id_number               0
lat/lng                      0
type_of_shot             15280
type_of_combined_shot        0
match_id                     0
team_id                      0
dtype: int64

In [53]:
## remaining 66 values can be filled with forward filling
## forward filling can be used in other way with fillna method as well as shown below
## forward fill also fills only for the places having nan value
data["shot_basics"].ffill(inplace=True)
data["range_of_shot"].fillna(method="ffill",inplace=True)
data.isnull().sum()  

match_event_id               0
location_x                   0
location_y                   0
remaining_min                0
power_of_shot                0
knockout_match               0
game_season                  0
remaining_sec                0
distance_of_shot          1567
is_goal                   6268
area_of_shot              1502
shot_basics                  0
range_of_shot                0
team_name                    0
date_of_game              1550
home/away                    0
shot_id_number               0
lat/lng                      0
type_of_shot             15280
type_of_combined_shot        0
match_id                     0
team_id                      0
dtype: int64

In [54]:
## filling area_of_shot with center_c
data["area_of_shot"].fillna('Center(C)',inplace=True)
data.isnull().sum()  

match_event_id               0
location_x                   0
location_y                   0
remaining_min                0
power_of_shot                0
knockout_match               0
game_season                  0
remaining_sec                0
distance_of_shot          1567
is_goal                   6268
area_of_shot                 0
shot_basics                  0
range_of_shot                0
team_name                    0
date_of_game              1550
home/away                    0
shot_id_number               0
lat/lng                      0
type_of_shot             15280
type_of_combined_shot        0
match_id                     0
team_id                      0
dtype: int64

In [55]:
## probalbility calculation
df1 = data["distance_of_shot"][data["distance_of_shot"]==20].count()
df2 = data["distance_of_shot"][data["distance_of_shot"]==45].count()
df3 = data["distance_of_shot"][data["distance_of_shot"]==44].count()
df4 = data["distance_of_shot"][data["distance_of_shot"]==37].count()
count = df1+df2+df3+df4
print(df1/count,df2/count,df3/count,df4/count,df1/count+df2/count+df3/count+df4/count)

0.5278056615137523 0.18630797028709095 0.14384661714515157 0.1420397510540052 1.0


In [77]:
## different method

cl = data["distance_of_shot"].value_counts().head(4).keys().to_list()
csum = data["distance_of_shot"].value_counts().head(4).sum()
cv = (data["distance_of_shot"].value_counts().head(4)/csum).to_list()
cv

[0.5284933645589384,
 0.1859658253100876,
 0.14320409402376616,
 0.1423367161072079]

In [56]:
data.loc[(data["distance_of_shot"].isnull()),"distance_of_shot"] = data.loc[(data["distance_of_shot"].isnull()),"distance_of_shot"].apply(lambda x:x if type(x) == str else np.random.choice([20,45,44,37],1,p=[0.5278056615137523, 0.18630797028709095, 0.14384661714515157 ,0.1420397510540052])[0])
data.isnull().sum()  

match_event_id               0
location_x                   0
location_y                   0
remaining_min                0
power_of_shot                0
knockout_match               0
game_season                  0
remaining_sec                0
distance_of_shot             0
is_goal                   6268
area_of_shot                 0
shot_basics                  0
range_of_shot                0
team_name                    0
date_of_game              1550
home/away                    0
shot_id_number               0
lat/lng                      0
type_of_shot             15280
type_of_combined_shot        0
match_id                     0
team_id                      0
dtype: int64

# Making test and train data set

In [57]:
train = data[data["is_goal"].notnull()]
test = data[data["is_goal"].isnull()]
train.set_index(np.arange(len(train)))
test.set_index(np.arange(len(test)))
print(train.shape,test.shape)

(24429, 22) (6268, 22)


In [58]:
data["type_of_shot"].value_counts()
df1 = data[data["type_of_shot"]== "shot - 4"]
c1 = df1["type_of_shot"][data["is_goal"]==1 ].count()
df2 = data[data["type_of_shot"]== "shot - 39"]
c2 = df2["type_of_shot"][data["is_goal"]==1 ].count()
df3 = data[data["type_of_shot"]== "shot - 44"]
c3 = df3["type_of_shot"][data["is_goal"]==1 ].count()
df4 = data[data["type_of_shot"]== "shot - 36"]
c4 = df4["type_of_shot"][data["is_goal"]==1 ].count()
df5 = data[data["type_of_shot"]== "shot - 15"]
c5 = df5["type_of_shot"][data["is_goal"]==1 ].count()
df6 = data[data["type_of_shot"]== "shot - 38"]
c6 = df6["type_of_shot"][data["is_goal"]==1 ].count()
count = c1+c2+c3+c4+c5+c6
print(count)
print(c1/count,c2/count,c3/count,c4/count,c5/count,c6/count,c1/count+c2/count+c3/count+c4/count+c5/count+c6/count)

2252
0.2682060390763766 0.19182948490230906 0.14653641207815277 0.1447602131438721 0.12966252220248667 0.11900532859680284 1.0


In [59]:
# another approach
x = train["type_of_shot"][train["is_goal"]==1].value_counts().head(6).sum()
v = train["type_of_shot"][train["is_goal"]==1].value_counts().head(6).keys()
print(v)
l = (train["type_of_shot"][train["is_goal"]==1].value_counts().head(6)/x).to_list()
l

Index(['shot - 4', 'shot - 39', 'shot - 44', 'shot - 36', 'shot - 15',
       'shot - 38'],
      dtype='object')


[0.2682060390763766,
 0.19182948490230906,
 0.14653641207815277,
 0.1447602131438721,
 0.12966252220248667,
 0.11900532859680284]

In [60]:
train.loc[(train["is_goal"]==1)&(train["type_of_shot"].isnull()),"type_of_shot"] = train.loc[(train["is_goal"]==1) & (train["type_of_shot"].isnull()),"type_of_shot"].apply(lambda x: x if type(x)==str else np.random.choice(v,1,l)[0])
train.isnull().sum()

match_event_id              0
location_x                  0
location_y                  0
remaining_min               0
power_of_shot               0
knockout_match              0
game_season                 0
remaining_sec               0
distance_of_shot            0
is_goal                     0
area_of_shot                0
shot_basics                 0
range_of_shot               0
team_name                   0
date_of_game             1237
home/away                   0
shot_id_number              0
lat/lng                     0
type_of_shot             6723
type_of_combined_shot       0
match_id                    0
team_id                     0
dtype: int64

In [61]:
train["type_of_shot"][train["is_goal"]==0].value_counts().head(6)
nv = train["type_of_shot"][train["is_goal"]==0].value_counts().head(6).keys().to_list()
s = train["type_of_shot"][train["is_goal"]==0].value_counts().head(6).sum()
nl = (train["type_of_shot"][train["is_goal"]==0].value_counts().head(6)/s).to_list()
train.loc[(train["is_goal"]==0)&(train["type_of_shot"].isnull()),"type_of_shot"] = train.loc[(train["is_goal"]==0) & (train["type_of_shot"].isnull()),"type_of_shot"].apply(lambda x: x if type(x)==str else np.random.choice(nv,1,nl)[0])
train.isnull().sum()

match_event_id              0
location_x                  0
location_y                  0
remaining_min               0
power_of_shot               0
knockout_match              0
game_season                 0
remaining_sec               0
distance_of_shot            0
is_goal                     0
area_of_shot                0
shot_basics                 0
range_of_shot               0
team_name                   0
date_of_game             1237
home/away                   0
shot_id_number              0
lat/lng                     0
type_of_shot                0
type_of_combined_shot       0
match_id                    0
team_id                     0
dtype: int64

In [62]:
## Making type of shot value in test data as well 
test["type_of_shot"].value_counts().head(3)
fl = test["type_of_shot"].value_counts().head(3).keys().tolist()
fsum = test["type_of_shot"].value_counts().head(3).sum()
fv = (test["type_of_shot"].value_counts().head(3)/fsum).to_list()
test.loc[(test["type_of_shot"].isnull()),"type_of_shot"] = test.loc[(test["type_of_shot"].isnull()),"type_of_shot"].apply(lambda x:x if type(x)==str else np.random.choice(fl,1,fv)[0])
test.isnull().sum()

match_event_id              0
location_x                  0
location_y                  0
remaining_min               0
power_of_shot               0
knockout_match              0
game_season                 0
remaining_sec               0
distance_of_shot            0
is_goal                  6268
area_of_shot                0
shot_basics                 0
range_of_shot               0
team_name                   0
date_of_game              313
home/away                   0
shot_id_number              0
lat/lng                     0
type_of_shot                0
type_of_combined_shot       0
match_id                    0
team_id                     0
dtype: int64

In [63]:
train.dtypes

match_event_id                  float64
location_x                      float64
location_y                      float64
remaining_min                   float64
power_of_shot                   float64
knockout_match                  float64
game_season                       int32
remaining_sec                   float64
distance_of_shot                float64
is_goal                         float64
area_of_shot                     object
shot_basics                      object
range_of_shot                    object
team_name                        object
date_of_game             datetime64[ns]
home/away                        object
shot_id_number                    int64
lat/lng                          object
type_of_shot                     object
type_of_combined_shot            object
match_id                          int64
team_id                           int64
dtype: object

In [64]:
## labelling all the categories to integers
## Label encoding the category values
for col in train.columns:
    if train[col].dtypes == object :
        cats = train[col].unique()
        values = np.arange(len(cats))
        train[col].replace(to_replace=cats,value=values,inplace=True)
        train[col] = train[col].astype("int")
        ## make for test data as well
        tcats = test[col].unique()
        tvalues = np.arange(len(tcats))
        test[col].replace(to_replace=tcats,value=tvalues,inplace=True)
        test[col] = test[col].astype("int")
    

In [65]:
train.drop(["date_of_game","is_goal"],axis=1,inplace=True)
test.drop(["date_of_game","is_goal"],axis=1,inplace=True)

In [66]:
train.head()

Unnamed: 0,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,area_of_shot,shot_basics,range_of_shot,team_name,home/away,shot_id_number,lat/lng,type_of_shot,type_of_combined_shot,match_id,team_id
1,12.0,-157.0,0.0,10.0,1.0,0.0,0,22.0,35.0,0,0,0,0,0,2,0,0,0,20000012,1610612747
2,35.0,-101.0,135.0,7.0,1.0,0.0,0,45.0,36.0,1,0,1,0,0,3,0,1,0,20000012,1610612747
3,43.0,138.0,175.0,6.0,1.0,0.0,0,52.0,42.0,2,0,1,0,0,4,0,2,0,20000012,1610612747
4,155.0,0.0,0.0,6.0,2.0,0.0,0,19.0,20.0,3,1,2,0,0,5,0,3,1,20000012,1610612747
5,244.0,-145.0,-11.0,9.0,3.0,0.0,1,32.0,34.0,0,0,0,0,0,6,0,4,0,20000012,1610612747


In [67]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24429 entries, 1 to 30696
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   match_event_id         24429 non-null  float64
 1   location_x             24429 non-null  float64
 2   location_y             24429 non-null  float64
 3   remaining_min          24429 non-null  float64
 4   power_of_shot          24429 non-null  float64
 5   knockout_match         24429 non-null  float64
 6   game_season            24429 non-null  int32  
 7   remaining_sec          24429 non-null  float64
 8   distance_of_shot       24429 non-null  float64
 9   area_of_shot           24429 non-null  int64  
 10  shot_basics            24429 non-null  int64  
 11  range_of_shot          24429 non-null  int64  
 12  team_name              24429 non-null  int64  
 13  home/away              24429 non-null  int64  
 14  shot_id_number         24429 non-null  int64  
 15  la