In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMClassifier

In [3]:
# Raw string: the backslashes are path separators, not escape sequences.
# In a normal string "\C" and "\d" are invalid escapes, which newer Python
# versions warn about. The resulting path text is unchanged.
df = pd.read_csv(r"Desktop\Cristano_Ronaldo_Final_v1\data.csv")

In [4]:
# Report the row count before and after removing exact duplicate rows.
rows_before = df.shape[0]
print("Total number of dataset is {}".format(rows_before))

df.drop_duplicates(keep='first', inplace=True)  # keep the first occurrence of each repeated row
rows_after = df.shape[0]
print("Total number of dataset after removing duplicates is {}".format(rows_after))

df.head(2)

Total number of dataset is 30697
Total number of dataset after removing duplicates is 30697


Unnamed: 0.1,Unnamed: 0,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,...,lat/lng,type_of_shot,type_of_combined_shot,match_id,team_id,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1
0,0,10.0,167.0,72.0,10.0,1.0,0.0,2000-01,27.0,38.0,...,"45.539131, -122.651648",shot - 30,,20000012,1610612747,10.0,1.0,50.608,54.2,38.0
1,1,12.0,-157.0,0.0,10.0,1.0,0.0,2000-01,22.0,35.0,...,"45.539131, -122.651648",shot - 45,,20000012,1610612747,10.0,1.0,28.8,22.0,35.0


In [5]:
# Fill gaps in match_event_id by interpolating between neighbouring rows.
event_ids = df['match_event_id']
df['match_event_id'] = event_ids.interpolate()

In [6]:
# Remove columns that are not used below (shot_id_number is rebuilt from the
# index near the end of the notebook).
unused_columns = ['Unnamed: 0', 'match_id', 'team_name', 'team_id', 'shot_id_number']
df.drop(columns=unused_columns, inplace=True)

In [7]:
# Forward-fill these columns from the previous row.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
ffill_columns = ['game_season', 'knockout_match', 'home/away', 'power_of_shot', 'lat/lng', 'date_of_game']
df[ffill_columns] = df[ffill_columns].ffill()

In [8]:
# Merge the two shot-type columns into a single 'shot' column: missing values
# become empty strings so the two texts can simply be concatenated.
shot_parts = ['type_of_shot', 'type_of_combined_shot']
for part in shot_parts:
    df[part] = df[part].replace(np.nan, '', regex=True)

df['shot'] = df[shot_parts[0]] + df[shot_parts[1]]
df.drop(columns=shot_parts, inplace=True)

In [9]:
# remaining_sec makes more sense than remaining_sec.1, as a number of seconds
# should be <= 60, so the ".1" duplicate is dropped.
df.drop('remaining_sec.1', axis=1, inplace=True)

# For continuous values, fill the gaps with the column mean.
# Assign the result back instead of calling fillna(inplace=True) through
# attribute access: that is chained assignment and does not update df when
# pandas Copy-on-Write is active.
df['remaining_sec'] = df['remaining_sec'].fillna(df['remaining_sec'].mean())

In [10]:
# The opponent code is taken from the last three characters of 'home/away'.
df['opp_team'] = df['home/away'].str.slice(-3)

In [11]:
# Keep only the separator between the two team codes (characters 5..-3),
# then map it to a home/away flag.
# NOTE(review): presumably values look like "TEAM @ OPP" / "TEAM vs. OPP" - confirm.
# .str.strip() tolerates missing values (the lambda-based strip raised on NaN),
# and the result is assigned back rather than replaced in place on a column
# selection, which is chained assignment under pandas Copy-on-Write.
venue_marker = df['home/away'].str[5:-3].str.strip()
df['home/away'] = venue_marker.replace({'@': 'away', 'vs.': 'home'})

In [12]:
# Same treatment as remaining_sec: drop the ".1" duplicate of remaining_min.
df.drop('remaining_min.1', axis=1, inplace=True)

# Fill missing values with the column mean.
# Assigned back explicitly: fillna(inplace=True) through attribute access is
# chained assignment and does not update df under pandas Copy-on-Write.
df['remaining_min'] = df['remaining_min'].fillna(df['remaining_min'].mean())

In [13]:
# Drop the ".1" duplicates of power_of_shot and knockout_match.
duplicate_columns = ['power_of_shot.1', 'knockout_match.1']
df.drop(columns=duplicate_columns, inplace=True)

In [14]:
# Parse the game date and expand it into year / day / month columns.
# The vectorised .dt accessor replaces a Python-level lambda call per row.
game_date = pd.to_datetime(df["date_of_game"])
df["year"] = game_date.dt.year
df["day"] = game_date.dt.day
df["month"] = game_date.dt.month
df.drop(['date_of_game'], axis=1, inplace=True)

In [15]:
# distance_of_shot and distance_of_shot.1 are almost the same, so only
# distance_of_shot is kept. Its missing values are not filled in this cell.
duplicate_distance = 'distance_of_shot.1'
df.drop(columns=[duplicate_distance], inplace=True)

In [16]:
# Where area_of_shot is missing, derive it from shot_basics for the values
# listed in the mapping below.
# A single .loc assignment is used instead of df[col][mask] = ..., which is
# chained indexing: it may write to a temporary copy and is a no-op under
# pandas Copy-on-Write.
area_by_shot_basics = {
    'Left Corner': 'Left Side(L)',
    'Right Corner': 'Right Side(R)',
    'Goal Area': 'Center(C)',
    'Mid Ground Line': 'Mid Ground(G)',
}
for basics_value, area_value in area_by_shot_basics.items():
    rows_to_fill = (df['shot_basics'] == basics_value) & df['area_of_shot'].isna()
    df.loc[rows_to_fill, 'area_of_shot'] = area_value

In [17]:
def _as_whole_number_text(value):
    """Format a non-null number with zero decimals; return nulls unchanged."""
    if pd.isnull(value):
        return value
    return "{:.0f}".format(value)

# Both coordinates become text here; they are converted back to float after
# the imputed values have been merged in further down.
for coord in ('location_y', 'location_x'):
    df[coord] = df[coord].apply(_as_whole_number_text)

In [18]:
d = df[['location_x', 'location_y', 'area_of_shot']]

# Each subset keeps the rows where two of the three columns are present;
# the remaining column is the one that may still be missing.
d1 = d[d[['location_x', 'location_y']].notna().all(axis=1)]
d2 = d[d[['location_y', 'area_of_shot']].notna().all(axis=1)]
d3 = d[d[['area_of_shot', 'location_x']].notna().all(axis=1)]

In [19]:
from sklearn.preprocessing import LabelEncoder

# Turn the area labels in d2 into integer codes so they can serve as a feature.
label_encoder = LabelEncoder()
d2['area_of_shot'] = label_encoder.fit_transform(d2['area_of_shot'])

In [20]:
# Same integer encoding for the area labels in d3 (the encoder is re-fitted).
d3['area_of_shot'] = label_encoder.fit_transform(d3['area_of_shot'])

In [21]:
# Split each subset into rows whose third column is known (training data)
# and rows where it is missing (to be predicted).
nonempty_area = d1.dropna(axis=0, how='any', subset=['area_of_shot'])
empty_area = d1[~d1.index.isin(nonempty_area.index)]

nonempty_x = d2.dropna(axis=0, how='any', subset=['location_x'])
empty_x = d2[~d2.index.isin(nonempty_x.index)]

nonempty_y = d3.dropna(axis=0, how='any', subset=['location_y'])
# Complement of nonempty_y (this line previously used nonempty_x.index,
# a copy-paste slip from the block above).
empty_y = d3[~d3.index.isin(nonempty_y.index)]

In [22]:
# The rows to predict must not carry the (all-missing) target column.
empty_area = empty_area.drop(columns=['area_of_shot'])
empty_x = empty_x.drop(columns=['location_x'])
empty_y = empty_y.drop(columns=['location_y'])

In [23]:
from sklearn.linear_model import LogisticRegression
# Classifier used to impute the categorical columns.
lr = LogisticRegression()

from sklearn.ensemble import RandomForestRegressor
# Regressor used to impute the numeric columns. A fixed random_state makes the
# imputed values, and therefore the exported dataset, reproducible between runs.
rfr = RandomForestRegressor(random_state=42)

In [24]:
# pop() returns the target column and removes it from the feature frame in one step.
nonempty_area_labels = nonempty_area.pop('area_of_shot')
nonempty_x_labels = nonempty_x.pop('location_x')
nonempty_y_labels = nonempty_y.pop('location_y')

In [25]:
# fit() returns the estimator itself, so each model is trained and applied in
# one expression. rfr is re-fitted for y only after the x predictions are taken.
pred1 = lr.fit(nonempty_area, nonempty_area_labels).predict(empty_area)
pred2 = rfr.fit(nonempty_x, nonempty_x_labels).predict(empty_x)
pred3 = rfr.fit(nonempty_y, nonempty_y_labels).predict(empty_y)

In [26]:
# Attach every prediction to the rows it was made for, and remove the target
# column from the corresponding subset frame.
empty_area = empty_area.assign(area_of_shot=pred1)
del d1['area_of_shot']

empty_x = empty_x.assign(location_x=pred2)
del d2['location_x']

empty_y = empty_y.assign(location_y=pred3)
del d3['location_y']

In [27]:
# Align each set of predictions with the full row index of d.
# d and the empty_* frames share column names, so the merge suffixes the
# overlapping columns: d's copy gets "_x" and the empty_* copy gets "_y".
# The "<column>_y" column therefore holds the predicted values. Rows without
# a prediction are NaN after the outer merge and are turned into '' here.
final1 = pd.merge(d, empty_area, left_index=True, right_index=True,how='outer')
final1 = final1.replace(np.nan, '', regex=True)
final1 = final1['area_of_shot_y']

# 'location_x_y' is empty_x's location_x column plus the "_y" merge suffix.
final2 = pd.merge(d, empty_x, left_index=True, right_index=True,how='outer')
final2 = final2.replace(np.nan, '', regex=True)
final2 = final2['location_x_y']

# 'location_y_y' is empty_y's location_y column plus the "_y" merge suffix.
final3 = pd.merge(d, empty_y, left_index=True, right_index=True,how='outer')
final3 = final3.replace(np.nan, '', regex=True)
final3 = final3['location_y_y']

In [28]:
# Bring the predicted area labels (column 'area_of_shot_y') into df by index.
df = df.merge(final1, left_index=True, right_index=True, how='outer')

In [29]:
# A row has either an original label or a predicted one (the other side is
# ''), so concatenating the two texts yields the filled column. Rows with
# neither end up blank and are turned back into NaN.
known_area = df['area_of_shot'].replace(np.nan, '', regex=True)
combined_area = known_area + df['area_of_shot_y']
df['area_of_shot'] = combined_area.replace(r'^\s*$', np.nan, regex=True)
df.drop(columns=['area_of_shot_y'], inplace=True)

In [30]:
# Bring the predicted x coordinates ('location_x_y') into df and blank out the
# missing originals so both columns can be concatenated as text.
df = df.merge(final2, left_index=True, right_index=True, how='outer')
df['location_x'] = df['location_x'].replace(np.nan, '', regex=True)

In [31]:
# Both columns are cast to text, then joined element-wise: one side is always
# '' for a given row, so the join keeps whichever value exists.
original_x_text = df['location_x'].astype(str)
df['location_x_y'] = df['location_x_y'].astype(str)

df['location_x'] = original_x_text + df['location_x_y']

In [32]:
# Blank text -> NaN, then back to a rounded float column.
df['location_x'] = (
    df['location_x']
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(float)
    .round()
)
df.drop(columns=['location_x_y'], inplace=True)

In [33]:
# Same procedure as for location_x: merge the predictions, join original and
# predicted text (one of them is always ''), then convert back to rounded floats.
df = df.merge(final3, left_index=True, right_index=True, how='outer')

original_y_text = df['location_y'].replace(np.nan, '', regex=True).astype(str)
df['location_y_y'] = df['location_y_y'].astype(str)
df['location_y'] = original_y_text + df['location_y_y']

df['location_y'] = (
    df['location_y']
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(float)
    .round()
)
df.drop(columns=['location_y_y'], inplace=True)

In [34]:
def segregate(x):
    """Split one row's "lat/lng" text on ", " and return the pieces as a Series.

    Kept for row-wise use; the column-wide split below is vectorised instead.
    """
    return pd.Series(x["lat/lng"].split(", "))

# Split the whole column at once rather than calling segregate() per row
# through df.apply(axis=1), which runs a Python function for every row.
df[["lat", "long"]] = df["lat/lng"].str.split(", ", expand=True)
df["lat"] = pd.to_numeric(df["lat"])
df["long"] = pd.to_numeric(df["long"])

In [35]:
def _split_columns(array):
    if array.ndim == 1:
        return array[0], array[1] # just a single row
    else:
        return array[:,0], array[:,1]

In [36]:
R = 6378137.0
R_km = R/1000

def haversine(lat, lon, origin=(42.982923, -71.446094)):
    """Great-circle distance from `origin` to the given point(s), scaled by R_km.

    Parameters
    ----------
    lat, lon : float or array-like
        Latitude and longitude in degrees; scalars, numpy arrays and pandas
        Series are all accepted and the result has the same shape.
    origin : pair of float, optional
        (latitude, longitude) in degrees of the reference point. Defaults to
        the reference point this notebook has always used.

    Returns
    -------
    float or array-like
        Haversine distance, in the unit of R_km.
    """
    lat1, lon1 = np.radians(origin)
    lat2, lon2 = np.radians(lat), np.radians(lon)

    # Separate names for the deltas so the inputs are not overwritten.
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon * 0.5) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt/arcsin return NaN; clip it back into range.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R_km * np.arcsin(np.sqrt(a))

In [37]:
# Distance of every row's lat/long from the haversine reference point, rounded.
df['dist'] = haversine(df['lat'], df['long']).round()

In [38]:
# Bucket the distance: 0 -> group 0, then (0, 1000], (1000, 2000], (2000, 3000]
# -> groups 1..3, and anything above 3000 -> group 4.
df.loc[df['dist'] == 0, 'dist_group'] = 0
bounded_ranges = [(0, 1000), (1000, 2000), (2000, 3000)]
for group, (lower, upper) in enumerate(bounded_ranges, start=1):
    in_range = df['dist'].gt(lower) & df['dist'].le(upper)
    df.loc[in_range, 'dist_group'] = group
df.loc[df['dist'] > 3000, 'dist_group'] = 4

In [40]:
d = df[['distance_of_shot', 'shot_basics', 'range_of_shot']]

# Each subset keeps the rows where two of the three columns are present;
# the remaining column is the one that may still be missing.
d1 = d[d[['distance_of_shot', 'shot_basics']].notna().all(axis=1)]
d2 = d[d[['range_of_shot', 'shot_basics']].notna().all(axis=1)]
d3 = d[d[['range_of_shot', 'distance_of_shot']].notna().all(axis=1)]

In [41]:
# Integer-encode the categorical columns that act as features in each subset.
# The shared encoder is re-fitted for every (frame, column) pair, in this order.
columns_to_encode = (
    (d1, 'shot_basics'),
    (d2, 'shot_basics'),
    (d2, 'range_of_shot'),
    (d3, 'range_of_shot'),
)
for frame, feature in columns_to_encode:
    frame[feature] = label_encoder.fit_transform(frame[feature])

In [42]:
# Rows whose third column is known become training data; the remaining rows
# of the same subset are the ones to predict.
nonempty_range = d1.dropna(subset=['range_of_shot'])
empty_range = d1.loc[~d1.index.isin(nonempty_range.index)]

nonempty_dist = d2.dropna(subset=['distance_of_shot'])
empty_dist = d2.loc[~d2.index.isin(nonempty_dist.index)]

nonempty_shot = d3.dropna(subset=['shot_basics'])
empty_shot = d3.loc[~d3.index.isin(nonempty_shot.index)]

In [43]:
# The rows to predict must not carry the (all-missing) target column.
empty_range = empty_range.drop(columns=['range_of_shot'])
empty_dist = empty_dist.drop(columns=['distance_of_shot'])
empty_shot = empty_shot.drop(columns=['shot_basics'])

In [44]:
# pop() returns the target column and removes it from the feature frame in one step.
nonempty_range_labels = nonempty_range.pop('range_of_shot')
nonempty_dist_labels = nonempty_dist.pop('distance_of_shot')
nonempty_shot_labels = nonempty_shot.pop('shot_basics')

In [45]:
# fit() returns the estimator itself, so each model is trained and applied in
# one expression. lr is re-fitted for shot_basics only after the range
# predictions are taken.
pred1 = lr.fit(nonempty_range, nonempty_range_labels).predict(empty_range)
pred2 = rfr.fit(nonempty_dist, nonempty_dist_labels).predict(empty_dist)
pred3 = lr.fit(nonempty_shot, nonempty_shot_labels).predict(empty_shot)

In [46]:
# Attach every prediction to the rows it was made for, and remove the target
# column from the corresponding subset frame.
empty_range = empty_range.assign(range_of_shot=pred1)
del d1['range_of_shot']

empty_dist = empty_dist.assign(distance_of_shot=pred2)
del d2['distance_of_shot']

empty_shot = empty_shot.assign(shot_basics=pred3)
del d3['shot_basics']

In [47]:
# Align each set of predictions with the full row index of d.
# d and the empty_* frames share column names, so the merge suffixes the
# overlapping columns: d's copy gets "_x" and the empty_* copy gets "_y".
# The "<column>_y" column therefore holds the predicted values. Rows without
# a prediction are NaN after the outer merge and are turned into '' here.
final1 = pd.merge(d, empty_range, left_index=True, right_index=True,how='outer')
final1 = final1.replace(np.nan, '', regex=True)
final1 = final1['range_of_shot_y']

# Predicted distance_of_shot values.
final2 = pd.merge(d, empty_dist, left_index=True, right_index=True,how='outer')
final2 = final2.replace(np.nan, '', regex=True)
final2 = final2['distance_of_shot_y']

# Predicted shot_basics labels.
final3 = pd.merge(d, empty_shot, left_index=True, right_index=True,how='outer')
final3 = final3.replace(np.nan, '', regex=True)
final3 = final3['shot_basics_y']

In [48]:
# Merge the predicted range labels and concatenate them with the originals:
# one side is always '' for a given row. Blank results become NaN again.
df = df.merge(final1, left_index=True, right_index=True, how='outer')
known_range = df['range_of_shot'].replace(np.nan, '', regex=True)
combined_range = known_range + df['range_of_shot_y']
df['range_of_shot'] = combined_range.replace(r'^\s*$', np.nan, regex=True)
df.drop(columns=['range_of_shot_y'], inplace=True)

In [49]:
# Same merge-and-concatenate step for the predicted shot_basics labels.
df = df.merge(final3, left_index=True, right_index=True, how='outer')
known_basics = df['shot_basics'].replace(np.nan, '', regex=True)
combined_basics = known_basics + df['shot_basics_y']
df['shot_basics'] = combined_basics.replace(r'^\s*$', np.nan, regex=True)
df.drop(columns=['shot_basics_y'], inplace=True)

In [50]:
# Bring the predicted distances ('distance_of_shot_y') into df and blank out
# the missing originals so both columns can be concatenated as text.
df = df.merge(final2, left_index=True, right_index=True, how='outer')
df['distance_of_shot'] = df['distance_of_shot'].replace(np.nan, '', regex=True)

In [51]:
# Both columns are cast to text, then joined element-wise: one side is always
# '' for a given row, so the join keeps whichever value exists.
original_distance_text = df['distance_of_shot'].astype(str)
df['distance_of_shot_y'] = df['distance_of_shot_y'].astype(str)

df['distance_of_shot'] = original_distance_text + df['distance_of_shot_y']

In [52]:
# Blank text -> NaN, then back to a rounded float column.
df['distance_of_shot'] = (
    df['distance_of_shot']
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(float)
    .round()
)
df.drop(columns=['distance_of_shot_y'], inplace=True)

In [53]:
# The raw "lat/lng" text has been split into the numeric lat and long columns.
df.drop(columns=['lat/lng'], inplace=True)

In [54]:
# Whatever is still missing after the model-based imputation is filled with a
# simple statistic: the mean for numeric columns, the most frequent value for
# categorical ones.
mean_filled = ['distance_of_shot', 'location_x', 'location_y']
mode_filled = ['area_of_shot', 'range_of_shot', 'shot_basics']

fill_values = {name: df[name].mean() for name in mean_filled}
fill_values.update({name: df[name].mode()[0] for name in mode_filled})

for name, value in fill_values.items():
    df[name] = df[name].fillna(value)

In [55]:
# Two derived features: the sum and the difference of the shot coordinates.
loc_x, loc_y = df["location_x"], df["location_y"]
df["add_loc"] = loc_x + loc_y
df["sub_loc"] = loc_x - loc_y

In [56]:
# Rebuild shot_id_number from the row index, shifted so that ids start at 1
# for index 0.
df = df.reset_index().rename(columns={"index": "shot_id_number"})
df['shot_id_number'] += 1

In [57]:
from sklearn.preprocessing import LabelEncoder

In [62]:
columns = ['shot','opp_team','home/away','area_of_shot','shot_basics','range_of_shot','year','month','game_season']

def encoder(df, cols=None):
    """Label-encode categorical columns of `df` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with integer codes.
    cols : iterable of str, optional
        Columns to encode. Defaults to the module-level `columns` list, which
        keeps existing `encoder(df)` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for convenient reassignment.
    """
    if cols is None:
        cols = columns
    for col in cols:
        # A fresh encoder per column: codes are independent between columns.
        df[col] = LabelEncoder().fit_transform(df[col])
    return df

In [63]:
# Replace every column listed in `columns` with its integer label codes.
df = encoder(df)

In [66]:
# game_season is not part of the exported dataset.
df.drop(columns=['game_season'], inplace=True)

In [68]:
# Write the processed frame to df.csv in the working directory, without the index column.
df.to_csv('df.csv',index=False)

In [69]:
# Export this notebook (zs.ipynb) as a plain Python script via a shell command.
!jupyter nbconvert --to script zs.ipynb

[NbConvertApp] Converting notebook zs.ipynb to script
[NbConvertApp] Writing 14284 bytes to zs.py
