In [706]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [707]:
football_df = pd.read_csv("data.csv")

In [708]:
football_df.head()

Unnamed: 0.1,Unnamed: 0,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,...,lat/lng,type_of_shot,type_of_combined_shot,match_id,team_id,remaining_min.1,power_of_shot.1,knockout_match.1,remaining_sec.1,distance_of_shot.1
0,0,10.0,167.0,72.0,10.0,1.0,0.0,2000-01,27.0,38.0,...,"45.539131, -122.651648",shot - 30,,20000012,1610612747,10.0,1.0,50.608,54.2,38.0
1,1,12.0,-157.0,0.0,10.0,1.0,0.0,2000-01,22.0,35.0,...,"45.539131, -122.651648",shot - 45,,20000012,1610612747,10.0,1.0,28.8,22.0,35.0
2,2,35.0,-101.0,135.0,7.0,1.0,0.0,2000-01,45.0,36.0,...,"45.539131, -122.651648",shot - 25,,20000012,1610612747,92.64,1.0,0.0,63.7216,54.4
3,3,43.0,138.0,175.0,6.0,1.0,0.0,2000-01,52.0,42.0,...,"45.539131, -122.651648",,shot - 3,20000012,1610612747,,1.0,122.608,52.0,42.0
4,4,155.0,0.0,0.0,,2.0,0.0,2000-01,19.0,20.0,...,"45.539131, -122.651648",,shot - 1,20000012,1610612747,42.64,2.0,0.0,19.0,20.0


In [709]:
football_df.columns

Index(['Unnamed: 0', 'match_event_id', 'location_x', 'location_y',
       'remaining_min', 'power_of_shot', 'knockout_match', 'game_season',
       'remaining_sec', 'distance_of_shot', 'is_goal', 'area_of_shot',
       'shot_basics', 'range_of_shot', 'team_name', 'date_of_game',
       'home/away', 'shot_id_number', 'lat/lng', 'type_of_shot',
       'type_of_combined_shot', 'match_id', 'team_id', 'remaining_min.1',
       'power_of_shot.1', 'knockout_match.1', 'remaining_sec.1',
       'distance_of_shot.1'],
      dtype='object')

# Data Cleaning

In [710]:
# football_df.rename(columns={'remaining_min.1':'remaining_min_1',
#                           'distance_of_shot.1':'distance_of_shot_1'}, inplace=True)

In [711]:
#Merging data from redundant columns

In [712]:
# football_df.remaining_min_1.fillna(football_df.remaining_min, inplace=True)

In [713]:
# football_df.distance_of_shot.fillna(football_df.distance_of_shot_1, inplace=True)

In [714]:
unique_mins = football_df["remaining_min"].unique()
unique_mins = unique_mins[~np.isnan(unique_mins)]

count = football_df.groupby("remaining_min").size()
count /= count.sum()

In [716]:
football_df["remaining_min"] = football_df.remaining_min.map(lambda x: np.random.choice(unique_mins, p=count) if np.isnan(x) else x)

In [718]:
#Dropping unecessary and redundant columns

In [717]:
columns_to_drop = [
    "team_name",
    "home/away",
    "date_of_game",
    "team_id",
    "remaining_min.1",
    "power_of_shot.1",
    "knockout_match.1",
    "remaining_sec.1",
    "distance_of_shot.1",
    "Unnamed: 0",
]

In [719]:
football_df.drop(columns_to_drop, axis=1, inplace=True)

In [720]:
football_df.columns

Index(['match_event_id', 'location_x', 'location_y', 'remaining_min',
       'power_of_shot', 'knockout_match', 'game_season', 'remaining_sec',
       'distance_of_shot', 'is_goal', 'area_of_shot', 'shot_basics',
       'range_of_shot', 'shot_id_number', 'lat/lng', 'type_of_shot',
       'type_of_combined_shot', 'match_id'],
      dtype='object')

In [721]:
football_df.match_event_id.fillna(value = -1, inplace = True)

In [722]:
unique_pos = football_df["power_of_shot"].unique()
unique_pos = unique_pos[~np.isnan(unique_pos)]

count_pos = football_df.groupby("power_of_shot").size()
count_pos /= count_pos.sum()

In [723]:
football_df["power_of_shot"] = football_df.power_of_shot.map(lambda x: np.random.choice(unique_pos, p = count_pos) if np.isnan(x) else x )

In [724]:
count_knockout = football_df.groupby("knockout_match").size()
count_knockout /= count_knockout.sum()

In [725]:
football_df["knockout_match"] = football_df.knockout_match.map(lambda x: np.random.choice([0,1], p = count_knockout) if np.isnan(x) else x )

In [726]:
# football_df.drop(["type_of_shot","type_of_combined_shot"], axis = 1, inplace=True)

In [727]:
football_df[["lat","lng"]] = football_df["lat/lng"].str.split(",",expand = True)

In [728]:
football_df.drop("lat/lng", inplace=True, axis = 1)

In [729]:
football_df["game_season"] = football_df["game_season"].fillna(method = 'backfill')

In [730]:
football_df["range_of_shot"] = football_df["range_of_shot"].astype('category')
football_df["range_of_shot"] = football_df["range_of_shot"].cat.codes

football_df["shot_basics"] = football_df["shot_basics"].astype('category')
football_df["shot_basics"] = football_df["shot_basics"].cat.codes

football_df["area_of_shot"] = football_df["area_of_shot"].astype('category')
football_df["area_of_shot"] = football_df["area_of_shot"].cat.codes

football_df["game_season"] = football_df["game_season"].astype('category')
football_df["game_season"] = football_df["game_season"].cat.codes

In [731]:
football_df["shot_id_number"] = football_df.index
football_df.shot_id_number = football_df.shot_id_number.map(lambda x: x+1)

In [None]:
imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
imputer = imputer.fit(football_df.iloc[:,6:7])
football_df.iloc[:,6:7] = imputer.transform(football_df.iloc[:,6:7])

In [None]:
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(football_df.iloc[:,1:3])
football_df.iloc[:,1:3] = imputer.transform(football_df.iloc[:,1:3])

In [28]:
imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
imputer = imputer.fit(football_df.iloc[:,15:])
football_df.iloc[:,15:] = imputer.transform(football_df.iloc[:,15:])



In [29]:
imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
imputer = imputer.fit(football_df.iloc[:,7:8])
football_df.iloc[:,7:8] = imputer.transform(football_df.iloc[:,7:8])



In [30]:
football_df.type_of_shot.fillna(football_df.type_of_combined_shot, inplace=True)
football_df.drop("type_of_combined_shot", axis = 1, inplace=True)

In [32]:
football_df.head()

Unnamed: 0,match_event_id,location_x,location_y,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,shot_id_number,type_of_shot,match_id,remaining_min_1,lat,lng
0,10.0,167.0,72.0,1.0,0.0,4,27.0,38.0,,5,4,0,1.0,shot - 30,20000012.0,10.0,45.539131,-122.651648
1,12.0,-157.0,0.0,1.0,0.0,4,22.0,35.0,0.0,2,4,2,2.0,shot - 45,20000012.0,10.0,45.539131,-122.651648
2,35.0,-101.0,135.0,1.0,0.0,4,45.0,36.0,1.0,1,4,0,3.0,shot - 25,20000012.0,9.0,45.539131,-122.651648
3,43.0,138.0,175.0,1.0,0.0,4,52.0,42.0,0.0,4,4,0,4.0,shot - 3,20000012.0,6.0,45.539131,-122.651648
4,155.0,0.0,0.0,2.0,0.0,4,19.0,20.0,1.0,0,0,4,5.0,shot - 1,20000012.0,42.64,45.539131,-122.651648


In [33]:
# football_df = football_df.astype("float32")

In [34]:
# football_df.distance_of_shot.isnull().sum()

In [35]:
#Type_of_shot and type_of_combined_shot cleaned

In [36]:
football_df["type_of_shot"] = football_df.type_of_shot.map(lambda x: int(str(x).split(" - ")[1]), na_action="ignore")
# football_df["type_of_combined_shot"] = football_df.type_of_combined_shot.map(lambda x: int(str(x).split(" - ")[1]), na_action="ignore")

In [37]:
unique_types_of_shot = football_df["type_of_shot"].unique()
unique_types_of_shot = unique_types_of_shot[~np.isnan(unique_types_of_shot)]

In [38]:
# unique_types_of_combined_shot = football_df["type_of_combined_shot"].unique()
# unique_types_of_combined_shot = unique_types_of_combined_shot[~np.isnan(unique_types_of_combined_shot)]

In [39]:
count_types_of_shot = football_df.groupby("type_of_shot").size()
# count_types_of_combined_shot = football_df.groupby("type_of_combined_shot").size()

count_types_of_shot /= count_types_of_shot.sum()
# count_types_of_combined_shot /= count_types_of_combined_shot.sum()

In [40]:
football_df["type_of_shot"] = football_df.type_of_shot.map(lambda x: np.random.choice(unique_types_of_shot,p=count_types_of_shot) if np.isnan(x) else x)
# football_df["type_of_combined_shot"] = football_df.type_of_combined_shot.map(lambda x: np.random.choice(unique_types_of_combined_shot, p=count_types_of_combined_shot) if np.isnan(x) else x)

In [61]:
football_df.head()

Unnamed: 0,match_event_id,location_x,location_y,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,shot_id_number,type_of_shot,match_id,remaining_min_1,lat,lng
0,10.0,167.0,72.0,1.0,0.0,4,27.0,38.0,,5,4,0,1,30,20000012.0,10.0,45.539131,-122.651648
1,12.0,-157.0,0.0,1.0,0.0,4,22.0,35.0,0.0,2,4,2,2,45,20000012.0,10.0,45.539131,-122.651648
2,35.0,-101.0,135.0,1.0,0.0,4,45.0,36.0,1.0,1,4,0,3,25,20000012.0,9.0,45.539131,-122.651648
3,43.0,138.0,175.0,1.0,0.0,4,52.0,42.0,0.0,4,4,0,4,3,20000012.0,6.0,45.539131,-122.651648
4,155.0,0.0,0.0,2.0,0.0,4,19.0,20.0,1.0,0,0,4,5,1,20000012.0,42.64,45.539131,-122.651648


# Splitting data into Training and Testing data

In [62]:
condition = np.isnan(football_df["is_goal"])

In [63]:
unlabelled_data = football_df[condition]

In [64]:
unlabelled_data.shape

(6268, 18)

In [65]:
labelled_data = football_df[~condition]

In [66]:
labelled_data.shape

(24429, 18)

In [67]:
labelled_data = shuffle(labelled_data)
labelled_data.head()

Unnamed: 0,match_event_id,location_x,location_y,power_of_shot,knockout_match,game_season,remaining_sec,distance_of_shot,is_goal,area_of_shot,shot_basics,range_of_shot,shot_id_number,type_of_shot,match_id,remaining_min_1,lat,lng
14627,262.0,-150.0,91.126933,3.0,0.0,12,59.0,39.0,1.0,1,4,0,14628,3,20801038.0,8.0,41.845137,-87.66045
19814,345.0,-92.0,132.0,3.0,0.0,16,13.0,36.0,0.0,1,4,0,19815,2,21200297.0,2.0,42.982923,-71.446094
10649,377.0,0.0,0.0,4.0,0.0,10,23.0,20.0,1.0,0,0,4,10650,32,20600738.0,6.0,43.717098,-79.395917
11329,521.0,-138.0,210.0,4.0,0.0,10,4.0,45.0,1.0,1,5,1,11330,42,20601144.0,28.64,42.982923,-71.446094
24499,366.0,143.0,-12.0,3.0,0.0,2,2.0,34.0,1.0,5,4,2,24500,3,29800210.0,4.0,42.982923,-71.446094


In [69]:
Y = labelled_data["is_goal"]

In [50]:
Y.shape

(24429,)

In [71]:
X = labelled_data.drop(["is_goal","shot_id_number"], axis = 1)

In [72]:
X.shape

(24429, 16)

In [83]:
shot_id_number = X_Unlabelled.shot_id_number

In [84]:
X_Unlabelled = unlabelled_data.drop(["is_goal","shot_id_number"], axis = 1)

In [85]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [87]:
DT = DecisionTreeClassifier()
RF = RandomForestClassifier(n_estimators = 1000)

In [77]:
# DT.fit(X_train,Y_train)

In [79]:
# DT.score(X_test,Y_test)

In [89]:
RF.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [90]:
RF.score(X_test,Y_test)

0.6111338518215309

In [91]:
y_pred = RF.predict(X_Unlabelled)

In [92]:
d = {
    "shot_id_number" : shot_id_number,
    "is_goal" : y_pred
}

In [93]:
final = pd.DataFrame(d)

In [95]:
final.set_index('shot_id_number', inplace=True)

In [96]:
final

Unnamed: 0_level_0,is_goal
shot_id_number,Unnamed: 1_level_1
1,0.0
8,0.0
17,1.0
20,1.0
22,0.0
33,0.0
34,0.0
35,1.0
36,0.0
37,1.0


In [None]:
from sklearn.svm import SVC

In [None]:
clf = SVC(kernel='rbf', gamma=0.01, C=100)
clf.fit(X_train, Y_train)

In [None]:
clf.score(X_test,Y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logisticRegr = LogisticRegression()

In [None]:
logisticRegr.fit(X_train, Y_train)

In [None]:
logisticRegr.score(X_test, Y_test)