In [9]:
import numpy as np

In [1]:
import pandas as pd

data = pd.read_csv("data.csv")
data = data[data.shot_made_flag.isnull()==False] # 같은 표현 : data = data.dropna()

In [2]:
X = data.drop('shot_made_flag', axis=1)
y=data['shot_made_flag']

In [3]:
for col in X.columns:
    print(col)

action_type
combined_shot_type
game_event_id
game_id
lat
loc_x
loc_y
lon
minutes_remaining
period
playoffs
season
seconds_remaining
shot_distance
shot_type
shot_zone_area
shot_zone_basic
shot_zone_range
team_id
team_name
game_date
matchup
opponent
shot_id


In [4]:
X.drop('game_id', axis=1, inplace=True) # Independent
X.drop('game_event_id', axis=1, inplace=True) # Independent

X.drop('lat', axis=1, inplace=True) # Correlated with loc_x
X.drop('lon', axis=1, inplace=True) # Correlated with loc_y

X.drop('team_id', axis=1, inplace=True) # Always one number
X.drop('team_name', axis=1, inplace=True) # Always LA Lakers

In [5]:
# Remaining time
X['seconds_from_period_end'] = 60 * X['minutes_remaining'] + X['seconds_remaining']
X['last_5_sec_in_period'] = X['seconds_from_period_end'] < 5

X.drop('minutes_remaining', axis=1, inplace=True)
X.drop('seconds_remaining', axis=1, inplace=True)
X.drop('seconds_from_period_end', axis=1, inplace=True)

## Matchup - (away/home)
X['home_play'] = X['matchup'].str.contains('vs').astype('int')
X.drop('matchup', axis=1, inplace=True)

# Game date
X['game_date'] = pd.to_datetime(X['game_date'])
X['game_year'] = X['game_date'].dt.year
X['game_month'] = X['game_date'].dt.month
X.drop('game_date', axis=1, inplace=True)

# Loc_x, and loc_y binning
X['loc_x'] = pd.cut(X['loc_x'], 25)
X['loc_y'] = pd.cut(X['loc_y'], 25)

# Replace 20 least common action types with value 'Other'
rare_action_types = X['action_type'].value_counts().sort_values().index.values[:20]
X.loc[X['action_type'].isin(rare_action_types), 'action_type'] = 'Other'

In [6]:
categorial_cols = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_year',
    'game_month', 'opponent', 'loc_x', 'loc_y']

for cc in categorial_cols:
    dummies = pd.get_dummies(X[cc])
    dummies = dummies.add_prefix("{}-".format(cc))
    X.drop(cc, axis=1, inplace=True)
    X = X.join(dummies)

In [7]:
from sklearn.ensemble import RandomForestClassifier
num_trees = 100
num_features = 10

model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)
model.fit(X, y)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
rf_index_list = np.arange(len(model.feature_importances_))

In [11]:
rf_index_list = np.arange(len(model.feature_importances_))
rf_coef_list = []

for index in rf_index_list:
    rf_coef_list.append(model.feature_importances_[index])

rf_coef_list, rf_index_list = (list(t) for t in zip(*sorted(zip(rf_coef_list, rf_index_list), reverse=True)))

important_features = []
for coef, index in zip(rf_coef_list, rf_index_list):
    important_features.append(X.columns[index])

RF_selected_features = important_features[:20]

In [12]:
for feature in RF_selected_features:
    print(feature)

shot_id
shot_distance
action_type-Jump Shot
home_play
period-3
period-1
period-2
period-4
action_type-Layup Shot
combined_shot_type-Dunk
game_month-3
game_month-1
game_month-2
game_month-12
game_month-4
game_month-11
action_type-Driving Layup Shot
loc_x-(-10.96, 8.96]
loc_y-(-10.6, 22.8]
opponent-SAS


In [13]:
RF_X = X[RF_selected_features]
RF_X.head()

Unnamed: 0,shot_id,shot_distance,action_type-Jump Shot,home_play,period-3,period-1,period-2,period-4,action_type-Layup Shot,combined_shot_type-Dunk,game_month-3,game_month-1,game_month-2,game_month-12,game_month-4,game_month-11,action_type-Driving Layup Shot,"loc_x-(-10.96, 8.96]","loc_y-(-10.6, 22.8]",opponent-SAS
1,2,15,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,3,16,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,22,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0
5,6,14,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
from sklearn.model_selection import train_test_split

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(RF_X, y, test_size=0.2, random_state =42)

In [15]:
from sklearn.cross_validation import KFold, cross_val_score

# setting parameters
seed = 7
processors=1
num_folds=5
num_instances=len(X_train)
scoring='log_loss'

kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)



In [16]:
# base model를 활용하기 위한 package
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [25]:
# Prepare some basic models
models = []
models.append(('Tree', DecisionTreeClassifier()))
models.append(('Forest', RandomForestClassifier()))

results_train = []
results_test = []
names = []

for name, model in models:
    cv_results_train = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=processors)
    cv_results_test = cross_val_score(model, X_test, y_test, cv=kfold, n_jobs=processors)
    results_train.append(cv_results_train)
    results_test.append(cv_results_test)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results_train.mean(), cv_results_train.std()))
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results_test.mean(), cv_results_test.std()))   
print('\n')

IndexError: positional indexers are out-of-bounds

In [18]:
# 대표적인 ensemble models
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier

In [28]:
Tree = DecisionTreeClassifier()
num_trees = 100

model = BaggingClassifier(base_estimator=Tree, n_estimators=num_trees, random_state=seed)

tree_results_train = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=processors)


print("Train Result: ({0:.3f}) +/- ({1:.3f})".format(tree_results_train.mean(), tree_results_train.std()))


Train Reslut: (0.622) +/- (0.004)


In [41]:
model.score(model, X_test, y_test, cv=kfold, n_jobs=processors)

TypeError: score() got an unexpected keyword argument 'cv'

In [29]:
num_trees = 100
num_features = 10

model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)

results = cross_val_score(model, X_test, y_test, cv=kfold, n_jobs=processors)
print("Train Result: ({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

IndexError: positional indexers are out-of-bounds

In [34]:
nmc = 50
training_accuracy = []
test_accuracy = []
c_settings = [100, 1, 0.01]


for c in c_settings:
    for i in range(nmc):   
        logreg = LogisticRegression(C=c)
        logreg.fit(X_train, y_train)
        training_accuracy.append(logreg.score(X_train, y_train))
        test_accuracy.append(logreg.score(X_test, y_test))

print(np.mean(training_accuracy))
print(np.std(training_accuracy))
print(np.mean(test_accuracy))
print(np.std(training_accuracy))

0.6564349532195034
0.0003210421409288793
0.6542801556420232
0.0003210421409288793


NameError: name 'yGuess' is not defined

In [30]:
len(X_test)

5140

In [31]:
len(y_test)

5140

In [32]:
len(X_train)

20557

In [33]:
len(X_test)

5140