# Preprocessing

### Data Loading

In [12]:
import pandas as pd

data = pd.read_csv("data.csv")
data = data[data.shot_made_flag.isnull()==False] # 같은 표현 : data = data.dropna()

In [13]:
X = data.drop('shot_made_flag', axis=1)
y=data['shot_made_flag']

# Eliminate columns that we don't need

In [14]:
for col in X.columns:
    print(col)

action_type
combined_shot_type
game_event_id
game_id
lat
loc_x
loc_y
lon
minutes_remaining
period
playoffs
season
seconds_remaining
shot_distance
shot_type
shot_zone_area
shot_zone_basic
shot_zone_range
team_id
team_name
game_date
matchup
opponent
shot_id


In [15]:
X.drop('game_id', axis=1, inplace=True) # Independent
X.drop('game_event_id', axis=1, inplace=True) # Independent

X.drop('lat', axis=1, inplace=True) # Correlated with loc_x
X.drop('lon', axis=1, inplace=True) # Correlated with loc_y

X.drop('team_id', axis=1, inplace=True) # Always one number
X.drop('team_name', axis=1, inplace=True) # Always LA Lakers

### Data Transforming

In [16]:
# Remaining time
X['seconds_from_period_end'] = 60 * X['minutes_remaining'] + X['seconds_remaining']
X['last_5_sec_in_period'] = X['seconds_from_period_end'] < 5

X.drop('minutes_remaining', axis=1, inplace=True)
X.drop('seconds_remaining', axis=1, inplace=True)
X.drop('seconds_from_period_end', axis=1, inplace=True)

## Matchup - (away/home)
X['home_play'] = X['matchup'].str.contains('vs').astype('int')
X.drop('matchup', axis=1, inplace=True)

# Game date
X['game_date'] = pd.to_datetime(X['game_date'])
X['game_year'] = X['game_date'].dt.year
X['game_month'] = X['game_date'].dt.month
X.drop('game_date', axis=1, inplace=True)

# Loc_x, and loc_y binning
X['loc_x'] = pd.cut(X['loc_x'], 25)
X['loc_y'] = pd.cut(X['loc_y'], 25)

# Replace 20 least common action types with value 'Other'
rare_action_types = X['action_type'].value_counts().sort_values().index.values[:20]
X.loc[X['action_type'].isin(rare_action_types), 'action_type'] = 'Other'

### Categorical variables to dummies 

In [17]:
categorial_cols = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_year',
    'game_month', 'opponent', 'loc_x', 'loc_y']

for cc in categorial_cols:
    dummies = pd.get_dummies(X[cc])
    dummies = dummies.add_prefix("{}-".format(cc))
    X.drop(cc, axis=1, inplace=True)
    X = X.join(dummies)

In [18]:
X.head()

Unnamed: 0,playoffs,shot_distance,shot_id,last_5_sec_in_period,home_play,action_type-Alley Oop Dunk Shot,action_type-Alley Oop Layup shot,action_type-Driving Dunk Shot,action_type-Driving Finger Roll Layup Shot,action_type-Driving Finger Roll Shot,...,"loc_y-(457.0, 490.4]","loc_y-(490.4, 523.8]","loc_y-(523.8, 557.2]","loc_y-(557.2, 590.6]","loc_y-(590.6, 624.0]","loc_y-(624.0, 657.4]","loc_y-(657.4, 690.8]","loc_y-(690.8, 724.2]","loc_y-(724.2, 757.6]","loc_y-(757.6, 791.0]"
1,0,15,2,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,16,3,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,22,4,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,5,False,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,14,6,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
from sklearn.model_selection import train_test_split

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =42)

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cross_validation import KFold, cross_val_score


%matplotlib inline



In [None]:
from sklearn.neural_network import MLPClassifier
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('MLP', MLPClassifier()))

In [None]:
models

In [None]:
# Prepare some basic models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('K-NN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('MLP', MLPClassifier()))


results1 = []
names1 = []

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results1.append(cv_results)
    names1.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))   
print('\n')

results2 = []
names2 = []

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results2.append(cv_results)
    names2.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))
print('\n')

results3 = []
names3 = []

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results3.append(cv_results)
    names3.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))
print('\n')    

In [None]:
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results3)
ax.set_xticklabels(names3)
plt.show()

In [None]:
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)

print("Accuracy on training set: {:.2f}".format(mlp.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(mlp.score(X_test, y_test)))

In [None]:
# compute the mean value per feature on the training set
mean_on_train = X_train.mean(axis=0)
# compute the standard deviation of each feature on the training set
std_on_train = X_train.std(axis=0)

# subtract the mean, and scale by inverse standard deviation
# afterward, mean=0 and std=1
X_train_scaled = (X_train - mean_on_train) / std_on_train
# use THE SAME transformation (using training mean and std) on the test set
X_test_scaled = (X_test - mean_on_train) / std_on_train

mlp = MLPClassifier(alpha=3.15)
mlp.fit(X_train_scaled, y_train)

print("Accuracy on training set: {:.3f}".format(mlp.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test_scaled, y_test)))

In [None]:
nmc = 1
training_accuracy = []
test_accuracy = []
mean_training_accuracy = []
mean_test_accuracy = []

for axx, n_hidden_nodes in zip(axes, [10, 100]):
    for ax, alpha in zip(axx, [0.0001, 0.01, 0.1, 1]):
         for i in range(nmc):   
            mlp = MLPClassifier(solver='lbfgs', random_state=0,
                                hidden_layer_sizes=[n_hidden_nodes, n_hidden_nodes],
                                alpha=alpha)
            mlp.fit(X_train, y_train)
            training_accuracy.append(mlp.score(X_train, y_train))
            test_accuracy.append(mlp.score(X_test, y_test))
    mean_training_accuracy.append(np.mean(training_accuracy))
    mean_test_accuracy.append(np.mean(test_accuracy))
        
        
plt.plot((n_hidden_nodes, alpha), training_accuracy, label="Train Accuracy")
plt.plot((n_hidden_nodes, alpha), test_accuracy, label="Test Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("(n_hidden_nodes, alpha)")
plt.legend()

In [None]:
nmc = 1
training_accuracy = []
test_accuracy = []
mean_training_accuracy = []
mean_test_accuracy = []
hidden_layer_settings = [10, 100]
alpha_settings =  [0.0001, 0.01, 0.1, 1]

for n_hidden_nodes in hidden_layer_settings:
    for alpha in alpha_settings:
         for i in range(nmc):   
            mlp = MLPClassifier(solver='lbfgs', random_state=0,
                                hidden_layer_sizes=[n_hidden_nodes, n_hidden_nodes],
                                alpha=alpha)
            mlp.fit(X_train, y_train)
            training_accuracy.append(mlp.score(X_train, y_train))
            test_accuracy.append(mlp.score(X_test, y_test))
    mean_training_accuracy.append(np.mean(training_accuracy))
    mean_test_accuracy.append(np.mean(test_accuracy))
        
        
plt.plot((n_hidden_nodes, alpha), training_accuracy, label="Train Accuracy")
plt.plot((n_hidden_nodes, alpha), test_accuracy, label="Test Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("(n_hidden_nodes, alpha)")
plt.legend()

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(20, 8))
for axx, n_hidden_nodes in zip(axes, [10, 100]):
    for ax, alpha in zip(axx, [0.0001, 0.01, 0.1, 1]):
        mlp = MLPClassifier(solver='lbfgs', random_state=0,
                            hidden_layer_sizes=[n_hidden_nodes, n_hidden_nodes],
                            alpha=alpha)
        mlp.fit(X_train, y_train)
        plt.plot_2d_separator(mlp, X_train, fill=True, alpha=.3, ax=ax)
        plt.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=ax)
        ax.set_title("n_hidden=[{}, {}]\nalpha={:.4f}".format(
                      n_hidden_nodes, n_hidden_nodes, alpha))

In [None]:
training_accuracy = []
test_accuracy = []

fig, axes = plt.subplots(2, 4, figsize=(20, 8))
for axx, n_hidden_nodes in zip(axes, [10, 100]):
    for ax, alpha in zip(axx, [0.0001, 0.01, 0.1, 1]):
        mlp = MLPClassifier(solver='lbfgs', random_state=0,
                            hidden_layer_sizes=[n_hidden_nodes, n_hidden_nodes],
                            alpha=alpha)
        mlp.fit(X_train, y_train)
        training_accuracy.append(mlp.score(X_train, y_train))
        test_accuracy.append(mlp.score(X_test, y_test))
        mean_training_accuracy.append(np.mean(training_accuracy))
        mean_test_accuracy.append(np.mean(test_accuracy))
        ax.set_title("n_hidden=[{}, {}]\nalpha={:.4f}".format(
                      n_hidden_nodes, n_hidden_nodes, alpha))
        
plt.semilogx(c_settings, mean_training_accuracy, label="Train Accuracy")
plt.semilogx(c_settings, mean_test_accuracy, label="Test Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("C")
plt.legend()



In [None]:
import mglearn

nmc = 50

fig, axes = plt.subplots(2, 4, figsize=(20, 8))
for axx, n_hidden_nodes in zip(axes, [10, 100]):
    for ax, alpha in zip(axx, [0.0001, 0.01, 0.1, 1]):
        for i in range(nmc):   
            mlp = MLPClassifier(solver='lbfgs', random_state=0,
                                hidden_layer_sizes=[n_hidden_nodes, n_hidden_nodes],
                                alpha=alpha)
            mlp.fit(X_train, y_train)
            mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3, ax=ax)
            mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=ax)
            ax.set_title("n_hidden=[{}, {}]\nalpha={:.4f}".format(
                          n_hidden_nodes, n_hidden_nodes, alpha))

In [9]:
from pprint import pprint
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc

  from numpy.core.umath_tests import inner1d


In [10]:
# C가 클수록 weak regularization
penalty_set = ['l1']
C_set = [0.1]

In [11]:
result1 = []
for penalty in penalty_set:
    for C in C_set:
        logreg_model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced', multi_class="multinomial", solver='saga', max_iter=10000)
        logreg_model = logreg_model.fit(X_train, y_train)
#         Y_val_score = model.decision_function(train_X_val)
        y_test_score = logreg_model.predict_proba(X_test)[:, 1]
        val_proba = "{:.4f}".format(logreg_model.score(X_test, y_test))
        fpr, tpr, _ = roc_curve(y_test, y_test_score, pos_label=True)
        result1.append((logreg_model, penalty, C, val_proba, auc(fpr, tpr)))

KeyboardInterrupt: 

In [None]:
result1

In [None]:
logreg_result = sorted(result1, key=lambda x: x[3], reverse=True)

In [None]:
logreg_result

# Best Model Result

In [None]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

## MAE value of Best Model

In [None]:
best_logreg_model = best_logreg_result[0]
best_logreg_model = best_logreg_model.fit(X_train, y_train)
print(metrics.mean_absolute_error(best_logreg_model.predict(X_test), y_test))

In [None]:
# predict_proba 결과 중 앞부분 6개에 대해서만 확인한다.
print("예측 확률:\n{}".format(best_logreg_model.predict_proba(X_test)[:6]))

# 행 방향으로 확률을 더하면 모두 1이 된다.
print("합: {}".format(best_logreg_model.predict_proba(X_test)[:6].sum(axis=1)))

## Based on predict_proba result, perform prediction with argmax function

In [None]:
print("가장 큰 예측 확률의 인덱스:\n{}".format(np.argmax(best_logreg_model.predict_proba(X_test), axis=1)))
print("예측:\n{}".format(best_logreg_model.predict(X_test)))

In [None]:
print("훈련 데이터에 있는 클래스 종류: {}".format(best_logreg_model.classes_))
argmax_dec_func = np.argmax(best_logreg_model.decision_function(X_train), axis=1)
print("가장 큰 결정 함수의 인덱스: {}".format(argmax_dec_func[:10]))
print("인덱스를 classses_에 연결: {}".format(best_logreg_model.classes_[argmax_dec_func][:10]))
print("Validation set의 예측: {}".format(best_knn_model.predict(X_test)[:10]))
print("실제 Validation set: {}".format(y_test[:10]))
print("Validation Set의 정확도: {:.2f}".format(best_logreg_model.score(X_test, y_test)))

In [21]:
from sklearn.ensemble import RandomForestClassifier
num_trees = 100
num_features = 10

model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
rf_index_list = np.arange(len(model.feature_importances_))

In [23]:
rf_index_list = np.arange(len(model.feature_importances_))
rf_coef_list = []

for index in rf_index_list:
    rf_coef_list.append(model.feature_importances_[index])

rf_coef_list, rf_index_list = (list(t) for t in zip(*sorted(zip(rf_coef_list, rf_index_list), reverse=True)))

important_features = []
for coef, index in zip(rf_coef_list, rf_index_list):
    important_features.append(X.columns[index])

RF_selected_features = important_features[:20]

In [24]:
for feature in RF_selected_features:
    print(feature)

shot_id
shot_distance
action_type-Jump Shot
home_play
period-3
period-1
period-2
period-4
action_type-Layup Shot
game_month-3
game_month-1
combined_shot_type-Dunk
game_month-4
game_month-2
game_month-12
game_month-11
action_type-Driving Layup Shot
loc_y-(-10.6, 22.8]
action_type-Running Jump Shot
loc_x-(-10.96, 8.96]


In [25]:
RF_X = X[RF_selected_features]
RF_X.head()

Unnamed: 0,shot_id,shot_distance,action_type-Jump Shot,home_play,period-3,period-1,period-2,period-4,action_type-Layup Shot,game_month-3,game_month-1,combined_shot_type-Dunk,game_month-4,game_month-2,game_month-12,game_month-11,action_type-Driving Layup Shot,"loc_y-(-10.6, 22.8]",action_type-Running Jump Shot,"loc_x-(-10.96, 8.96]"
1,2,15,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,16,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,22,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1
5,6,14,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
from sklearn.cross_validation import KFold, cross_val_score

# setting parameters
seed = 7
processors=1
num_folds=5
num_instances=len(RF_X)
scoring='log_loss'

kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)

In [28]:
# 대표적인 ensemble models
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier

In [29]:
cart = DecisionTreeClassifier()
num_trees = 100

model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)

results = cross_val_score(model, RF_X, y, cv=kfold, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

(0.628) +/- (0.009)


In [30]:
num_trees = 100
num_features = 10

model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)

results = cross_val_score(model, RF_X, y, cv=kfold, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

(0.630) +/- (0.010)


In [None]:
class_weight_set = [None, 'balanced']
max_depth_set = [3, 4, 5, 6, 7]

result_tree = []

for class_weight in class_weight_set:
    for max_depth in max_depth_set:
        dt_model = DecisionTreeClassifier(class_weight=class_weight, max_depth=max_depth)
        dt_model = dt_model.fit(train_X_train, train_Y_train)
        Y_val_score = dt_model.predict_proba(train_X_val)[:, 1]
        val_proba = "{:.4f}".format(dt_model.score(train_X_val, train_Y_val))
        fpr, tpr, _ = roc_curve(train_Y_val, Y_val_score, pos_label=True)
        result_tree.append((dt_model, class_weight, max_depth, val_proba, auc(fpr, tpr)))