In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Kobe Bryant  shot selection

Cсылка на соревнование: https://www.kaggle.com/c/kobe-bryant-shot-selection

Goal: Fun and education

Using 20 years of data on Kobe's swishes and misses, can you predict which shots will find the bottom of the net? This competition is well suited for practicing classification basics, feature engineering, and time series analysis. Practice got Kobe an eight-figure contract and 5 championship rings. What will it get you?

This data contains the location and circumstances of every field goal attempted by Kobe Bryant took during his 20-year career. Your task is to predict whether the basket went in (shot_made_flag).

We have removed 5000 of the shot_made_flags (represented as missing values in the csv file). These are the test set shots for which you must submit a prediction. You are provided a sample submission file with the correct shot_ids needed for a valid prediction.

In [110]:
data = pd.read_csv('Kobe.csv')

In [111]:
data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,34,167,72,-120.0,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34,-157,0,-120.0,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,34,-101,135,-120.0,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,34,138,175,-120.0,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34,0,0,-120.0,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [112]:
target = 'shot_made_flag'

**Задания:**

1. Провести анализ данных. Много хороших примеров анализа можно посмотреть здесь https://www.kaggle.com/c/kobe-bryant-shot-selection/kernels
2. Подготовить фичи для обучения модели - нагенерить признаков, обработать пропущенные значения, проверить на возможные выбросы, обработать категориальные признаки и др.
3. Обучить линейную модель, Lasso, Ridge на тех же признаках - построить сравнительную таблицу коэффициентов, сделать заключения о том, как меняется величина коэффициентов, какие зануляются. Посчитать RSS

**Дополнительно**
4. Сравнить результаты на тестовом наборе данных - сделать train_test_split в самом начале, подготовить переменные, сравнить результаты работы классификаторов (те же 3), метрика ROC AUC
5. Построить PCA на подготовленных признаках, посмотреть, какие компоненты составляют наибольшую часть дисперсии целевой переменной

In [113]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 25 columns):
action_type           30697 non-null object
combined_shot_type    30697 non-null object
game_event_id         30697 non-null int64
game_id               30697 non-null int64
lat                   30697 non-null float64
loc_x                 30697 non-null int64
loc_y                 30697 non-null int64
lon                   30697 non-null float64
minutes_remaining     30697 non-null int64
period                30697 non-null int64
playoffs              30697 non-null int64
season                30697 non-null object
seconds_remaining     30697 non-null int64
shot_distance         30697 non-null int64
shot_made_flag        25697 non-null float64
shot_type             30697 non-null object
shot_zone_area        30697 non-null object
shot_zone_basic       30697 non-null object
shot_zone_range       30697 non-null object
team_id               30697 non-null int64
team_name         

In [114]:
data['remaining_time'] = data['minutes_remaining'] * 60 + data['seconds_remaining']

In [115]:
data['season'] = data['season'].apply(lambda x: int(x.split('-')[1]) )

In [116]:
data['season'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 97,
       98, 99,  0])

In [117]:
data['shot_type'].unique()

array(['2PT Field Goal', '3PT Field Goal'], dtype=object)

In [118]:
data['shot_zone_area'].unique()

array(['Right Side(R)', 'Left Side(L)', 'Left Side Center(LC)',
       'Right Side Center(RC)', 'Center(C)', 'Back Court(BC)'],
      dtype=object)

In [119]:
data['shot_zone_basic'].unique()

array(['Mid-Range', 'Restricted Area', 'In The Paint (Non-RA)',
       'Above the Break 3', 'Right Corner 3', 'Backcourt',
       'Left Corner 3'], dtype=object)

In [120]:
data['shot_zone_range'].unique()

array(['16-24 ft.', '8-16 ft.', 'Less Than 8 ft.', '24+ ft.',
       'Back Court Shot'], dtype=object)

In [121]:
data[['season','shot_made_flag']].groupby(['season'])['shot_made_flag'].agg('count')

season
0     1312
1     1575
2     1708
3     1852
4     1371
5     1127
6     1924
7     1579
8     1819
9     1851
10    1772
11    1521
12    1416
13    1328
14      59
15     593
16     932
97     383
98     810
99     765
Name: shot_made_flag, dtype: int64

In [122]:
one = data[data['shot_made_flag'] == 1.0 ]

In [123]:
one['shot_made_flag'].value_counts()

1    11465
Name: shot_made_flag, dtype: int64

In [124]:
one[['season','shot_made_flag']].groupby(['season']).agg('count').sort_values(by=['shot_made_flag'],ascending = False)

Unnamed: 0_level_0,shot_made_flag
season,Unnamed: 1_level_1
6,873
9,866
8,852
3,808
10,804
2,783
1,735
7,723
11,679
13,608


In [125]:
noshot = data[data['shot_made_flag'] == 0.0 ] 

In [126]:
noshot[['season','shot_made_flag']].groupby(['season']).agg('count').sort_values(by=['shot_made_flag'],ascending = False)

Unnamed: 0_level_0,shot_made_flag
season,Unnamed: 1_level_1
6,1051
3,1044
9,985
10,968
8,967
2,925
7,856
11,842
1,840
12,813


In [127]:
drops = ['action_type','shot_id', 'team_id', 'team_name', 'opponent',\
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
          'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
for drop in drops:
    data = data.drop(drop, 1)

In [128]:
data.head()

Unnamed: 0,combined_shot_type,period,playoffs,season,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,remaining_time
0,Jump Shot,1,0,1,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,627
1,Jump Shot,1,0,1,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,622
2,Jump Shot,1,0,1,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,465
3,Jump Shot,1,0,1,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,412
4,Dunk,2,0,1,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,379


In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 11 columns):
combined_shot_type    30697 non-null object
period                30697 non-null int64
playoffs              30697 non-null int64
season                30697 non-null int64
shot_distance         30697 non-null int64
shot_made_flag        25697 non-null float64
shot_type             30697 non-null object
shot_zone_area        30697 non-null object
shot_zone_basic       30697 non-null object
shot_zone_range       30697 non-null object
remaining_time        30697 non-null int64
dtypes: float64(1), int64(5), object(5)
memory usage: 2.6+ MB


In [130]:
filter_season = data[(data['season'] <= 13 ) & (data['season'] >= 6)]

In [131]:
filter_season[['season','shot_made_flag']].groupby(['season'])['shot_made_flag'].agg('count')

season
6     1924
7     1579
8     1819
9     1851
10    1772
11    1521
12    1416
13    1328
Name: shot_made_flag, dtype: int64

In [132]:
filter_season['shot_made_flag'].value_counts()

0    7202
1    6008
Name: shot_made_flag, dtype: int64

In [133]:
test=filter_season[filter_season['shot_made_flag'].isnull()]
test.shape

(2525, 11)

In [134]:
train=filter_season[filter_season['shot_made_flag'].notnull()]
train.shape

(13210, 11)

In [135]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)


In [136]:
train = MultiColumnLabelEncoder(columns = ['combined_shot_type', 'shot_type', 'shot_zone_basic', 'shot_zone_range', 'shot_zone_area']).fit_transform(train)

In [137]:
train.head()

Unnamed: 0,combined_shot_type,period,playoffs,season,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,remaining_time
7533,4,1,0,6,0,0,0,1,5,4,643
7534,3,1,0,6,21,1,0,4,4,0,596
7536,3,1,0,6,16,1,0,5,4,0,452
7539,4,1,0,6,0,1,0,1,5,4,230
7540,4,2,0,6,0,1,0,1,5,4,463


In [138]:
test = MultiColumnLabelEncoder(columns = ['combined_shot_type', 'shot_type', 'shot_zone_basic', 'shot_zone_range', 'shot_zone_area']).fit_transform(test)

In [139]:
from sklearn.linear_model import LinearRegression

In [140]:
len(predictors)

10

In [141]:
def linear_regression(data, predictors):
    # predictors:
    predictors=predictors
    
    # fit
    linreg = LinearRegression(normalize=True)
    linreg.fit(data[predictors], data[target])
    y_pred = linreg.predict(data[predictors])
       
    # Return the result in pre-defined format
    rss = sum((y_pred - data[target]) ** 2)
    ret = [rss]
    ret.extend([linreg.intercept_])
    ret.extend(linreg.coef_)
    return ret

In [142]:
#Initialize a dataframe to store the results:
col = ['rss', 'intercept'] + ['coef_x_%d' % i for i in range(1, 10)]
ind = ['model_pow_%d' % i for i in range(1, 10)]
coef_matrix_simple = pd.DataFrame(index=ind, columns=col)

#Iterate through all powers and assimilate results
for i in range(1, 10):
    coef_matrix_simple.iloc[i - 1, 0:i + 2] = linear_regression(
        train,
        predictors=predictors[:i]
    )

In [143]:
# Set the display float format
pd.options.display.float_format = '{:,.2g}'.format
coef_matrix_simple

Unnamed: 0,rss,intercept,coef_x_1,coef_x_2,coef_x_3,coef_x_4,coef_x_5,coef_x_6,coef_x_7,coef_x_8,coef_x_9,coef_x_10
model_pow_1,3300.0,0.61,-0.049,,,,,,,,,
model_pow_2,3300.0,0.65,-0.05,-0.017,,,,,,,,
model_pow_3,3300.0,0.65,-0.05,-0.017,-0.00084,,,,,,,
model_pow_4,3300.0,0.68,-0.05,-0.017,-0.00084,-0.0028,,,,,,
model_pow_5,3100.0,0.97,-0.092,-0.011,0.00034,-0.0041,-0.012,,,,,
model_pow_6,3100.0,0.98,-0.094,-0.011,0.00045,-0.0041,-0.013,0.026,,,,
model_pow_7,3100.0,1.0,-0.095,-0.012,-9.9e-05,-0.0043,-0.012,0.015,-0.0091,,,
model_pow_8,3100.0,0.93,-0.1,-0.012,0.0015,-0.004,-0.011,0.079,-0.013,0.023,,
model_pow_9,3100.0,0.99,-0.1,-0.012,0.0017,-0.0038,-0.013,0.1,-0.014,0.022,-0.013,


In [144]:
from sklearn.linear_model import Ridge

In [145]:
def ridge_regression(data, predictors, alpha):
    # Fit
    ridgereg = Ridge(alpha=alpha, normalize=True)
    ridgereg.fit(data[predictors], data[target])
    y_pred = ridgereg.predict(data[predictors])
    
    # format
    rss = sum((y_pred-data[target]) ** 2)
    ret = [rss]
    ret.extend([ridgereg.intercept_])
    ret.extend(ridgereg.coef_)
    return ret

In [146]:
# different alphas
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]

# create dataframe to save coefficients
col = ['rss', 'intercept'] + ['coef_x_%d' % i for i in range(1, 11)]
ind = ['alpha_%.2g' % alpha_ridge[i] for i in range(0, 10)]
coef_matrix_ridge = pd.DataFrame(index=ind, columns=col)

for i in range(10):
    coef_matrix_ridge.iloc[i, ] = ridge_regression(train, predictors, alpha_ridge[i])
plt.show()

In [147]:
# Set the display float format
pd.options.display.float_format = '{:,.2g}'.format
coef_matrix_ridge

Unnamed: 0,rss,intercept,coef_x_1,coef_x_2,coef_x_3,coef_x_4,coef_x_5,coef_x_6,coef_x_7,coef_x_8,coef_x_9,coef_x_10
alpha_1e-15,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_1e-10,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_1e-08,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_0.0001,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_0.001,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.099,-0.014,0.022,-0.012,4.1e-05
alpha_0.01,3100.0,0.94,-0.1,-0.011,0.0014,-0.0037,-0.012,0.084,-0.014,0.022,-0.0064,4.2e-05
alpha_1,3100.0,0.62,-0.04,-0.0062,-0.00011,-0.0017,-0.0032,-0.029,-0.0091,0.01,0.014,2.6e-05
alpha_5,3200.0,0.5,-0.011,-0.0024,-7.9e-05,-0.00048,-0.0014,-0.018,-0.0043,0.0048,0.0064,1.1e-05
alpha_10,3200.0,0.48,-0.0053,-0.0014,-5.2e-05,-0.00025,-0.00084,-0.012,-0.0026,0.0029,0.0039,6.1e-06
alpha_20,3200.0,0.47,-0.0026,-0.00077,-3.1e-05,-0.00012,-0.00047,-0.0067,-0.0014,0.0016,0.0022,3.3e-06


In [148]:
coef_matrix_ridge.apply(lambda x: sum(x.values == 0), axis=1)

alpha_1e-15     0
alpha_1e-10     0
alpha_1e-08     0
alpha_0.0001    0
alpha_0.001     0
alpha_0.01      0
alpha_1         0
alpha_5         0
alpha_10        0
alpha_20        0
dtype: int64

In [149]:
from sklearn.linear_model import Lasso

def lasso_regression(data, predictors, alpha, models_to_plot={}):
    # fit
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    lassoreg.fit(data[predictors], data[target])
    y_pred = lassoreg.predict(data[predictors])
     
    # format
    rss = sum((y_pred-data[target]) ** 2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret

In [150]:
# alphas
alpha_lasso = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]

# create dataframe to store coefficients
col = ['rss', 'intercept'] + ['coef_x_%d' % i for i in range(1, 11)]
ind = ['alpha_%.2g' % alpha_lasso[i] for i in range(0, 10)]
coef_matrix_lasso = pd.DataFrame(index=ind, columns=col)

#Iterate over the 10 alpha values:
for i in range(10):
    coef_matrix_lasso.iloc[i, ] = lasso_regression(train, predictors, alpha_lasso[i])

In [151]:
# Set the display float format
pd.options.display.float_format = '{:,.2g}'.format
coef_matrix_lasso

Unnamed: 0,rss,intercept,coef_x_1,coef_x_2,coef_x_3,coef_x_4,coef_x_5,coef_x_6,coef_x_7,coef_x_8,coef_x_9,coef_x_10
alpha_1e-15,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_1e-10,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_1e-08,3100.0,0.97,-0.1,-0.011,0.0016,-0.0036,-0.013,0.1,-0.015,0.022,-0.013,4.1e-05
alpha_1e-05,3100.0,0.9,-0.099,-0.01,0.0,-0.0033,-0.01,0.065,-0.013,0.02,-0.00034,3.7e-05
alpha_0.0001,3100.0,0.8,-0.069,-0.0018,0.0,-0.0,-0.0093,-0.0,-0.0054,0.0061,0.0,0.0
alpha_0.001,3300.0,0.45,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
alpha_0.01,3300.0,0.45,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
alpha_1,3300.0,0.45,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
alpha_5,3300.0,0.45,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
alpha_10,3300.0,0.45,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0


In [152]:
coef_matrix_lasso.apply(lambda x: sum(x.values == 0), axis=1)

alpha_1e-15      0
alpha_1e-10      0
alpha_1e-08      0
alpha_1e-05      1
alpha_0.0001     5
alpha_0.001     10
alpha_0.01      10
alpha_1         10
alpha_5         10
alpha_10        10
dtype: int64