In [1]:
import pandas as pd
pd_data = pd.read_csv('./train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
pd_data = pd_data[['PlayerHeight', 'PlayerWeight', 'DefendersInTheBox', 'Distance', 'Quarter',
                  'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'NflId', 'GameId',
                  'YardLine', 'Down', 'Temperature', 'Yards']]

In [3]:
def height_to_numerical(height):
    feet = height.split('-')[0]
    inches = height.split('-')[1]
    return int(feet)*12 + int(inches)

In [4]:
pd_data['PlayerHeight'] = pd_data['PlayerHeight'].apply(height_to_numerical)

In [5]:
import cudf
cudf_data = cudf.from_pandas(pd_data)
print(type(cudf_data))

<class 'cudf.dataframe.dataframe.DataFrame'>


In [6]:
cleaned_data = pd_data.to_csv('./cleaned_data.csv')
import dask_cudf
dask_data = dask_cudf.read_csv('./cleaned_data.csv')
print(type(dask_data))

<class 'dask_cudf.core.DataFrame'>


<h1> Advantage RAPIDS (cudf) </h1>

In [7]:
%%timeit
pd_data['PlayerHeight'].mean()

1.27 ms ± 1.98 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
%%timeit
cudf_data['PlayerHeight'].mean()

112 µs ± 273 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [27]:
%%timeit
dask_data['PlayerHeight'].mean()

1.55 ms ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
%%timeit
pd_data['DefendersInTheBox_vs_Distance'] = pd_data['DefendersInTheBox'] / pd_data['Distance']

1.58 ms ± 1.46 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
%%timeit
cudf_data['DefendersInTheBox_vs_Distance'] = cudf_data['DefendersInTheBox'] / cudf_data['Distance']

1.29 ms ± 4.76 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [28]:
%%timeit
dask_data['DefendersInTheBox_vs_Distance'] = dask_data['DefendersInTheBox'] / dask_data['Distance']

50 ms ± 481 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


<h1> Advantage Pandas (pd) </h1>

In [11]:
%%timeit
pd_data['BMI'] = 703 * (pd_data['PlayerWeight']/pd_data['PlayerHeight']**2)

2.83 ms ± 7.41 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit
cudf_data['BMI'] = 703 * (cudf_data['PlayerWeight']/cudf_data['PlayerHeight']**2)

4.97 ms ± 228 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
pd_data['Margin'] = pd_data['HomeScoreBeforePlay'] - pd_data['VisitorScoreBeforePlay']

1.24 ms ± 1.81 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit
cudf_data['Margin'] = cudf_data['HomeScoreBeforePlay'] - cudf_data['VisitorScoreBeforePlay']

1.44 ms ± 6.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
%%timeit
pd_data['Urgency'] = pd_data['Quarter'] * pd_data['Margin']

1.27 ms ± 662 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%%timeit
cudf_data['Urgency'] = cudf_data['Quarter'] * cudf_data['Margin']

1.43 ms ± 2.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
%%time
for col in pd_data:
    pd_data[col] = pd_data[col] / pd_data[col].max()

CPU times: user 64.6 ms, sys: 8.07 ms, total: 72.6 ms
Wall time: 71.2 ms


In [18]:
%%time
for col in cudf_data:
    cudf_data[col] = cudf_data[col] / cudf_data[col].max()

CPU times: user 155 ms, sys: 23.9 ms, total: 179 ms
Wall time: 179 ms


In [19]:
pd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509762 entries, 0 to 509761
Data columns (total 17 columns):
PlayerHeight                     509762 non-null float64
PlayerWeight                     509762 non-null float64
DefendersInTheBox                509696 non-null float64
Distance                         509762 non-null float64
Quarter                          509762 non-null float64
HomeScoreBeforePlay              509762 non-null float64
VisitorScoreBeforePlay           509762 non-null float64
NflId                            509762 non-null float64
GameId                           509762 non-null float64
YardLine                         509762 non-null float64
Down                             509762 non-null float64
Temperature                      461230 non-null float64
Yards                            509762 non-null float64
DefendersInTheBox_vs_Distance    509696 non-null float64
BMI                              509762 non-null float64
Margin                           5

In [20]:
import numpy as np
X = np.array(pd_data.drop(columns='Yards'))
y = pd_data['Yards']
print(y.value_counts())
y = np.array(y)

 0.020202    65692
 0.010101    60544
 0.030303    58652
 0.040404    51502
 0.000000    47190
 0.050505    38742
 0.060606    26862
-0.010101    22792
 0.070707    20020
-0.020202    14674
 0.080808    14652
 0.090909    14476
 0.111111     8558
-0.030303     8338
 0.101010     7194
-0.040404     5566
 0.121212     5236
 0.141414     4576
 0.131313     4246
 0.151515     3300
 0.161616     3036
-0.050505     2486
 0.171717     2398
 0.181818     1980
 0.212121     1782
 0.202020     1628
 0.191919     1386
-0.060606     1166
 0.252525      836
 0.232323      836
             ...  
 0.616162       66
 0.696970       66
 0.474747       66
-0.111111       44
 0.393939       44
 0.494949       44
 0.666667       44
 0.606061       44
 0.707071       44
 0.909091       44
 0.484848       44
 0.717172       44
 0.444444       44
 0.727273       44
 0.595960       44
 0.575758       44
 0.656566       44
 0.777778       22
 1.000000       22
 0.676768       22
 0.787879       22
 0.878788   

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
import tensorflow.keras as keras

In [23]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(256, activation='relu'))
model.add(keras.layers.Dense(512, activation='relu'))
model.add(keras.layers.Dense(256, activation='relu'))
model.add(keras.layers.Dense(1, activation='tanh'))

In [24]:
from tensorflow.keras.optimizers import SGD
model.compile(optimizer=SGD(lr=0.001),
             loss='mse')

In [25]:
model.fit(X_train, y_train, epochs=10)

Train on 407809 samples
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f03847a3518>

In [26]:
import warnings
warnings.filterwarnings('ignore')
model.evaluate(X_test, y_test)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0.004258217414762187