In [159]:
import pandas as pd
import numpy as np
import csv
import re

In [160]:
# Read the game-log CSV (path is relative to the current working directory)
# into a DataFrame that every later cell works from.
data = pd.read_csv("./warriors_points_data.csv")

In [161]:
# Show the first few rows as a quick check that the file loaded as expected.
data.head()

Unnamed: 0,Player,Rk,G,Date,Age,Tm,Home,Opp,Unnamed: 8,GS,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,Stephen Curry,1,1.0,10/25/16,28-225,GSW,,SAS,L (-29),1,...,3,3,4,0,0,4,1,26,15.9,-9
1,Stephen Curry,2,2.0,10/28/16,28-228,GSW,@,NOP,W (+8),1,...,0,1,8,1,0,4,3,23,14.6,4
2,Stephen Curry,3,3.0,10/30/16,28-230,GSW,@,PHO,W (+6),1,...,1,1,3,0,0,1,5,28,19.1,2
3,Stephen Curry,4,4.0,11/1/16,28-232,GSW,@,POR,W (+23),1,...,3,4,3,1,0,4,3,28,16.4,24
4,Stephen Curry,5,5.0,11/3/16,28-234,GSW,,OKC,W (+26),1,...,1,1,7,2,1,1,1,21,19.7,16


In [162]:
# Some rows hold a status string where a number or clock value is expected.
# Give those rows zero points and a placeholder minutes value.
NOT_PLAYED = ["Inactive", "Did Not Dress"]
data.loc[data["PTS"].isin(NOT_PLAYED), "PTS"] = 0
# "0:" is a stand-in that matches the "<digits>:" pattern parseMinutes looks
# for, so these rows come out as 0 minutes (admittedly hacky).
data.loc[data["MP"].isin(NOT_PLAYED), "MP"] = "0:"

In [163]:
# Parse out minutes played from MP column
def parseMinutes(row):
    """Return the whole minutes played from a row's "MP" value.

    The "MP" value is expected to start with digits followed by a colon
    (for example "34:12" or the "0:" placeholder). Only the digits before
    the colon are used, so the part after the colon is dropped, not rounded.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row["MP"]``.

    Returns
    -------
    int or None
        The minutes as an int, or None when "MP" is not a string or does
        not start with ``<digits>:``. In that case the row is printed so
        the offending record can be inspected.
    """
    clock = row["MP"]
    # A non-string value (a float NaN, for instance) would make re.match
    # raise TypeError, so it is treated as unparsable instead of crashing.
    mr = re.match("([0-9]+):", clock) if isinstance(clock, str) else None
    if mr is None:
        # Best effort: report the row and carry on rather than abort the apply.
        print(row)
        return None
    return int(mr.group(1))

# Run parseMinutes once per row (axis=1); the result is kept in its own
# variable here and attached to the frame in a later cell.
minsPlayed = data.apply(parseMinutes, axis=1)

In [164]:
# Convert PTS to a numeric dtype; any value that still cannot be parsed as a
# number becomes NaN because of errors="coerce".
data["PTS"] = pd.to_numeric(data["PTS"], errors="coerce")
# Store the parsed whole minutes next to the raw columns.
data["MinutesPlayed"] = minsPlayed

In [165]:
def pointsPerMinute(row):
    """Compute points scored per minute played for a single row.

    Reads "PTS" and "MinutesPlayed" from ``row``. When the minutes are zero
    the result is 0, which avoids dividing by zero; otherwise both values
    are converted to float and divided.
    """
    pts, mins = row["PTS"], row["MinutesPlayed"]
    return 0 if mins == 0 else float(pts) / float(mins)

In [166]:
# Add a points-per-minute column by applying pointsPerMinute to each row.
data["PPM"] = data.apply(pointsPerMinute, axis=1)

In [167]:
# Count rows per player to see whether every player has the same number of games.
data["Player"].value_counts()

Klay Thompson     50
Stephen Curry     50
Kevin Durant      50
Draymond Green    49
Name: Player, dtype: int64

In [168]:
# Summary statistics of points-per-minute over all rows, as a sanity check.
data["PPM"].describe()

count    199.000000
mean       0.618419
std        0.296565
min        0.000000
25%        0.427069
50%        0.625000
75%        0.785831
max        2.068966
Name: PPM, dtype: float64

In [169]:
def _rows_for(player_name):
    """Select the rows of ``data`` whose Player column equals ``player_name``."""
    return data[data["Player"] == player_name]


# One frame per player, each keeping the row labels it had in ``data``.
scurry = _rows_for("Stephen Curry")
kthompson = _rows_for("Klay Thompson")
kdurant = _rows_for("Kevin Durant")
dgreen = _rows_for("Draymond Green")

In [170]:
# Draymond Green has one row fewer than the others; print every date that
# appears for Stephen Curry but not for Green to find the missing game.
unique_dates_scurry = scurry["Date"].unique()
unique_dates_dgreen = dgreen["Date"].unique()
for game_date in unique_dates_scurry:
    if game_date in unique_dates_dgreen:
        continue
    print(game_date)

2/2/17


In [171]:
# Keep only the games Draymond Green also has a row for, so that all four
# players cover the same dates. Selecting by date instead of slicing off the
# last row does not depend on the missing game being the final one, and
# re-running this cell cannot drop additional rows.
dgreen_dates = dgreen["Date"]
scurry = scurry[scurry["Date"].isin(dgreen_dates)]
kthompson = kthompson[kthompson["Date"].isin(dgreen_dates)]
kdurant = kdurant[kdurant["Date"].isin(dgreen_dates)]

In [138]:
# Take Curry's game dates with a fresh 0..n-1 index. The DataFrame built
# later aligns its columns by index, and the per-player PPM series are
# renumbered from 0, so the dates must be too; keeping the original row labels
# only lines up when Curry's rows happen to come first in ``data``.
dates = scurry["Date"].reset_index(drop=True)

In [191]:
# Pull each player's points-per-minute column and renumber it from zero so the
# four series share the same index.
scurry_final = scurry["PPM"].reset_index(drop=True)
kthompson_final = kthompson["PPM"].reset_index(drop=True)
kdurant_final = kdurant["PPM"].reset_index(drop=True)
dgreen_final = dgreen["PPM"].reset_index(drop=True)

0     0.750000
1     0.769231
2     1.000000
3     0.666667
4     1.258065
5     0.771429
6     0.611111
7     0.800000
8     0.620690
9     0.783784
10    0.857143
11    0.718750
12    0.942857
13    0.518519
14    0.933333
15    0.878788
16    0.736842
17    0.694444
18    0.795918
19    0.625000
20    0.909091
21    0.516129
22    0.583333
23    0.700000
24    0.594595
25    0.729730
26    0.454545
27    1.133333
28    0.916667
29    0.787879
30    1.032258
31    0.947368
32    0.578947
33    0.542857
34    0.617647
35    0.810811
36    0.642857
37    0.777778
38    0.777778
39    0.806452
40    0.750000
41    1.212121
42    0.969697
43    0.517241
44    0.750000
45    0.916667
46    0.821429
47    0.916667
48    0.562500
Name: PPM, dtype: float64

In [192]:
# Column name -> series; the constructor lines the series up by index label.
columns_by_name = {
    "Date": dates,
    "Stephen Curry": scurry_final,
    "Klay Thompson": kthompson_final,
    "Kevin Durant": kdurant_final,
    "Draymond Green": dgreen_final,
}
finalDF = pd.DataFrame(columns_by_name)

In [193]:
# Write the combined table to a comma-separated file in the working directory.
# NOTE(review): no index= argument is passed, so pandas' default applies —
# confirm whether the row-index column is wanted in the output file.
finalDF.to_csv("formatted_data.csv", sep=",")

149    0.333333
150    0.736842
151    0.350000
152    0.466667
153    0.562500
154    0.270270
155    0.631579
156    0.714286
157    0.612903
158    0.909091
159    0.483871
160    0.933333
161    0.783784
162    0.961538
163    1.130435
164    0.545455
165    0.605263
166    0.540541
167    0.319149
168    0.787879
169    2.068966
170    0.685714
171    0.370370
172    0.258065
173    0.789474
174    0.459459
175    0.781250
176    0.640000
177    0.548387
178    0.657143
179    0.459459
180    0.600000
181    0.617647
182    0.828571
183    0.675676
184    0.378378
185    0.414634
186    0.500000
187    0.000000
188    0.657143
189    0.838710
190    0.482759
191    0.470588
192    0.636364
193    0.594595
194    0.593750
195    0.615385
196    0.771429
197    0.878788
Name: PPM, dtype: float64