In [18]:
import pandas as pd
import numpy as np
import csv
import re

In [19]:
# Read the raw per-game log from the CSV located next to this notebook.
data = pd.read_csv(filepath_or_buffer="./warriors_points_data.csv")

In [20]:
# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,Player,Rk,G,Date,Age,Tm,Home,Opp,Unnamed: 8,GS,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,Stephen Curry,1,1.0,10/25/16,28-225,GSW,,SAS,L (-29),1,...,3,3,4,0,0,4,1,26,15.9,-9
1,Stephen Curry,2,2.0,10/28/16,28-228,GSW,@,NOP,W (+8),1,...,0,1,8,1,0,4,3,23,14.6,4
2,Stephen Curry,3,3.0,10/30/16,28-230,GSW,@,PHO,W (+6),1,...,1,1,3,0,0,1,5,28,19.1,2
3,Stephen Curry,4,4.0,11/1/16,28-232,GSW,@,POR,W (+23),1,...,3,4,3,1,0,4,3,28,16.4,24
4,Stephen Curry,5,5.0,11/3/16,28-234,GSW,,OKC,W (+26),1,...,1,1,7,2,1,1,1,21,19.7,16


In [21]:
# Some rows hold a status string instead of a value in PTS / MP.
# Swap those for placeholders that the parsing cells below can handle.
sat_out_markers = ["Inactive", "Did Not Dress"]
data.loc[data["PTS"].isin(sat_out_markers), "PTS"] = 0
# "0:" is a (hacky) stand-in shaped like "<minutes>:" so the minutes regex reads it as zero.
data.loc[data["MP"].isin(sat_out_markers), "MP"] = "0:"

In [22]:
# Parse out minutes played from MP column
def parseMinutes(row):
    """Extract the whole-minute count from a row's "MP" value.

    The value is expected to start with digits followed by a colon
    (e.g. "34:12" or the "0:" placeholder); only the digits before the
    colon are used.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide an "MP" entry.

    Returns
    -------
    int or None
        The leading minutes as an int, or None when "MP" is not a string
        or does not start with "<digits>:". In that case the offending row
        is printed so it can be inspected.
    """
    raw = row["MP"]
    try:
        mr = re.match("([0-9]+):", raw)
    except TypeError:
        # Non-string values (e.g. NaN for a missing cell) cannot be matched;
        # treat them like any other unparseable value instead of crashing.
        mr = None

    if mr is None:
        print(row)
        return None
    return int(mr.group(1))

# Run the parser once per row; the result is one minutes value per game.
minsPlayed = data.apply(parseMinutes, axis="columns")

In [23]:
# Attach the parsed minutes and make PTS numeric.
# Anything to_numeric cannot convert becomes NaN (errors="coerce").
data["MinutesPlayed"] = minsPlayed
data["PTS"] = pd.to_numeric(data["PTS"], errors="coerce")

In [24]:
def pointsPerMinute(row):
    """Return points scored per minute played for one row.

    Reads "PTS" and "MinutesPlayed" from the row. A row with zero minutes
    yields 0 so that no division by zero occurs; otherwise both values are
    converted to float and divided.
    """
    pts, mins = row["PTS"], row["MinutesPlayed"]
    return 0 if mins == 0 else float(pts) / float(mins)

In [25]:
# Points-per-minute for every row, stored as a new column.
data["PPM"] = data.apply(pointsPerMinute, axis="columns")

In [26]:
# Count the rows available for each player.
data.loc[:, "Player"].value_counts()

Klay Thompson     50
Stephen Curry     50
Kevin Durant      50
Draymond Green    49
Name: Player, dtype: int64

In [27]:
# Summary statistics of the points-per-minute column.
data.loc[:, "PPM"].describe()

count    199.000000
mean       0.618419
std        0.296565
min        0.000000
25%        0.427069
50%        0.625000
75%        0.785831
max        2.068966
Name: PPM, dtype: float64

In [28]:
# One frame per player, selected by exact match on the Player column.
scurry = data[data["Player"].eq("Stephen Curry")]
kthompson = data[data["Player"].eq("Klay Thompson")]
kdurant = data[data["Player"].eq("Kevin Durant")]
dgreen = data[data["Player"].eq("Draymond Green")]

In [29]:
# Draymond Green has one row fewer than the others.
# Print every date that appears for Stephen Curry but not for Green.
unique_dates_scurry = scurry["Date"].unique()
unique_dates_dgreen = dgreen["Date"].unique()
for d in unique_dates_scurry:
    if d in unique_dates_dgreen:
        continue
    print(d)

2/2/17


In [30]:
# Draymond Green has no row for one game, so keep only the games he also has.
# Filtering on Date removes exactly the game he is missing, wherever it sits
# in the frame. A positional [:-1] slice is only right when that game is the
# last row. The filter is also safe to re-run; the slice drops another row
# each time the cell is executed.
shared_dates = dgreen["Date"]
scurry = scurry[scurry["Date"].isin(shared_dates)]
kthompson = kthompson[kthompson["Date"].isin(shared_dates)]
kdurant = kdurant[kdurant["Date"].isin(shared_dates)]

In [31]:
# Give Date a fresh 0..n-1 index. The per-player PTS series it is combined
# with are re-indexed the same way, and pandas aligns on index labels, so
# keeping scurry's original row labels would only line up by coincidence.
dates = scurry["Date"].reset_index(drop=True)

In [32]:
# Per-player points, each re-indexed from 0 so the series share one index.
scurry_final = scurry["PTS"].reset_index(drop=True)
kthompson_final = kthompson["PTS"].reset_index(drop=True)
kdurant_final = kdurant["PTS"].reset_index(drop=True)
dgreen_final = dgreen["PTS"].reset_index(drop=True)

In [33]:
# Assemble the wide table: one Date column plus one points column per player.
final_columns = {
    "Date": dates,
    "Stephen Curry": scurry_final,
    "Klay Thompson": kthompson_final,
    "Kevin Durant": kdurant_final,
    "Draymond Green": dgreen_final,
}
finalDF = pd.DataFrame(final_columns)

In [34]:
# Write the combined table to disk in both CSV and JSON form.
finalDF.to_csv(path_or_buf="formatted_data.csv", sep=",")
finalDF.to_json(path_or_buf="formatted_data.json")