In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, FunctionTransformer, LabelEncoder,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingRegressor,RandomForestRegressor
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import r2_score
TOP_DEF_VALID_TEMPLATE = (
   'Validation Score on Top DEF RTG is: {score:.3g}\n'
    )
TOP_DEF_TRAIN_TEMPLATE = (
   'Training Score on Top DEF RTG is: {score:.3g}\n'
    )
MID_DEF_VALID_TEMPLATE = (
   'Validation Score on Mid DEF RTG is: {score:.3g}\n'
    )
MID_DEF_TRAIN_TEMPLATE = (
   'Training Score on Mid DEF RTG is: {score:.3g}\n'
    )
BOT_DEF_VALID_TEMPLATE = (
   'Validation Score on Bot DEF RTG is: {score:.3g}\n'
    )
BOT_DEF_TRAIN_TEMPLATE = (
   'Training Score on Bot DEF RTG is: {score:.3g}\n'
    )


In [2]:
df = pd.read_csv('../data/shot-data-all.csv')
df_top = pd.read_csv('../data/top-def.csv')
df_mid = pd.read_csv('../data/mid-def.csv')
df_bot = pd.read_csv('../data/bot-def.csv')

In [3]:
#creating dummy variable
number = LabelEncoder()
df['vs_int'] = number.fit_transform(df['vs'])


In [4]:
X = df[['game_time','shot_made','quarter','home','vs_int','shot_distance']]
y = df['cum_shot_made']
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size=0.1)

### Linear Regression

In [5]:
reg = LinearRegression()
reg.fit(X_train,y_train)
score = reg.score(X_valid,y_valid)
print(score)

0.5290788623527369


### Voting Classifier

In [6]:
#VOTING CLASS
model = VotingClassifier([
       
       ('tree1', DecisionTreeClassifier(max_depth=4)),
       ('tree2', DecisionTreeClassifier(min_samples_leaf=10)),
    ])
model.fit(X_train,y_train)
score = model.score(X_valid,y_valid)
print(score)

0.2393961179007908


### Gradient Boost

In [7]:
model = GradientBoostingRegressor(n_estimators=100)
model.fit(X_train,y_train)
score = model.score(X_valid,y_valid)
print(score)

0.5473698004781673


# All team Test

In [8]:
X = df[['game_time','shot_made','quarter','home','shot_distance','shot_value', 'vs_int']]
y = df['cum_shot_made']
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size = 0.25)
model = make_pipeline(
    PCA(7), 
    GradientBoostingRegressor()
)
for i in range(0,5):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size = 0.25)

    model.fit(X_train,y_train)
    score = model.score(X_valid,y_valid)
    print(score)


0.5587903303562772
0.5537185536744569
0.5575286223560929
0.5797927683097044
0.5529442738534688


# Random Forest Model

In [9]:
Houston = df[df['vs'] == 'HOU']
X = Houston[['game_time','shot_made','quarter','home','shot_distance','shot_value']]
y = Houston['cum_shot_made']
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size = 0.25)


model = RandomForestRegressor()
model.fit(X_train,y_train)
score = model.score(X_valid,y_valid)
print(score)

0.4743835238159644




### PCA

In [10]:
Houston = df[df['vs'] == 'HOU']
X = Houston[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
y = Houston['cum_shot_made']

# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)
# pca = PCA(n_components=2)
# principal_components = pca.fit_transform(X)


model = make_pipeline(
    StandardScaler(),
    PCA(2), 
    GradientBoostingRegressor()
)
#for i in range(0,10):
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size = 0.25)

model.fit(X_train,y_train)
print(model.score(X_valid,y_valid))
print(model.score(X_train,y_train))


0.7707666511017681
0.8818421141538557


In [11]:
Knicks = df[df['vs'] == 'NYK']
X = Knicks[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
y = Knicks['cum_shot_made']

# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)
# pca = PCA(n_components=2)
# principal_components = pca.fit_transform(X)


model = make_pipeline(
    StandardScaler(),
    PCA(), 
    GradientBoostingRegressor()
)

X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size = 0.25)

model.fit(X_train,y_train)
print(model.score(X_valid,y_valid))
print(model.score(X_train,y_train))


0.9489214543272588
0.9971022839301261


In [12]:
X = df[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
y = df['cum_shot_made']

# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)
# pca = PCA(n_components=2)
# principal_components = pca.fit_transform(X)


model = make_pipeline(
    StandardScaler(), 
    PCA(), 
    GradientBoostingRegressor()
)
#for i in range(0,10):
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size = 0.25)

model.fit(X_train,y_train)
print(model.score(X_valid,y_valid))
print(model.score(X_train,y_train))

0.9791004619332817
0.9835487168954484


In [13]:
model.predict(X_valid)

array([ 0.78491096, 10.36143846,  6.14264739, ..., 12.59944022,
        2.64964543,  4.52618696])

In [14]:
y_predicted = model.predict(X_valid)
y_predicted


array([ 0.78491096, 10.36143846,  6.14264739, ..., 12.59944022,
        2.64964543,  4.52618696])

In [15]:
r2_score(y_valid, y_predicted)

0.9791004619332817

In [22]:
def return_model():
    return make_pipeline(
    StandardScaler(), 
    PCA(8), 
    GradientBoostingRegressor()
    )
def train_test(df,test_size):
    X = df[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
    y = df['cum_shot_made']
    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    return X_train, X_valid, y_train, y_valid

In [32]:
top_def_model = return_model()
mid_def_model = return_model()
bot_def_model = return_model()

#TOP DEFENSE
X_train, X_valid, y_train, y_valid = train_test(df_top, 0.25)
top_def_model.fit(X_train,y_train)

print(TOP_DEF_TRAIN_TEMPLATE.format(score=top_def_model.score(X_train,y_train)))
print(TOP_DEF_VALID_TEMPLATE.format(score=top_def_model.score(X_valid,y_valid)))

#MID DEFENSE
X_train, X_valid, y_train, y_valid = train_test(df_mid, 0.25)
mid_def_model.fit(X_train,y_train) 

print(MID_DEF_TRAIN_TEMPLATE.format(score=mid_def_model.score(X_train,y_train)))
print(MID_DEF_VALID_TEMPLATE.format(score=mid_def_model.score(X_train,y_train)))

#BOT DEFENSE
X_train, X_valid, y_train, y_valid = train_test(df_bot, 0.25)
bot_def_model.fit(X_train,y_train)
print(BOT_DEF_TRAIN_TEMPLATE.format(score=bot_def_model.score(X_train,y_train)))
print(BOT_DEF_VALID_TEMPLATE.format(score=bot_def_model.score(X_train,y_train)))



Training Score on Top DEF RTG is: 0.982

Validation Score on Top DEF RTG is: 0.964

Training Score on Mid DEF RTG is: 0.985

Validation Score on Mid DEF RTG is: 0.985

Training Score on Bot DEF RTG is: 0.988

Validation Score on Bot DEF RTG is: 0.988

