In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, FunctionTransformer, LabelEncoder,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingRegressor,RandomForestRegressor
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import r2_score

In [2]:
# Read the full shot log and the three per-tier defence extracts in one pass.
# NOTE(review): top/mid/bot presumably group opponents by defensive strength,
# going by the file names — confirm against the data.
df, df_top, df_mid, df_bot = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/shot-data-all.csv',
        '../data/top-def.csv',
        '../data/mid-def.csv',
        '../data/bot-def.csv',
    )
)

In [3]:
# Integer-encode the `vs` column into `vs_int`: LabelEncoder assigns one integer
# code per distinct value. This is label encoding, not dummy/one-hot encoding.
# NOTE(review): the codes impose an arbitrary numeric order on what looks like a
# nominal team identifier (e.g. 'HOU', 'NYK'); linear models will treat that
# order as meaningful — consider one-hot encoding instead.
number = LabelEncoder()
df['vs_int'] = number.fit_transform(df['vs'])


In [4]:
# Feature matrix and target shared by the baseline models in the next cells.
X = df[['game_time','shot_made','quarter','home','vs_int','shot_distance']]
y = df['cum_shot_made']

# Hold out 10% of the rows for validation. The fixed random_state makes the
# split — and therefore every score computed from it — reproducible under
# Restart & Run All; without it each rerun draws a different split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

### Linear Regression

In [5]:
# Ordinary least-squares baseline; .score() on a regressor reports R^2,
# evaluated here on the validation split.
reg = LinearRegression().fit(X_train, y_train)
score = reg.score(X_valid, y_valid)
print(score)

0.561381912799016


### Voting Classifier

In [6]:
# Voting ensemble (default hard voting) over two decision-tree classifiers:
# one depth-limited, one constrained by minimum leaf size.
# NOTE(review): .score() on a classifier is mean accuracy, whereas the
# regressors in the neighbouring cells report R^2 — the numbers are not
# directly comparable. Treating `cum_shot_made` as a class label also discards
# its ordering; confirm that classification is really intended here.
model = VotingClassifier(
    estimators=[
        ('tree1', DecisionTreeClassifier(max_depth=4)),
        ('tree2', DecisionTreeClassifier(min_samples_leaf=10)),
    ]
)
model.fit(X_train, y_train)
score = model.score(X_valid, y_valid)
print(score)

0.22286125089863407


### Gradient Boost

In [7]:
# Gradient-boosted regression trees (100 boosting stages) on the baseline
# features; the printed value is R^2 on the validation split.
model = GradientBoostingRegressor(n_estimators=100).fit(X_train, y_train)
score = model.score(X_valid, y_valid)
print(score)

0.5760128394571966


# All-Team Test

In [8]:
# All-opponent model: baseline features plus `shot_value`, with the encoded
# opponent column included.
X = df[['game_time','shot_made','quarter','home','shot_distance','shot_value', 'vs_int']]
y = df['cum_shot_made']

# NOTE: PCA(7) keeps as many components as there are input columns, so this
# step only rotates the feature space; it does not reduce dimensionality.
model = make_pipeline(
    PCA(7),
    GradientBoostingRegressor(random_state=42)
)

# Refit on five different 75/25 splits to see how much the validation R^2
# moves with the choice of split. Seeding each split with the loop index keeps
# the five splits distinct but reproducible. (The original cell also made one
# split before the loop that was overwritten before use; it has been removed.)
for i in range(5):
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.25, random_state=i
    )
    model.fit(X_train, y_train)
    score = model.score(X_valid, y_valid)
    print(score)


0.5735508951871903
0.5658961447625397
0.585960273854895
0.5759843097334149
0.5585171758326362


# Random Forest Model

In [9]:
# Restrict to rows where `vs` is 'HOU' and fit a random forest on that subset.
Houston = df[df['vs'] == 'HOU']
X = Houston[['game_time','shot_made','quarter','home','shot_distance','shot_value']]
y = Houston['cum_shot_made']

# Both the 75/25 split and the forest's bootstrap/feature sampling are random;
# seeding both makes the printed R^2 reproducible across reruns.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
score = model.score(X_valid, y_valid)
print(score)

0.4254149530853105




### PCA

In [10]:
# Houston-only subset, now with the cumulative columns added as features.
# NOTE(review): if `cum_fg_percent` is derived as cum_shot_made / cum_attempts,
# the target is recoverable from `cum_attempts` and `cum_fg_percent` alone
# (target leakage), which would inflate the scores — confirm how these
# columns are computed.
Houston = df[df['vs'] == 'HOU']
X = Houston[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
y = Houston['cum_shot_made']

# Standardise, project onto the first two principal components, then boost.
# Scaling inside the pipeline means it is fitted on the training rows only.
model = make_pipeline(
    StandardScaler(),
    PCA(2),
    GradientBoostingRegressor(random_state=42)
)

# Seeded 75/25 split so the two scores below are reproducible across reruns.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model.fit(X_train, y_train)
# Validation R^2 first, then training R^2; the gap indicates overfitting.
print(model.score(X_valid, y_valid))
print(model.score(X_train, y_train))


0.791016611196371
0.8699754420448247


In [11]:
# Knicks-only subset with the same feature set as the Houston PCA cell.
# NOTE(review): if `cum_fg_percent` is derived as cum_shot_made / cum_attempts,
# the target is recoverable from `cum_attempts` and `cum_fg_percent` alone
# (target leakage), which would inflate the scores — confirm how these
# columns are computed.
Knicks = df[df['vs'] == 'NYK']
X = Knicks[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
y = Knicks['cum_shot_made']

# PCA() with no argument keeps every component, so after standardisation this
# step is a rotation of the feature space rather than a reduction.
model = make_pipeline(
    StandardScaler(),
    PCA(),
    GradientBoostingRegressor(random_state=42)
)

# Seeded 75/25 split so the two scores below are reproducible across reruns.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model.fit(X_train, y_train)
# Validation R^2 first, then training R^2; the gap indicates overfitting.
print(model.score(X_valid, y_valid))
print(model.score(X_train, y_train))


0.9569558819915865
0.9975309925374812


In [12]:
# Same scaled-PCA + gradient-boosting pipeline, fitted on every opponent at once.
# NOTE(review): if `cum_fg_percent` is derived as cum_shot_made / cum_attempts,
# the target is recoverable from `cum_attempts` and `cum_fg_percent` alone
# (target leakage), which would inflate the scores — confirm how these
# columns are computed.
X = df[['game_time','shot_made','quarter','home','shot_distance','shot_value','cum_attempts','cum_fg_percent']]
y = df['cum_shot_made']

# PCA() with no argument keeps every component, so after standardisation this
# step is a rotation of the feature space rather than a reduction.
model = make_pipeline(
    StandardScaler(),
    PCA(),
    GradientBoostingRegressor(random_state=42)
)

# Seeded 75/25 split so the scores below (and the predictions made from
# `model` / `X_valid` afterwards) are reproducible across reruns.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model.fit(X_train, y_train)
# Validation R^2 first, then training R^2; the gap indicates overfitting.
print(model.score(X_valid, y_valid))
print(model.score(X_train, y_train))

0.9772424776381664
0.981981663120631


In [13]:
# Preview the validation-set predictions from whichever pipeline is currently
# bound to `model` (depends on which cell was run last).
model.predict(X_valid)

array([ 2.57908228,  0.74154353, -0.15771874, ...,  1.60478386,
        4.29711636,  7.64756724])

In [14]:
# Keep the validation-set predictions so they can be scored explicitly.
# NOTE(review): the displayed output contains a negative value; if
# `cum_shot_made` is a running count, such predictions are out of range —
# consider clipping at zero or a count-aware model.
y_predicted = model.predict(X_valid)
y_predicted


array([ 2.57908228,  0.74154353, -0.15771874, ...,  1.60478386,
        4.29711636,  7.64756724])

In [16]:
# Cross-check: R^2 computed directly from the stored predictions; the recorded
# output equals the validation value printed by model.score() for this pipeline.
r2_score(y_valid, y_predicted)

0.9772424776381664